#in the name of God
#[email protected]
#Reads CIFAR-10 and creates an LMDB dataset of preprocessed (ZCA-whitened) images.
import os
import numpy as np
import cPickle
import lmdb
import leveldb
import matplotlib.pyplot as plt
import caffe
from caffe.proto import caffe_pb2
#read all cifar10 training and test data into corresponding variables
#they have the shape [N, 3072]
data_train_size = 50000
data_test_size = 10000
image_size_flattened = 32*32*3 #3072
data_train = np.empty([data_train_size, image_size_flattened], np.uint8)
label_train = np.empty([data_train_size], np.uint8)
data_test = np.empty([data_test_size, image_size_flattened], np.uint8)
label_test = np.empty([data_test_size], np.uint8)
data_batches = np.empty([5, 10000, image_size_flattened], np.uint8)
#downloaded and extracted http://www.cs.utoronto.ca/~kriz/cifar-10-python.tar.gz
cifar10_dir = 'C:/Users/Master/Desktop/cifar-10-batches-py/'
i = 0
j = 0
for file_name in os.listdir(cifar10_dir):
    if 'data_batch_' in file_name:
        #read a training batch into the arrays
        with open(os.path.join(cifar10_dir, file_name), 'rb') as input_file:
            data = cPickle.load(input_file)
            data_batches[j] = data['data']
            data_train[i: i+10000, :] = data['data']
            label_train[i: i+10000] = data['labels']
            i += 10000
            j += 1
    elif 'test_batch' in file_name:
        #read the test set into the respective arrays
        with open(os.path.join(cifar10_dir, file_name), 'rb') as input_file:
            data = cPickle.load(input_file)
            data_test = data['data']
            label_test = np.asarray(data['labels'], np.uint8)
print ('training and testing sets are read!')
print data_train.shape
print data_test.shape
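# A quick sanity check, not in the original script: each of the 10 CIFAR-10
# classes should appear 5000 times in the training labels and 1000 times in
# the test labels.
print 'train label counts:', np.bincount(label_train)
print 'test label counts:', np.bincount(label_test)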
def show(img):
    """Displays the input image using plt.imshow."""
    plt.imshow(img)
    plt.show()
def normalize_image(data):
    """Normalizes the input image to floats in [0.0, 1.0]."""
    # cast first: integer inputs would floor-divide under Python 2
    data = data.astype(np.float32)
    min_ = np.min(data)
    max_ = np.max(data)
    return (data - min_) / (max_ - min_)
def run_zca_stackOverflow(data):
    """Runs ZCA whitening on the input data of shape (N, D) and returns
    3 outputs: zca, mean, zca_matrix
    """
    print 'data shape:', data.shape
    print 'Zero Centring...'
    # zero-center each dimension
    mean = data.mean(axis=0)
    data = data - mean
    print 'Contrast Normalizing...'
    #Contrast Normalization (L2-normalize each image)
    data = data / np.sqrt((data ** 2).sum(axis=1))[:, None]
    print 'Calculating Covariance...'
    # covariance among dimensions: rowvar=False gives a (D, D) matrix.
    # (rowvar=True would give an (N, N) covariance among images, which is
    # infeasible for N=50000 and could not be applied to the smaller test set.)
    cov = np.cov(data, rowvar=False)
    U, S, V = np.linalg.svd(cov)  # U is (D, D), S is (D,)
    print 'Building ZCA Matrix...'
    # build the ZCA matrix: W = U * diag(1/sqrt(S + eps)) * U^T
    epsilon = 1e-5
    zca_matrix = np.dot(U, np.dot(np.diag(1.0 / np.sqrt(S + epsilon)), U.T))
    print 'Applying ZCA to the data...'
    # transform the image data; zca_matrix is (D, D), so whiten each row
    zca = np.dot(data, zca_matrix)
    return zca, mean, zca_matrix
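# A minimal sanity check, assuming the feature-wise whitening above: the
# covariance of the whitened data should be close to the identity matrix.
# Estimated on a subsample here to keep np.cov affordable.
def check_whitening(zca, n=2000):
    cov_w = np.cov(zca[:n], rowvar=False)
    off_diag = cov_w - np.diag(np.diag(cov_w))
    print 'mean |diagonal - 1|:', np.abs(np.diag(cov_w) - 1).mean()
    print 'mean |off-diagonal|:', np.abs(off_diag).mean()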
# In[ ]:
#test and see an example image
img_test = data_train[6].copy()
print 'original image shape:', img_test.shape
img_test2 = img_test.reshape(3, 32, 32).transpose(1, 2, 0)
print 'reshaped and transposed:', img_test2.shape
show(img_test2)
#print img_test2
# In[ ]:
#conducting the zca operation
zca_train, mean, zca_matrix = run_zca_stackOverflow(data_train)
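# Sanity-check the whitening, then eyeball one whitened sample. Whitened
# pixels are zero-centred floats, so rescale them into [0, 1] with
# normalize_image before display (both helpers are defined above).
check_whitening(zca_train)
show(normalize_image(zca_train[6].reshape(3, 32, 32).transpose(1, 2, 0)))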
# In[ ]:
#ZCA the test set with the statistics learned on the training set
data_test = data_test - mean
data_test = data_test / np.sqrt((data_test ** 2).sum(axis=1))[:, None]
zca_test = np.dot(data_test, zca_matrix)
print 'zca train shape: ',zca_train.shape
print 'zca test shape: ',zca_test.shape
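# Optional space saving, an assumption rather than part of the original
# pipeline: Datum float_data is stored as 32-bit floats anyway, so casting to
# float32 before serialization halves the in-memory footprint with no loss in
# stored precision.
zca_train = zca_train.astype(np.float32)
zca_test = zca_test.astype(np.float32)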
# In[ ]:
#Saving the dataset as lmdb for use in caffe
def Save_lmdb(data_train, label_train, data_test, label_test):
    print 'Outputting training data'
    lmdb_file = 'cifar10_train_lmdb_whitened'
    size_train = data_train.shape[0]
    batch_size = size_train
    # map_size must cover the data plus per-record overhead, so leave headroom
    db = lmdb.open(lmdb_file, map_size=int(data_train.nbytes * 2))
    batch = db.begin(write=True)
    for i in range(size_train):
        if i % 1000 == 0:
            print i
        # save in datum; array_to_datum expects a 3D array and an int label
        datum = caffe.io.array_to_datum(data_train[i].reshape(3, 32, 32), int(label_train[i]))
        keystr = '{:0>5d}'.format(i)
        batch.put(keystr, datum.SerializeToString())
        # write batch
        if (i + 1) % batch_size == 0:
            batch.commit()
            batch = db.begin(write=True)
            print (i + 1)
    # write last batch
    if (i + 1) % batch_size != 0:
        batch.commit()
        print 'last batch'
        print (i + 1)

    print 'Outputting test data'
    lmdb_file = 'cifar10_test_lmdb_whitened'
    size_test = data_test.shape[0]
    batch_size = size_test
    db = lmdb.open(lmdb_file, map_size=int(data_test.nbytes * 2))
    batch = db.begin(write=True)
    for i in range(size_test):
        # save in datum
        datum = caffe.io.array_to_datum(data_test[i].reshape(3, 32, 32), int(label_test[i]))
        keystr = '{:0>5d}'.format(i)
        batch.put(keystr, datum.SerializeToString())
        # write batch
        if (i + 1) % batch_size == 0:
            batch.commit()
            batch = db.begin(write=True)
            print (i + 1)
    # write last batch
    if (i + 1) % batch_size != 0:
        batch.commit()
        print 'last batch'
        print (i + 1)
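# A minimal read-back check, not part of the original script: open a freshly
# written LMDB, decode its first datum, and confirm that shape and label
# round-trip correctly through caffe.io.
def verify_lmdb(lmdb_file):
    db = lmdb.open(lmdb_file, readonly=True)
    with db.begin() as txn:
        raw = txn.get('00000')
        datum = caffe_pb2.Datum()
        datum.ParseFromString(raw)
        arr = caffe.io.datum_to_array(datum)
        print 'first record shape:', arr.shape, 'label:', datum.label
    db.close()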
# In[ ]:
#saving the dataset as leveldb
def Save_leveldb(data_train, label_train, data_test, label_test):
    print 'Outputting training data'
    leveldb_file = 'cifar10_train_leveldb_whitened'
    size_train = data_train.shape[0]
    batch_size = size_train
    # create the leveldb file
    db = leveldb.LevelDB(leveldb_file)
    batch = leveldb.WriteBatch()
    for i in range(size_train):
        if i % 1000 == 0:
            print i
        # save in datum; array_to_datum expects a 3D array and an int label
        datum = caffe.io.array_to_datum(data_train[i].reshape(3, 32, 32), int(label_train[i]))
        keystr = '{:0>5d}'.format(i)
        batch.Put(keystr, datum.SerializeToString())
        # write batch
        if (i + 1) % batch_size == 0:
            db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            print (i + 1)
    # write last batch
    if (i + 1) % batch_size != 0:
        db.Write(batch, sync=True)
        print 'last batch'
        print (i + 1)

    print 'Outputting test data'
    leveldb_file = 'cifar10_test_leveldb_whitened'
    size_test = data_test.shape[0]
    batch_size = size_test
    # create the leveldb file
    db = leveldb.LevelDB(leveldb_file)
    batch = leveldb.WriteBatch()
    for i in range(size_test):
        # save in datum
        datum = caffe.io.array_to_datum(data_test[i].reshape(3, 32, 32), int(label_test[i]))
        keystr = '{:0>5d}'.format(i)
        batch.Put(keystr, datum.SerializeToString())
        # write batch
        if (i + 1) % batch_size == 0:
            db.Write(batch, sync=True)
            batch = leveldb.WriteBatch()
            print (i + 1)
    # write last batch
    if (i + 1) % batch_size != 0:
        db.Write(batch, sync=True)
        print 'last batch'
        print (i + 1)
# In[ ]:
Save_lmdb(zca_train, label_train, zca_test, label_test)
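#read back the first training record as a smoke test (see verify_lmdb above)
verify_lmdb('cifar10_train_lmdb_whitened')
#alternatively, write the LevelDB variant instead:
#Save_leveldb(zca_train, label_train, zca_test, label_test)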