I have two directories, each containing about 50,000 images, most of which are 240x180 pixels.
I want to pickle their pixel data as training, validation, and test sets,
but this apparently turns out to be very, very large, and eventually causes the computer to either freeze or run out of disk space.
When the computer froze, the partially generated pkl file was already 28GB.
If the images are RGB, each one is about 130KB raw (240 x 180 x 3 bytes), so 100,000 of them should come to roughly 13GB; I don't see why the pickle would be more than twice that.
Am I doing something wrong? Or is there a more efficient way to do this?
from PIL import Image
import pickle
import os

indir1 = 'Positive'
indir2 = 'Negative'

trainimage = []
trainpixels = []
trainlabels = []
validimage = []
validpixels = []
validlabels = []
testimage = []
testpixels = []
testlabels = []

i = 0
for (root, dirs, filenames) in os.walk(indir1):
    print 'hello'
    for f in filenames:
        try:
            im = Image.open(os.path.join(root, f))
            if i < 40000:                       # first 40,000 images -> training set
                trainpixels.append(im.tostring())
                trainlabels.append(0)
            elif i < 45000:                     # next 5,000 -> validation set
                validpixels.append(im.tostring())
                validlabels.append(0)
            else:                               # remainder -> test set
                testpixels.append(im.tostring())
                testlabels.append(0)
            print str(i) + '\t' + str(f)
            i += 1
        except IOError:
            continue

i = 0
for (root, dirs, filenames) in os.walk(indir2):
    print 'hello'
    for f in filenames:
        try:
            im = Image.open(os.path.join(root, f))
            if i < 40000:
                trainpixels.append(im.tostring())
                trainlabels.append(1)
            elif i < 45000:
                validpixels.append(im.tostring())
                validlabels.append(1)
            else:
                testpixels.append(im.tostring())
                testlabels.append(1)
            print str(i) + '\t' + str(f)
            i += 1
        except IOError:
            continue

# Pair each pixel list with its label list.
trainimage.append(trainpixels)
trainimage.append(trainlabels)
validimage.append(validpixels)
validimage.append(validlabels)
testimage.append(testpixels)
testimage.append(testlabels)

output = open('data.pkl', 'wb')
pickle.dump(trainimage, output)
pickle.dump(validimage, output)
pickle.dump(testimage, output)
output.close()
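
In case it helps frame answers: one thing I've been wondering is whether the default pickle protocol (protocol 0, which is ASCII-based in Python 2) is inflating the raw pixel strings, and whether writing one record at a time would keep memory flat instead of holding 100,000 strings in lists. Below is a rough sketch of what I mean; dump_images and load_records are just names I made up, not anything from a library.

    from PIL import Image
    import pickle
    import os

    def dump_images(indir, label, out_file):
        # Stream one (pixels, label) record per image into an already-open
        # file instead of accumulating everything in RAM.
        for root, dirs, filenames in os.walk(indir):
            for f in filenames:
                try:
                    im = Image.open(os.path.join(root, f))
                except IOError:
                    continue
                # HIGHEST_PROTOCOL is binary; the default (protocol 0)
                # escapes the raw pixel bytes as ASCII and bloats the file.
                pickle.dump((im.tostring(), label), out_file,
                            pickle.HIGHEST_PROTOCOL)

    def load_records(path):
        # Read the records back one at a time until end-of-file.
        with open(path, 'rb') as f:
            while True:
                try:
                    yield pickle.load(f)
                except EOFError:
                    break

    with open('data.pkl', 'wb') as out:
        dump_images('Positive', 0, out)
        dump_images('Negative', 1, out)

With something like this, I suppose the train/validation/test split could be done on record indices at load time rather than by building three giant lists up front, but I don't know if that's the right approach either.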