I managed to write numpy arrays to lmdb; however, the solution is far from perfect. My X is actually
just a jpg image, so my question is: how can I write a jpeg file directly to lmdb?
It seems that pycaffe
does something similar, but it uses the caffe-specific Datum,
and I need a general solution without extra dependencies.
Here is an example that writes an image both as a numpy array and directly as an encoded jpg.
As we can see, storing the jpg directly is more efficient in terms of storage.
du -sh *
184K temp.db
120K temp_jpg.db
import numpy as np
import lmdb
import cv2
# Number of (X, y) record pairs each write_* function stores in its database.
n_samples= 2
def create_random_image(filename):
    """Write a 100x120, 3-channel image of uniform random noise to *filename*."""
    # Uniform samples in [0, 1), scaled by 255 and truncated to 8-bit pixels.
    noise = np.random.rand(100, 120, 3)
    pixels = (noise * 255).astype(np.uint8)
    cv2.imwrite(filename, pixels)
def write_lmdb(filename):
    """Store 'random_img.jpg' as raw pixel bytes, plus a random label, in LMDB.

    Writes ``n_samples`` record pairs into the environment at *filename*:
    ``X_<i>`` holds the decoded image's raw ``uint8`` bytes and ``y_<i>``
    holds one ``float32`` value.  Only the bytes are stored, not the array
    shape, so a reader gets back a flat buffer.

    Raises:
        IOError: if 'random_img.jpg' cannot be read.
    """
    print('Write lmdb')

    X = cv2.imread('random_img.jpg')
    if X is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("could not read image 'random_img.jpg'")
    y = np.random.rand(1).astype(np.float32) * 10.0

    # Serialise once; the same payload is written for every sample.
    X_bytes = X.tobytes()
    y_bytes = y.tobytes()

    lmdb_env = lmdb.open(filename, map_size=int(1e9))
    try:
        for i in range(n_samples):
            # One write transaction per sample; it commits on leaving the block.
            with lmdb_env.begin(write=True) as lmdb_txn:
                # Keys are encoded explicitly so they are bytes on Python 2 and 3.
                lmdb_txn.put(('X_%d' % i).encode('ascii'), X_bytes)
                lmdb_txn.put(('y_%d' % i).encode('ascii'), y_bytes)

            print('X.shape: %s' % (X.shape,))
            print('y: %s' % (y,))
    finally:
        # Release the environment handle even if a write fails.
        lmdb_env.close()
def read_lmdb(filename):
    """Print every record of the LMDB environment at *filename*.

    Values whose key starts with ``X`` are interpreted as a flat ``uint8``
    buffer and values whose key starts with ``y`` as ``float32``.  The final
    line reports the number of samples, counted as one per ``y`` record.
    """
    print('Read lmdb')

    lmdb_env = lmdb.open(filename)
    n_counter = 0
    try:
        # A single read transaction and cursor suffice; the context managers
        # release both, so no read transaction is left open.
        with lmdb_env.begin() as lmdb_txn:
            with lmdb_txn.cursor() as lmdb_cursor:
                for key, value in lmdb_cursor:
                    print(key)
                    # Prefix test on bytes: exact, and valid on Python 2 and 3.
                    if key.startswith(b'X'):
                        # frombuffer replaces the deprecated binary-mode fromstring.
                        flat = np.frombuffer(value, dtype=np.uint8)
                        print('X.shape %s' % (flat.shape,))
                    if key.startswith(b'y'):
                        print(np.frombuffer(value, dtype=np.float32))
                        # Each sample has exactly one label record, so count
                        # here to make the total match the samples written.
                        n_counter += 1
    finally:
        lmdb_env.close()

    print('n_samples %d' % n_counter)
def write_lmdb_jpg(filename):
    """Store 'random_img.jpg' as JPEG-encoded bytes, plus a random label, in LMDB.

    Writes ``n_samples`` record pairs into the environment at *filename*:
    ``X_<i>`` holds the image re-encoded as JPEG and ``y_<i>`` holds one
    ``float32`` value.

    Raises:
        IOError: if 'random_img.jpg' cannot be read.
        ValueError: if OpenCV fails to encode the image as JPEG.
    """
    print('Write lmdb')

    X = cv2.imread('random_img.jpg')
    if X is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("could not read image 'random_img.jpg'")
    y = np.random.rand(1).astype(np.float32) * 10.0

    # Encode once: the image does not change between samples, so repeating
    # the encode per iteration (and again for the print) is wasted work.
    encoded_ok, X_jpg = cv2.imencode('.jpg', X)
    if not encoded_ok:
        raise ValueError('could not encode image as jpg')
    X_bytes = X_jpg.tobytes()
    y_bytes = y.tobytes()

    lmdb_env = lmdb.open(filename, map_size=int(1e9))
    try:
        for i in range(n_samples):
            # One write transaction per sample; it commits on leaving the block.
            with lmdb_env.begin(write=True) as lmdb_txn:
                # Keys are encoded explicitly so they are bytes on Python 2 and 3.
                lmdb_txn.put(('X_%d' % i).encode('ascii'), X_bytes)
                lmdb_txn.put(('y_%d' % i).encode('ascii'), y_bytes)

            print('X.shape %s' % (X_jpg.shape,))
            print('y: %s' % (y,))
    finally:
        # Release the environment handle even if a write fails.
        lmdb_env.close()
def read_lmdb_jpg(filename):
    """Print every record of the JPEG-backed LMDB environment at *filename*.

    Values whose key starts with ``X`` are treated as an encoded image and
    decoded to a colour array with OpenCV; values whose key starts with
    ``y`` are interpreted as ``float32``.  The final line reports the number
    of samples, counted as one per ``y`` record.

    Raises:
        ValueError: if an ``X`` record cannot be decoded as an image.
    """
    print('Read lmdb')

    # cv2.CV_LOAD_IMAGE_COLOR is absent from newer OpenCV builds; prefer
    # IMREAD_COLOR and fall back to the legacy name only when needed.
    color_flag = getattr(cv2, 'IMREAD_COLOR', None)
    if color_flag is None:
        color_flag = cv2.CV_LOAD_IMAGE_COLOR

    lmdb_env = lmdb.open(filename)
    n_counter = 0
    try:
        # A single read transaction and cursor suffice; the context managers
        # release both, so no read transaction is left open.
        with lmdb_env.begin() as lmdb_txn:
            with lmdb_txn.cursor() as lmdb_cursor:
                for key, value in lmdb_cursor:
                    print(key)
                    # Prefix test on bytes: exact, and valid on Python 2 and 3.
                    if key.startswith(b'X'):
                        # frombuffer replaces the deprecated binary-mode fromstring.
                        X_str = np.frombuffer(value, dtype=np.uint8)
                        print('X_str.shape %s' % (X_str.shape,))
                        X = cv2.imdecode(X_str, color_flag)
                        if X is None:
                            # imdecode returns None when the data is not a valid image.
                            raise ValueError('could not decode image for key %r' % (key,))
                        print('X.shape %s' % (X.shape,))
                    if key.startswith(b'y'):
                        print(np.frombuffer(value, dtype=np.float32))
                        # Each sample has exactly one label record, so count
                        # here to make the total match the samples written.
                        n_counter += 1
    finally:
        lmdb_env.close()

    print('n_samples %d' % n_counter)
# Generate the sample image, then round-trip it through each storage format.
create_random_image('random_img.jpg')
for db_path, writer, reader in (
        ('temp.db', write_lmdb, read_lmdb),              # stored as numpy array
        ('temp_jpg.db', write_lmdb_jpg, read_lmdb_jpg),  # stored as encoded jpg
):
    writer(db_path)
    reader(db_path)