I am attempting to store a large number of numpy structured arrays as datasets in an HDF5 file.
For example,
f['tree1'] = structured_array1
.
.
f['tree60000'] = structured_array60000
(there are ~ 60000 trees),
About 70% of the way into reading the file, I get the error RuntimeError: Unable to register datatype atom (Can't insert duplicate key)
This problem occurs only when the ASCII file is very large (10e7 lines, 5 GB). It does not occur for a smaller file (around 10e6 lines, 500 MB). It also does not occur if I drop the structured datatype and just store each tree as a numpy array of strings.
I can work around this problem if I stop reading halfway through the file, close my terminal, open it again, and continue reading the file from the halfway point to the end (I save the line number I ended on). I tried closing and reopening the HDF5 file inside the Python function itself, but this did not work.
# Field layout of one tree row as (field name, numpy format code) pairs.
# read_in_trees() passes this list as the dtype when converting the
# whitespace-split columns of each data line, so the order here has to
# match the column order of the ASCII file.
dt = [
    ("scale", "f4"),
    ("haloid", "i8"),
    ("scale_desc", "f4"),
    ("haloid_desc", "i8"),
    ("num_prog", "i4"),
    ("pid", "i8"),
    ("upid", "i8"),
    ("pid_desc", "i8"),
    ("phantom", "i4"),
    ("mvir_sam", "f4"),
    ("mvir", "f4"),
    ("rvir", "f4"),
    ("rs", "f4"),
    ("vrms", "f4"),
    ("mmp", "i4"),
    ("scale_lastmm", "f4"),
    ("vmax", "f4"),
    ("x", "f4"),
    ("y", "f4"),
    ("z", "f4"),
    ("vx", "f4"),
    ("vy", "f4"),
    ("vz", "f4"),
    ("jx", "f4"),
    ("jy", "f4"),
    ("jz", "f4"),
    ("spin", "f4"),
    ("haloid_breadth_first", "i8"),
    ("haloid_depth_first", "i8"),
    ("haloid_tree_root", "i8"),
    ("haloid_orig", "i8"),
    ("snap_num", "i4"),
    ("haloid_next_coprog_depthfirst", "i8"),
    ("haloid_last_prog_depthfirst", "i8"),
    ("haloid_last_mainleaf_depthfirst", "i8"),
    ("rs_klypin", "f4"),
    ("mvir_all", "f4"),
    ("m200b", "f4"),
    ("m200c", "f4"),
    ("m500c", "f4"),
    ("m2500c", "f4"),
    ("xoff", "f4"),
    ("voff", "f4"),
    ("spin_bullock", "f4"),
    ("b_to_a", "f4"),
    ("c_to_a", "f4"),
    ("axisA_x", "f4"),
    ("axisA_y", "f4"),
    ("axisA_z", "f4"),
    ("b_to_a_500c", "f4"),
    ("c_to_a_500c", "f4"),
    ("axisA_x_500c", "f4"),
    ("axisA_y_500c", "f4"),
    ("axisA_z_500c", "f4"),
    ("t_by_u", "f4"),
    ("mass_pe_behroozi", "f4"),
    ("mass_pe_diemer", "f4"),
]
def read_in_trees(self):
    """Store each tree from the ASCII file as its own HDF5 dataset.

    Reads ``self.fname`` line by line. A line starting with ``#`` marks
    the start of a new tree, and the text after its first six characters
    (minus the newline) becomes the dataset name. Every other non-blank
    line is split on whitespace and collected as one row of the current
    tree. When a tree is complete, its rows are converted to a
    structured array with the module-level ``dt`` layout and written to
    the already-existing HDF5 file ``self.hdf5_name`` (opened "r+").

    Differences from a naive "write on every header" loop:
      * nothing is written for a header that has no data rows yet, so
        the very first header no longer tries to create an empty
        dataset under the empty name "";
      * the rows of the last tree are written once the input is
        exhausted, instead of being silently dropped;
      * blank lines are skipped rather than appended as empty rows.

    Returns:
        None.
    """
    # Loop-invariant: build the compound dtype once, not once per tree.
    tree_dtype = np.dtype(dt)

    def store_tree(h5_file, name, rows):
        # Skip header-only sections so no zero-length dataset is created.
        if rows:
            h5_file[name] = np.array(rows, dtype=tree_dtype)

    with open(self.fname) as ascii_file, \
            h5py.File(self.hdf5_name, "r+") as f:
        tree_id = ""
        current_tree = []
        for line in ascii_file:
            if line.startswith('#'):  # header line: a new tree begins
                # Flush the rows gathered for the previous tree first.
                store_tree(f, tree_id, current_tree)
                current_tree = []
                tree_id = line[6:].strip('\n')
            else:  # data line: next element of the current tree
                fields = line.split()
                if fields:
                    current_tree.append(tuple(fields))
        # No header follows the final tree, so flush it explicitly.
        store_tree(f, tree_id, current_tree)
    return
Error:
/Volumes/My Passport for Mac/raw_trees/bolshoi/rockstar/asciiReaderOne.py in read_in_trees(self)
129 arr = np.array(current_tree, dtype = dt)
130 # depth_sort = arr['haloid_depth_first'].argsort()
--> 131 f[tree_id] = arr
132 current_tree = []
133 first_line = False
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2458)()
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2415)()
/Library/Python/2.7/site-packages/h5py/_hl/group.pyc in __setitem__(self, name, obj)
281
282 else:
--> 283 ds = self.create_dataset(None, data=obj, dtype=base.guess_dtype(obj))
284 h5o.link(ds.id, self.id, name, lcpl=lcpl)
285
/Library/Python/2.7/site-packages/h5py/_hl/group.pyc in create_dataset(self, name, shape, dtype, data, **kwds)
101 """
102 with phil:
--> 103 dsid = dataset.make_new_dset(self, shape, dtype, data, **kwds)
104 dset = dataset.Dataset(dsid)
105 if name is not None:
/Library/Python/2.7/site-packages/h5py/_hl/dataset.pyc in make_new_dset(parent, shape, dtype, data, chunks, compression, shuffle, fletcher32, maxshape, compression_opts, fillvalue, scaleoffset, track_times)
124
125 if data is not None:
--> 126 dset_id.write(h5s.ALL, h5s.ALL, data)
127
128 return dset_id
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2458)()
/Library/Python/2.7/site-packages/h5py/_objects.so in h5py._objects.with_phil.wrapper (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/_objects.c:2415)()
/Library/Python/2.7/site-packages/h5py/h5d.so in h5py.h5d.DatasetID.write (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5d.c:3260)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:15314)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:14903)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t._c_compound (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:14192)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:15314)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t.py_create (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:14749)()
/Library/Python/2.7/site-packages/h5py/h5t.so in h5py.h5t._c_float (/Users/travis/build/MacPython/h5py-wheels/h5py/h5py/h5t.c:12379)()
RuntimeError: Unable to register datatype atom (Can't insert duplicate key)