Opening a corrupted PyTables HDF5 file

2019-07-24 20:01发布

问题:

I am hoping for some help in opening a corrupted HDF5 file. I am accessing PyTables via Pandas, but a pd.read_hdf() call produces the following error. I don't know anything about the inner workings of PyTables.

I believe the error was created because the process saving to the file (appending every 10 seconds or so) got duplicated, so there were then 2 identical processes appending. I am not sure why this would corrupt the file rather than duplicate data, but the two errors occurred together, which is why I think they are causally related.

---------------
HDF5ExtError                              Traceback (most recent call last)
<ipython-input-37-99558b43d768> in <module>()
----> 1 ES2 = h.read('./ES_201509-1')

/Users/AFK/Desktop/fastback/historical_store.pyc in read(self, path, key, **kwargs)
     53         frame.  Extra keyword args are all passed down to pandas.read_hdf().
     54         """
---> 55         df = pd.read_hdf(path, key, **kwargs)
     56         df.index = pd.to_datetime(df.Time)
     57         del df['Time']

//anaconda/lib/python2.7/site-packages/pandas/io/pytables.pyc in read_hdf(path_or_buf, key, **kwargs)
    326             # if there is an error, close the store
    327             try:
--> 328                 store.close()
    329             except:
    330                 pass

//anaconda/lib/python2.7/site-packages/pandas/io/pytables.pyc in close(self)
    566         """
    567         if self._handle is not None:
--> 568             self._handle.close()
    569         self._handle = None
    570 

//anaconda/lib/python2.7/site-packages/tables/file.pyc in close(self)
   2726 
   2727         # Close all loaded nodes.
-> 2728         self.root._f_close()
   2729 
   2730         self._node_manager.shutdown()

//anaconda/lib/python2.7/site-packages/tables/group.pyc in _f_close(self)
    907         # this is not an explicit close issued by the user.
    908         if not (self._v__deleting or self._v_objectid is None):
--> 909             self._g_close_descendents()
    910 
    911         # When all the descendents have been closed, close this group.

//anaconda/lib/python2.7/site-packages/tables/group.pyc in _g_close_descendents(self)
    870 
    871         node_manager = self._v_file._node_manager
--> 872         node_manager.close_subtree(self._v_pathname)
    873 
    874     _g_closeDescendents = previous_api(_g_close_descendents)

//anaconda/lib/python2.7/site-packages/tables/file.pyc in close_subtree(self, prefix)
    540             if path.startswith(prefix) and '/_i_' not in path
    541         ]
--> 542         self._close_nodes(paths, cache.pop)
    543 
    544         # Close everything else (i.e. indices)

//anaconda/lib/python2.7/site-packages/tables/file.pyc in _close_nodes(nodepaths, get_node)
    515                         node._g_close()
    516                     else:
--> 517                         node._f_close()
    518                     del node
    519                 except ClosedNodeError:

//anaconda/lib/python2.7/site-packages/tables/table.pyc in _f_close(self, flush)
   3034         # Flush right now so the row object does not get in the middle.
   3035         if flush:
-> 3036             self.flush()
   3037 
   3038         # Some warnings can be issued after calling `self._g_set_location()`

//anaconda/lib/python2.7/site-packages/tables/table.pyc in flush(self)
   2969         if self.indexed and self.autoindex:
   2970             # Flush any unindexed row
-> 2971             rowsadded = self.flush_rows_to_index(_lastrow=True)
   2972             assert rowsadded <= 0 or self._indexedrows == self.nrows, \
   2973                 ("internal error: the number of indexed rows (%d) "

//anaconda/lib/python2.7/site-packages/tables/table.pyc in flush_rows_to_index(self, _lastrow)
   2578                     if nrows > 0 and not col.index.dirty:
   2579                         rowsadded = self._add_rows_to_index(
-> 2580                             colname, start, nrows, _lastrow, update=True)
   2581             self._unsaved_indexedrows -= rowsadded
   2582             self._indexedrows += rowsadded

//anaconda/lib/python2.7/site-packages/tables/table.pyc in _add_rows_to_index(self, colname, start, nrows, lastrow, update)
   2609         if lastrow and startLR < self.nrows:
   2610             index.append_last_row(
-> 2611                 [self._read(startLR, self.nrows, 1, colname)],
   2612                 update=update)
   2613             indexedrows += self.nrows - startLR

//anaconda/lib/python2.7/site-packages/tables/table.pyc in _read(self, start, stop, step, field, out)
   1895             self._read_field_name(result, start, stop, step, field)
   1896         else:
-> 1897             self.row._fill_col(result, start, stop, step, field)
   1898 
   1899         if select_field:

//anaconda/lib/python2.7/site-packages/tables/tableextension.so in tables.tableextension.Row._fill_col (tables/tableextension.c:12653)()

//anaconda/lib/python2.7/site-packages/tables/tableextension.so in tables.tableextension.Table._read_records (tables/tableextension.c:6721)()

HDF5ExtError: HDF5 error back trace

  File "H5Dio.c", line 174, in H5Dread
    can't read data
  File "H5Dio.c", line 449, in H5D_read
    can't read data
  File "H5Dchunk.c", line 1729, in H5D_chunk_read
    unable to read raw data chunk
  File "H5Dchunk.c", line 2760, in H5D_chunk_lock
    data pipeline read failed
  File "H5Z.c", line 1120, in H5Z_pipeline
    filter returned failure during read
  File "H5Zdeflate.c", line 125, in H5Z_filter_deflate
    inflate() failed

End of HDF5 error back trace

Problems reading records.

回答1:

Your file is borked; there is no way to recover from this. Using multiple threads/processes as writers is specifically warned against. See the docs here.

HDF5 is NOT thread-safe/process-safe for writers.