What's the best way to extract Excel cell comm

2020-06-27 05:56发布

问题:

I've been parsing Excel documents in Perl successfully with Spreadhsheet::ParseExcel (as recommended in What's the best way to parse Excel file in Perl?), but I can't figure out how to extract cell comments.

Any ideas? A solution in Perl or Ruby would be ideal.

回答1:

One option is to use Ruby's win32ole library.

The following (somewhat verbose) example connects to an open Excel worksheet and gets the comment text from cell B2.

require 'win32ole'

xl = WIN32OLE.connect('Excel.Application')
ws = xl.ActiveSheet
cell = ws.Range('B2')
comment = cell.Comment
text = comment.Text

More info and examples of using Ruby's win32ole library to automate Excel can be found here:

http://rubyonwindows.blogspot.com/search/label/excel



回答2:

The Python xlrd library will parse cell comments (if you turn on xlrd.sheet.OBJ_MSO_DEBUG, you'll see them), but it doesn't expose them from the API. You could either parse the dump or hack on it a bit so you can get to them programmatically. Here's a start (tested extremely minimally):

diff --git a/xlrd/sheet.py b/xlrd/sheet.py
--- a/xlrd/sheet.py
+++ b/xlrd/sheet.py
@@ -206,6 +206,7 @@
         self._dimncols = 0
         self._cell_values = []
         self._cell_types = []
+        self._cell_notes = []
         self._cell_xf_indexes = []
         self._need_fix_ragged_rows = 0
         self.defcolwidth = None
@@ -252,6 +253,7 @@
         return Cell(
             self._cell_types[rowx][colx],
             self._cell_values[rowx][colx],
+            self._cell_notes[rowx][colx],
             xfx,
             )

@@ -422,12 +424,14 @@
                     if self.formatting_info:
                         self._cell_xf_indexes[nrx].extend(aa('h', [-1]) * nextra)
                 self._cell_values[nrx].extend([''] * nextra)
+                self._cell_notes[nrx].extend([None] * nextra)
         if nc > self.ncols:
             self.ncols = nc
             self._need_fix_ragged_rows = 1
         if nr > self.nrows:
             scta = self._cell_types.append
             scva = self._cell_values.append
+            scna = self._cell_notes.append
             scxa = self._cell_xf_indexes.append
             fmt_info = self.formatting_info
             xce = XL_CELL_EMPTY
@@ -436,6 +440,7 @@
                 for _unused in xrange(self.nrows, nr):
                     scta([xce] * nc)
                     scva([''] * nc)
+                    scna([None] * nc)
                     if fmt_info:
                         scxa([-1] * nc)
             else:
@@ -443,6 +448,7 @@
                 for _unused in xrange(self.nrows, nr):
                     scta(aa('B', [xce]) * nc)
                     scva([''] * nc)
+                    scna([None] * nc)
                     if fmt_info:
                         scxa(aa('h', [-1]) * nc)
             self.nrows = nr
@@ -454,6 +460,7 @@
         aa = array_array
         s_cell_types = self._cell_types
         s_cell_values = self._cell_values
+        s_cell_notes = self._cell_notes
         s_cell_xf_indexes = self._cell_xf_indexes
         s_dont_use_array = self.dont_use_array
         s_fmt_info = self.formatting_info
@@ -465,6 +472,7 @@
             nextra = ncols - rlen
             if nextra > 0:
                 s_cell_values[rowx][rlen:] = [''] * nextra
+                s_cell_notes[rowx][rlen:] = [None] * nextra
                 if s_dont_use_array:
                     trow[rlen:] = [xce] * nextra
                     if s_fmt_info:
@@ -600,6 +608,7 @@
         bk_get_record_parts = bk.get_record_parts
         bv = self.biff_version
         fmt_info = self.formatting_info
+        txos = {}
         eof_found = 0
         while 1:
             # if DEBUG: print "SHEET.READ: about to read from position %d" % bk._position
@@ -877,13 +886,23 @@
                 break
             elif rc == XL_OBJ:
                 # handle SHEET-level objects; note there's a separate Book.handle_obj
-                self.handle_obj(data)
+                obj = self.handle_obj(data)
+                if obj:
+                    obj_id = obj.id
+                else:
+                    obj_id = None
             elif rc == XL_MSO_DRAWING:
                 self.handle_msodrawingetc(rc, data_len, data)
             elif rc == XL_TXO:
-                self.handle_txo(data)
+                txo = self.handle_txo(data)
+                if txo and obj_id:
+                    txos[obj_id] = txo
+                    obj_id = None
             elif rc == XL_NOTE:
-                self.handle_note(data)
+                note = self.handle_note(data)
+                txo = txos.get(note.object_id)
+                if txo:
+                    self._cell_notes[note.rowx][note.colx] = txo.text
             elif rc == XL_FEAT11:
                 self.handle_feat11(data)
             elif rc in bofcodes: ##### EMBEDDED BOF #####
@@ -1387,19 +1406,16 @@


     def handle_obj(self, data):
-        if not OBJ_MSO_DEBUG:
-            return
-        DEBUG = 1
         if self.biff_version < 80:
             return
         o = MSObj()
         data_len = len(data)
         pos = 0
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             fprintf(self.logfile, "... OBJ record ...\n")
         while pos < data_len:
             ft, cb = unpack('<HH', data[pos:pos+4])
-            if DEBUG:
+            if OBJ_MSO_DEBUG:
                 hex_char_dump(data, pos, cb, base=0, fout=self.logfile)
             if ft == 0x15: # ftCmo ... s/b first
                 assert pos == 0
@@ -1430,16 +1446,14 @@
         else:
             # didn't break out of while loop
             assert pos == data_len
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             o.dump(self.logfile, header="=== MSOBj ===", footer= " ")
+        return o

     def handle_note(self, data):
-        if not OBJ_MSO_DEBUG:
-            return
-        DEBUG = 1
         if self.biff_version < 80:
             return
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             fprintf(self.logfile, '... NOTE record ...\n')
             hex_char_dump(data, 0, len(data), base=0, fout=self.logfile)
         o = MSNote()
@@ -1453,13 +1467,11 @@
         o.original_author, endpos = unpack_unicode_update_pos(data, 8, lenlen=2)
         assert endpos == data_len - 1
         o.last_byte = data[-1]
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             o.dump(self.logfile, header="=== MSNote ===", footer= " ")
+        return o

     def handle_txo(self, data):
-        if not OBJ_MSO_DEBUG:
-            return
-        DEBUG = 1
         if self.biff_version < 80:
             return
         o = MSTxo()
@@ -1477,8 +1489,9 @@
         rc3, data3_len, data3 = self.book.get_record_parts()
         assert rc3 == XL_CONTINUE
         # ignore the formatting runs for the moment
-        if DEBUG:
+        if OBJ_MSO_DEBUG:
             o.dump(self.logfile, header="=== MSTxo ===", footer= " ")
+        return o

     def handle_feat11(self, data):
         if not OBJ_MSO_DEBUG:
@@ -1638,11 +1651,12 @@

 class Cell(BaseObject):

-    __slots__ = ['ctype', 'value', 'xf_index']
+    __slots__ = ['ctype', 'value', 'note', 'xf_index']

-    def __init__(self, ctype, value, xf_index=None):
+    def __init__(self, ctype, value, note=None, xf_index=None):
         self.ctype = ctype
         self.value = value
+        self.note = note
         self.xf_index = xf_index

     def __repr__(self):

Then you could write something like:

import xlrd

xlrd.sheet.OBJ_MSO_DEBUG = True

xls = xlrd.open_workbook('foo.xls')

for sheet in xls.sheets():
    print 'sheet %s (%d x %d)' % (sheet.name, sheet.nrows, sheet.ncols)
    for rownum in xrange(sheet.nrows):
        for cell in sheet.row(rownum):
            print cell, cell.note