Read Rows Sequentially, but On Demand

2019-03-01 01:30发布

问题:

I need to iterate through an Excel spreadsheet row by row, but not all at once. I want to keep a stream open to the spreadsheet, and then pass that object along to another method. This method will ask for the next row of data from time to time. This method is already established, so I can't change it.

I tried doing this initially with XSSFWorkbook, which worked really well, until I ran out of memory on real data.

Then I tried switching to the SAX parser, using XSSFSheetXMLHandler, with a custom parser similar to the sample one XLSX2CSV provided by the Apache POI project. But this results in all rows processing at one time; and if I store them in memory to read later I also run out of memory. I also didn't have access to the DataFormatter anymore, which I need for cell value processing.

Can someone point me to an example/class that will allow me to do this?

回答1:

My preferrred Streaming API for XML is StAX.

Knowing that a *.xlsx file is simply a ZIP archive and apache pois OPCPackage for this is a ZipPackage, we could think about the following approach:

  • Get the /xl/worksheets/sheetN.xml package part from the *.xlsx Excel ZipPackage.
  • Create a StAX reader on it.
  • Now we can read from this XML using this reader.

The following example creates a rudimentary application which is doing this row by row using a button click.

import java.awt.*;
import java.awt.event.*;
import javax.swing.*;

import org.apache.poi.openxml4j.opc.*;

import javax.xml.stream.*;
import javax.xml.stream.events.*;

import javax.xml.namespace.QName;

import java.util.regex.Pattern;

public class GetExcelRowByRow extends JPanel implements ActionListener {
 protected JButton button;
 protected JTextArea textArea;
 private final static String newline = "\n";

 //file path to Excel file and sheet number to work with
 private final static String filepath = "file.xlsx";
 private final static int scheetnr = 1;

 private StaxExcelRowByRowReader reader;

 public GetExcelRowByRow() {
  super(new GridBagLayout());
  button = new JButton("Next Row");
  button.addActionListener(this);
  textArea = new JTextArea(15, 50) {
   @Override
   public boolean getScrollableTracksViewportWidth() {
    return true;
   }
  };
  textArea.setLineWrap(true);
  textArea.setEditable(false);
  JScrollPane scrollPane = new JScrollPane(textArea);
  GridBagConstraints c = new GridBagConstraints();
  c.gridwidth = GridBagConstraints.REMAINDER;
  c.fill = GridBagConstraints.HORIZONTAL;
  add(button, c);
  c.fill = GridBagConstraints.BOTH;
  c.weightx = 1.0;
  c.weighty = 1.0;
  add(scrollPane, c);

  try {
   reader = new StaxExcelRowByRowReader(filepath, scheetnr);
  } catch (Exception ex) {
   ex.printStackTrace();
  }

 }
 @Override
 public void actionPerformed(ActionEvent evt) {
  String row = "Row not found...";
  try {
   row = reader.getNextRow();
  } catch (Exception ex) {
   ex.printStackTrace();
  }
  textArea.append(row + newline);
  textArea.setCaretPosition(textArea.getDocument().getLength());
 }

 public StaxExcelRowByRowReader getReader() {
  return reader;
 }

 private static void createAndShowGUI() {
  JFrame frame = new JFrame("Get Excel row by row");
  frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
  GetExcelRowByRow app = new GetExcelRowByRow();
  frame.add(app);

  frame.addWindowListener(new WindowAdapter() {
   @Override
   public void windowClosing(WindowEvent windowEvent) {
    try {
     app.getReader().close();
    } catch (Exception ex) {
     ex.printStackTrace();
    }
    System.exit(0);
   }
  });

  frame.pack();
  frame.setVisible(true);
 }

 public static void main(String[] args) {
  javax.swing.SwingUtilities.invokeLater(new Runnable() {
   public void run() {
    createAndShowGUI();
   }
  });
 }

 //class for reading a /xl/worksheets/sheetN.xml package part from a *.xlsx Excel ZipPackage 
 private class StaxExcelRowByRowReader {

  private XMLEventReader sheetreader;
  private OPCPackage opcpackage; 

  public StaxExcelRowByRowReader(String filepath, int sheetnr) {
   try {
    opcpackage = OPCPackage.open(filepath, PackageAccess.READ);
    //get the sheet package part
    PackagePart sheetpart = opcpackage.getPartsByName(Pattern.compile("/xl/worksheets/sheet"+sheetnr+".xml")).get(0);
    //create reader for the sheet package part            
    sheetreader = XMLInputFactory.newInstance().createXMLEventReader(sheetpart.getInputStream());
   } catch (Exception ex) {
    ex.printStackTrace();
   }
  }

  //method for getting the next row from the reader
  public String getNextRow() throws Exception {
   StringBuffer row = new StringBuffer();
   boolean valueFound = false;
   boolean nextValueIsSharedString = false;
   while(sheetreader.hasNext()){
    XMLEvent event = sheetreader.nextEvent();
    if(event.isStartElement()) {
     StartElement startElement = (StartElement)event;
     QName startElementName = startElement.getName();
     if(startElementName.getLocalPart().equalsIgnoreCase("row")) { //start element of row
      row.append("<row");
      row.append(" " + startElement.getAttributeByName(new QName("r")));
      row.append(">");
     } else if(startElementName.getLocalPart().equalsIgnoreCase("c")) { //start element of cell
      row.append("<c");
      row.append(" " + startElement.getAttributeByName(new QName("r")));
      row.append(" " + startElement.getAttributeByName(new QName("s")));
      row.append(" " + startElement.getAttributeByName(new QName("t")));
      row.append(">");
      Attribute type = startElement.getAttributeByName(new QName("t"));
      if (type != null && "s".equals(type.getValue())) {
       nextValueIsSharedString = true;
      } else {
       nextValueIsSharedString = false;
      }
     } else if(startElementName.getLocalPart().equalsIgnoreCase("v")) { //start element of value
      row.append("<v>");
      valueFound = true;
     }
    } else if(event.isCharacters() && valueFound) {
     Characters characters = (Characters)event;
     if (nextValueIsSharedString) {
      row.append("shared string: " + characters.getData());
     } else {
      row.append(characters.getData());
     }
    } else if(event.isEndElement()) {
     EndElement endElement = (EndElement)event;
     QName endElementName = endElement.getName();
     if(endElementName.getLocalPart().equalsIgnoreCase("v")) { //end element of value
      row.append("</v>");
      valueFound = false;
     } else if(endElementName.getLocalPart().equalsIgnoreCase("c")) { //end element of cell
      row.append("</c>");
     } else if(endElementName.getLocalPart().equalsIgnoreCase("row")) { //end element of row
      row.append("</row>");
      return row.toString();
     }
    }
   }
   return "No more rows.";
  }

  public void close() throws Exception {
   if (sheetreader != null) sheetreader.close();
   if (opcpackage != null) opcpackage.close();
  }
 }
}

For sure this is only a draft to show the principle. The whole application would be much more code.

Next we would have to read and parse the /xl/sharedStrings.xml package part, which contains the shared strings. Also we have to read and parse the /xl/styles.xml package part which contains the cell styles. We need the styles to detect whether a numeric value is a date or a number and if a number, then what kind of number. This is necessary because Excel stores all kinds of numbers as double values. And dates also are numbers as doubles, meaning days after 01/01/1900 with fractional part as 1h = 1/24, 1m = 1/24/60, 1s = 1/24/60/60.

But this is possible using the same approach as with the /xl/worksheets/sheetN.xml package part.