Is it possible to parse MS Word using Apache POI a

2020-06-18 04:41发布

问题:

Is it possible to convert a MS Word to XML file using Apache POI ?

If it is, can you point me to any tutorials for doing that?

回答1:

I'd say you have two options, both powered by Apache POI

One is to use Apache Tika. Tika is a text and metadata extraction toolkit, and is able to extract fairly rich text from Word documents by making appropriate calls to POI. The result is that Tika will give you XHTML style XML for the contents of your word document.

The other option is to use a class that was added fairly recently to POI, which is WordToHtmlConverter. This will turn your word document into HTML for you, and generally will preserve slightly more of the structure and formatting than Tika will.

Depending on the kind of XML you're hoping to get out, one of these should be a good bet for you. I'd suggest you try both against some of your sample files, and see which one is the best fit for your problem domain and needs.



回答2:

The purpose of the HWPF subproject is exactly that: processing Word files.

http://poi.apache.org/hwpf/index.html

Then, to convert the data to XML, you have to build the XML in one of the usual ways: StAX, JDOM, XStream...

Apache offers a Quick Guide:

http://poi.apache.org/hwpf/quick-guide.html

I also found this tutorial:

http://sanjaal.com/java/tag/simple-java-tutorial-to-read-microsoft-document-in-java/

If you want to process docx files, you might want to look at the OpenXML4J subproject:

http://poi.apache.org/oxml4j/index.html



回答3:

package com.govind.service;
import java.io.File;
import java.io.FileInputStream;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * DOC to XML converter service.
 *
 * <p>Reads a Word document ({@code .docx} through XWPF, {@code .doc} through
 * HWPF), maps each paragraph or character run to an element under a single
 * {@code ROOT} element according to its style name, and writes the result to
 * {@code <path>/<fileName>.xml}.
 *
 * <p>The DOM under construction is kept in instance fields and no
 * synchronization is performed, so one instance must not run two conversions
 * concurrently. Sequential reuse is supported: every conversion starts from a
 * freshly initialized document.
 *
 * @author govind.sharma
 *
 */

public class DocToXmlConverter {
    static final Logger logger = Logger.getLogger(DocToXmlConverter.class);
    DocumentBuilderFactory docFactory = null;
    DocumentBuilder docBuilder = null;
    Element rootElement = null;
    Document docxml = null;
    // True when the most recently emitted element should be followed by a NEWLINE element.
    boolean subHeaders = false;
    // Current UL element that LI entries are appended to; null until a list is opened.
    Element UrlElement = null;

    /**
     * Converts {@code <path>/<fileName>.docx} into {@code <path>/<fileName>.xml}.
     *
     * <p>Failures are logged (with stack trace) and the method returns
     * without terminating the JVM.
     *
     * @param path     directory containing the document; also the output directory
     * @param fileName document name without the {@code .docx} extension
     */
    public void processDocxToXml(String path, String fileName) {
        String fullPath = path + "/" + fileName + ".docx";

        // try-with-resources guarantees the input stream is closed on every path.
        try (FileInputStream fis = new FileInputStream(fullPath)) {
            XWPFDocument xdoc = new XWPFDocument(OPCPackage.open(fis));

            initializeXml();

            // One XML element per body paragraph.
            List<XWPFParagraph> paragraphList = xdoc.getParagraphs();
            for (XWPFParagraph paragraph : paragraphList) {
                createXmlTags(paragraph.getStyle(), paragraph.getParagraphText(), paragraph.getNumFmt());
            }

            generateXml(path, fileName);
            logger.info("Doc to Xml Convertion completed.");
        } catch (Exception ex) {
            // Pass the throwable so the stack trace is not lost.
            logger.error("Exception while generating XML from DOC", ex);
        }
    }

    /**
     * Converts {@code <path>/<fileName>.doc} into {@code <path>/<fileName>.xml}.
     *
     * <p>Every character run of every paragraph is emitted as its own element,
     * tagged according to the run's style name. Failures are logged (with
     * stack trace) and the method returns without terminating the JVM.
     *
     * @param path     directory containing the document; also the output directory
     * @param fileName document name without the {@code .doc} extension
     */
    public void processDocToXml(String path, String fileName) {
        String fullPath = path + "/" + fileName + ".doc";

        HWPFDocument doc;
        try (FileInputStream fis = new FileInputStream(fullPath)) {
            doc = new HWPFDocument(new POIFSFileSystem(fis));
        } catch (Exception e) {
            logger.error("Unable to Read File...", e);
            return;
        }

        try {
            Range range = doc.getRange();

            initializeXml();

            // Bound both loops by the counts the range reports, so a paragraph
            // without character runs is skipped instead of throwing.
            int paragraphCount = range.numParagraphs();
            for (int i = 0; i < paragraphCount; i++) {
                org.apache.poi.hwpf.usermodel.Paragraph pr = range.getParagraph(i);

                int runCount = pr.numCharacterRuns();
                for (int j = 0; j < runCount; j++) {
                    CharacterRun run = pr.getCharacterRun(j);

                    StyleDescription style = doc.getStyleSheet().getStyleDescription(run.getStyleIndex());
                    // A missing style description is treated like "no style".
                    String styleName = (style != null) ? style.getName() : null;

                    // Bullet information is not extracted for .doc input.
                    createXmlTags(styleName, run.text(), null);
                }
            }

            generateXml(path, fileName);

            logger.info("Document to Xml Convertion completed.");
        } catch (Exception ex) {
            logger.error("Exception while generating XML from DOC", ex);
        }
    }

    /**
     * Creates a fresh DOM document with an empty {@code ROOT} element and
     * resets the per-conversion state, so that a reused instance never
     * attaches nodes to elements belonging to a previous document.
     *
     * @throws IllegalStateException if no XML document builder can be created
     */
    private void initializeXml() {
        try {
            docFactory = DocumentBuilderFactory.newInstance();
            docBuilder = docFactory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException("Exception while initializing XML", e);
        }

        docxml = docBuilder.newDocument();
        rootElement = docxml.createElement("ROOT");
        docxml.appendChild(rootElement);

        UrlElement = null;
        subHeaders = false;
    }

    /**
     * Appends the XML element that corresponds to one piece of document text.
     *
     * <p>Style mapping (case-insensitive): {@code Style4} becomes TITLE (text
     * trimmed); {@code Default} and {@code Normal} become P; {@code BodyCopy}
     * with bullets and {@code ListParagraph} become LI inside the current UL;
     * {@code Subheader1} opens a new UL with a first LI; {@code BodyCopy}
     * without bullets and any other style become PS. Text without a style
     * becomes P. Text of at most one character produces no element. After
     * the mapping, a NEWLINE element is appended whenever the
     * {@code subHeaders} flag is set.
     *
     * @param styleName     style name of the text, or {@code null} if none
     * @param paragraphText text to emit; {@code null} is ignored
     * @param bulletsPoints numbering format of the paragraph, or {@code null}
     *                      when the paragraph is not numbered/bulleted
     */
    private void createXmlTags(String styleName, String paragraphText, String bulletsPoints) {
        if (paragraphText == null) {
            return;
        }

        if (styleName != null && paragraphText.length() > 1) {
            if (styleName.equalsIgnoreCase("Style4")) {
                appendToRoot("TITLE", paragraphText.trim());
                subHeaders = true;
            } else if (styleName.equalsIgnoreCase("Default") || styleName.equalsIgnoreCase("Normal")) {
                appendToRoot("P", paragraphText);
                subHeaders = true;
            } else if (styleName.equalsIgnoreCase("BodyCopy") && bulletsPoints != null) {
                appendListItem(paragraphText);
                subHeaders = false;
            } else if (styleName.equalsIgnoreCase("BodyCopy")) {
                appendToRoot("PS", paragraphText);
                subHeaders = true;
            } else if (styleName.equalsIgnoreCase("ListParagraph")) {
                appendListItem(paragraphText);
                subHeaders = false;
            } else if (styleName.equalsIgnoreCase("Subheader1")) {
                // A sub-header always starts a new list.
                UrlElement = docxml.createElement("UL");
                appendListItem(paragraphText);
                rootElement.appendChild(UrlElement);
                subHeaders = false;
            } else {
                appendToRoot("PS", paragraphText);
                subHeaders = true;
            }
        } else if (paragraphText.trim().length() > 1) {
            appendToRoot("P", paragraphText);
            subHeaders = true;
        }

        if (subHeaders) {
            appendToRoot("NEWLINE", "");
        }
    }

    /** Appends {@code <tagName>text</tagName>} directly under the root element. */
    private void appendToRoot(String tagName, String text) {
        Element element = docxml.createElement(tagName);
        element.appendChild(docxml.createTextNode(text));
        rootElement.appendChild(element);
    }

    /**
     * Appends an LI element to the current UL, opening a UL under the root
     * first when a list item arrives before any list has been started.
     */
    private void appendListItem(String text) {
        if (UrlElement == null) {
            UrlElement = docxml.createElement("UL");
            rootElement.appendChild(UrlElement);
        }
        Element item = docxml.createElement("LI");
        item.appendChild(docxml.createTextNode(text));
        UrlElement.appendChild(item);
    }

    /**
     * Serializes the DOM as indented UTF-8 XML to {@code <path>/<fileName>.xml}.
     *
     * @param path     output directory
     * @param fileName output file name without the {@code .xml} extension
     * @throws javax.xml.transform.TransformerException if the document cannot
     *         be written; propagated so the caller does not report success
     */
    private void generateXml(String path, String fileName) throws javax.xml.transform.TransformerException {
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Transformer transformer = transformerFactory.newTransformer();
        transformer.setOutputProperty(OutputKeys.METHOD, "xml");
        transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");

        DOMSource source = new DOMSource(docxml);
        StreamResult result = new StreamResult(new File(path + "/" + fileName + ".xml"));
        transformer.transform(source, result);
    }

}