Extract text from an XPS Document [closed]

2019-03-06 10:07发布

问题:

I need to extract the text of a specific page from an XPS document. The extracted text should be written to a string. I need this so that the extracted text can be read aloud using Microsoft's SpeechLib. Please provide examples in C# only.

Thanks

回答1:

Add References to ReachFramework and WindowsBase and the following using statement:

using System.Windows.Xps.Packaging;

Then use this code:

// Open the XPS package read-only. It is closed in the finally block below so the
// underlying file handle is released even if reading a page throws.
XpsDocument _xpsDocument = new XpsDocument("/path", System.IO.FileAccess.Read);
string _fullPageText;
try
{
    IXpsFixedDocumentSequenceReader fixedDocSeqReader
        = _xpsDocument.FixedDocumentSequenceReader;

    // Only the first fixed document of the sequence is inspected.
    IXpsFixedDocumentReader _document = fixedDocSeqReader.FixedDocuments[0];

    // NOTE(review): FixedPages is indexed from 0, while DocumentViewer.MasterPageNumber
    // appears to be 1-based (0 meaning "no document loaded") - confirm whether this
    // index needs a "- 1" before relying on it.
    IXpsFixedPageReader _page
        = _document.FixedPages[documentViewerElement.MasterPageNumber];

    StringBuilder _currentText = new StringBuilder();
    System.Xml.XmlReader _pageContentReader = _page.XmlReader;
    if (_pageContentReader != null)
    {
        while (_pageContentReader.Read())
        {
            // The page text is stored in the UnicodeString attribute of Glyphs nodes.
            if (_pageContentReader.Name != "Glyphs")
            {
                continue;
            }

            // GetAttribute returns null when the attribute is absent, so a single
            // lookup covers both the "no attributes" and "no UnicodeString" cases.
            string unicodeString = _pageContentReader.GetAttribute("UnicodeString");
            if (unicodeString != null)
            {
                _currentText.Append(unicodeString);
            }
        }
    }

    _fullPageText = _currentText.ToString();
}
finally
{
    _xpsDocument.Close();
}

The text is stored in the UnicodeString attribute of each Glyphs element. You have to read the fixed page's markup with an XmlReader to get at it.



回答2:

    /// <summary>
    ///   Reads the text of every page of an XPS file and returns it as one string.
    ///   The text is taken from the UnicodeString attribute of each Glyphs node and
    ///   appended in reading order with no separator between strings or pages.
    /// </summary>
    /// <param name="fileName">Path of the XPS file to read.</param>
    /// <returns>
    ///   The concatenated text, or an empty string when the package exposes no
    ///   fixed document sequence reader.
    /// </returns>
    private string ReadXpsFile(string fileName)
    {
        XpsDocument xpsDocument = new XpsDocument(fileName, System.IO.FileAccess.Read);
        try
        {
            IXpsFixedDocumentSequenceReader fixedDocSeqReader
                = xpsDocument.FixedDocumentSequenceReader;
            if (fixedDocSeqReader == null)
            {
                return string.Empty;
            }

            // One builder for the whole file avoids repeated string concatenation.
            StringBuilder fullText = new StringBuilder();

            // Walk every fixed document, not just the first one: the page count of the
            // whole sequence can exceed the page count of FixedDocuments[0].
            foreach (IXpsFixedDocumentReader document in fixedDocSeqReader.FixedDocuments)
            {
                foreach (IXpsFixedPageReader page in document.FixedPages)
                {
                    System.Xml.XmlReader pageContentReader = page.XmlReader;
                    if (pageContentReader == null)
                    {
                        continue;
                    }

                    while (pageContentReader.Read())
                    {
                        if (pageContentReader.Name != "Glyphs")
                        {
                            continue;
                        }

                        // GetAttribute returns null when the attribute is missing.
                        string unicodeString = pageContentReader.GetAttribute("UnicodeString");
                        if (unicodeString != null)
                        {
                            fullText.Append(unicodeString);
                        }
                    }
                }
            }

            return fullText.ToString();
        }
        finally
        {
            // Always release the package so the file is not left locked.
            xpsDocument.Close();
        }
    }


回答3:

A method that returns the text from all pages (a modified version of Amir's code; I hope that's OK):

/// <summary>
///   Get all text strings from an XPS file.
///   Returns a list of lists (one for each page) containing the text strings.
/// </summary>
/// <param name="xpsFilePath">Path of the XPS file to read.</param>
/// <returns>
///   One inner list per page that has a content reader, holding the UnicodeString
///   value of each Glyphs node on that page, in document order. Returns null when
///   the package exposes no fixed document sequence reader.
/// </returns>
private static List<List<string>> ExtractTextFromXps(string xpsFilePath)
{
   const string UnicodeString = "UnicodeString";
   const string GlyphsString = "Glyphs";

   var xpsDocument = new XpsDocument(xpsFilePath, FileAccess.Read);
   try
   {
      var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
      if (fixedDocSeqReader == null)
         return null;

      var textLists = new List<List<string>>();
      foreach (IXpsFixedDocumentReader fixedDocumentReader in fixedDocSeqReader.FixedDocuments)
      {
         foreach (IXpsFixedPageReader pageReader in fixedDocumentReader.FixedPages)
         {
            var pageContentReader = pageReader.XmlReader;
            if (pageContentReader == null)
               continue;

            var texts = new List<string>();
            while (pageContentReader.Read())
            {
               if (pageContentReader.Name != GlyphsString)
                  continue;

               // GetAttribute returns null when the attribute is absent, so one
               // lookup replaces the HasAttributes check and the second call.
               var text = pageContentReader.GetAttribute(UnicodeString);
               if (text != null)
                  texts.Add(text);
            }
            textLists.Add(texts);
         }
      }
      return textLists;
   }
   finally
   {
      // Runs on the early "return null" path and on exceptions as well, so the
      // file handle is never left open.
      xpsDocument.Close();
   }
}

Usage:

// Extract the text of every page, then print each page under a 1-based heading.
var txtLists = ExtractTextFromXps(@"C:\myfile.xps");

int pageIdx = 0;
while (pageIdx < txtLists.Count)
{
   List<string> pageTexts = txtLists[pageIdx];

   // Advance first so the heading shows the human-friendly page number.
   pageIdx++;
   Console.WriteLine("== Page {0} ==", pageIdx);

   // Each text string goes on its own line, indented by one space.
   pageTexts.ForEach(text => Console.WriteLine(" " + text));
   Console.WriteLine();
}


回答4:

Full Code of Class:

using System.Collections.Generic;
using System.Drawing;
using System.Windows.Forms;
using System.Windows.Xps.Packaging;

namespace XPS_Data_Transfer
{
    /// <summary>
    ///   Reads the text strings of a single page of an XPS file.
    /// </summary>
    internal static class XpsDataReader
    {
        /// <summary>
        ///   Returns the text strings found on one page of an XPS file. Each string is
        ///   the UnicodeString attribute of a Glyphs node, passed through
        ///   Dashboard.CleanReversedPersianText, in document order.
        /// </summary>
        /// <param name="address">Path of the XPS file to read.</param>
        /// <param name="pageNumber">
        ///   1-based page number, counted across all fixed documents of the file.
        /// </param>
        /// <returns>
        ///   The list of text strings, or null when the package exposes no fixed
        ///   document sequence reader or the page has no content reader.
        /// </returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        ///   <paramref name="pageNumber"/> is less than 1 or greater than the number
        ///   of pages in the file.
        /// </exception>
        public static List<string> ReadXps(string address, int pageNumber)
        {
            const string uniStr = "UnicodeString";
            const string glyphs = "Glyphs";

            var xpsDocument = new XpsDocument(address, System.IO.FileAccess.Read);
            try
            {
                var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
                if (fixedDocSeqReader == null) return null;

                if (pageNumber < 1)
                    throw new System.ArgumentOutOfRangeException("pageNumber", pageNumber,
                        "Page numbers start at 1.");

                var page = FindPage(fixedDocSeqReader, pageNumber);
                if (page == null)
                    throw new System.ArgumentOutOfRangeException("pageNumber", pageNumber,
                        "The document has fewer pages than the requested page number.");

                var pageContentReader = page.XmlReader;
                if (pageContentReader == null) return null;

                var currentText = new List<string>();
                while (pageContentReader.Read())
                {
                    if (pageContentReader.Name != glyphs) continue;

                    // GetAttribute returns null when the attribute is absent.
                    var text = pageContentReader.GetAttribute(uniStr);
                    if (text != null)
                        currentText.Add(Dashboard.CleanReversedPersianText(text));
                }
                return currentText;
            }
            finally
            {
                // Release the package on every path, including early returns and throws.
                xpsDocument.Close();
            }
        }

        // Locates the page with the given 1-based number, counting pages across every
        // fixed document of the sequence; returns null when the number is past the end.
        private static IXpsFixedPageReader FindPage(
            IXpsFixedDocumentSequenceReader sequenceReader, int pageNumber)
        {
            var remaining = pageNumber;
            foreach (IXpsFixedDocumentReader document in sequenceReader.FixedDocuments)
            {
                var pages = document.FixedPages;
                if (remaining <= pages.Count) return pages[remaining - 1];
                remaining -= pages.Count;
            }
            return null;
        }
    }
}

This returns a list of the text strings found on the requested page of the given file.