Convert OOXML inline formatting to a merged element

2019-03-29 05:34发布

问题:

In OOXML, formatting such as bold, italic, etc. can be (and often annoyingly is) split up between multiple elements, like so:

<w:p>
    <w:r>
        <w:rPr>
            <w:b/>
         </w:rPr>
         <w:t xml:space="preserve">This is a </w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t xml:space="preserve">bold </w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
            <w:i/>
        </w:rPr>
        <w:t>with a bit of italic</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t xml:space="preserve"> </w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t>paragr</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t>a</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t>ph</w:t>
    </w:r>
    <w:r>
        <w:t xml:space="preserve"> with some non-bold in it too.</w:t>
    </w:r>
</w:p>

I need to combine these formatting elements to produce this:

<p><b>This is a bold <i>with a bit of italic</i> paragraph</b> with some non-bold in it too.</p>

My initial approach was going to be to write out the start formatting tag when it is first encountered using:

 <xsl:text disable-output-escaping="yes">&lt;b&gt;</xsl:text>

And then after I process each <w:r>, check the next one to see if the formatting is still present. If it's not, add the end tag in the same way I add the start tag. I keep thinking there must be a better way to do this, and I'd be grateful for any suggestions.

Should also mention that I am tied to XSLT 1.0.

The reason for needing this, is that we need to compare an XML file before it is transformed into OOXML, and after it is transformed out of OOXML. The extra formatting tags make it appear as though changes were made when they were not.

回答1:

Here is a complete XSLT 1.0 solution:

<xsl:stylesheet version="1.0"
 xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
 xmlns:ext="http://exslt.org/common" xmlns:w="w"
 exclude-result-prefixes="ext w">
 <xsl:output omit-xml-declaration="yes"/>
 <xsl:strip-space elements="*"/>

 <!-- Paragraph: render the runs into a temporary tree (pass 1), then hand
      that tree to the "pass2" templates, which merge adjacent elements. -->
 <xsl:template match="w:p">
  <!-- Pass 1 output: a <p> holding whatever the run templates emit. -->
  <xsl:variable name="firstPassTree">
   <p>
    <xsl:apply-templates select="node()"/>
   </p>
  </xsl:variable>

  <!-- In XSLT 1.0 a result tree fragment has to go through ext:node-set()
       before location steps can be applied to it. -->
  <xsl:variable name="firstPassRoot"
   select="ext:node-set($firstPassTree)/*"/>

  <xsl:apply-templates select="$firstPassRoot" mode="pass2"/>
 </xsl:template>

 <!-- Run: turn the run's properties into nested result elements that wrap
      the run's text. -->
 <xsl:template match="w:r">
  <!-- Copy the property elements sorted by local name, so that every run
       carrying the same set of properties produces the same nesting order
       no matter how the properties were ordered in the source. -->
  <xsl:variable name="sortedProps">
   <xsl:for-each select="child::w:rPr/child::*">
    <xsl:sort select="local-name()"/>
    <xsl:copy-of select="self::node()"/>
   </xsl:for-each>
  </xsl:variable>

  <!-- ext:node-set() makes the sorted copies addressable as a node-set. -->
  <xsl:call-template name="toHtml">
   <xsl:with-param name="pProps"
    select="ext:node-set($sortedProps)/*"/>
   <xsl:with-param name="pText" select="child::w:t/child::text()"/>
  </xsl:call-template>
 </xsl:template>

 <!--
   toHtml: wraps $pText in one result element per node of $pProps, the
   first property being the outermost element.

   pProps: property elements still to be turned into wrappers; each wrapper
           is named after the local name of the property it represents.
   pText:  text nodes placed inside the innermost wrapper.
 -->
 <xsl:template name="toHtml">
  <xsl:param name="pProps"/>
  <xsl:param name="pText"/>

  <xsl:choose>
   <!-- At least one property left: open its wrapper and recurse on the
        remaining properties. -->
   <xsl:when test="$pProps">
    <xsl:variable name="vWrapperName" select="local-name($pProps[1])"/>
    <xsl:element name="{$vWrapperName}">
     <xsl:call-template name="toHtml">
      <xsl:with-param name="pProps" select="$pProps[position() != 1]"/>
      <xsl:with-param name="pText" select="$pText"/>
     </xsl:call-template>
    </xsl:element>
   </xsl:when>
   <!-- No properties left: emit the text itself. -->
   <xsl:otherwise>
    <xsl:copy-of select="$pText"/>
   </xsl:otherwise>
  </xsl:choose>
 </xsl:template>

  <!-- Pass 2 entry point: matches the top element of the tree it is applied
       to, copies that element with its attributes, and lets processInner
       emit its children with adjacent same-named elements merged. -->
  <xsl:template match="/*" mode="pass2">
  <xsl:copy>
    <xsl:copy-of select="attribute::*"/>
    <xsl:call-template name="processInner">
     <xsl:with-param name="pNodes" select="child::node()"/>
    </xsl:call-template>
  </xsl:copy>
 </xsl:template>

 <!--
   processInner: emits the node-set $pNodes from left to right, merging every
   run of consecutive, identically named elements into one element. The
   children of a merged run are processed by a recursive call, so runs at any
   depth are merged as well.

   pNodes: the nodes still to be emitted. They are addressed by position in
           the node-set rather than through sibling axes, because below the
           top level the nodes being merged come from different parents.

   Elements are grouped by name only; attributes play no part in deciding
   whether two neighbours belong to the same run.
 -->
 <xsl:template name="processInner">
  <xsl:param name="pNodes"/>

  <xsl:variable name="pNode1" select="$pNodes[1]"/>

  <!-- An empty $pNodes ends the recursion. -->
  <xsl:if test="$pNode1">
   <xsl:choose>
    <!-- Anything that is not an element is copied as is, then the rest of
         the list is processed. -->
    <xsl:when test="not($pNode1/self::*)">
     <xsl:copy-of select="$pNode1"/>
     <xsl:call-template name="processInner">
      <xsl:with-param name="pNodes" select=
      "$pNodes[position()>1]"/>
     </xsl:call-template>
    </xsl:when>
    <xsl:otherwise>
      <!-- Number of leading nodes of $pNodes, $pNode1 included, that are
           elements with the same name as $pNode1. -->
      <xsl:variable name="vbatchLength">
        <xsl:call-template name="getBatchLength">
         <xsl:with-param name="pNodes"
              select="$pNodes[position()>1]"/>
         <xsl:with-param name="pName"
             select="name($pNode1)"/>
         <xsl:with-param name="pCount" select="1"/>
        </xsl:call-template>
      </xsl:variable>

      <xsl:element name="{name($pNode1)}">
        <!-- xsl:call-template leaves the current node unchanged, so a bare
             @* would copy the attributes of whichever node was current when
             the recursion started instead of those of the element being
             rebuilt. Address the first element of the run explicitly. -->
        <xsl:copy-of select="$pNode1/@*"/>

        <!-- The children of every element in the run, taken together,
             become the content of the single merged element. -->
        <xsl:call-template name="processInner">
         <xsl:with-param name="pNodes" select=
         "$pNodes[not(position()>$vbatchLength)]
                        /node()"/>
        </xsl:call-template>
      </xsl:element>

      <!-- Continue with whatever follows the run. -->
      <xsl:call-template name="processInner">
       <xsl:with-param name="pNodes" select=
       "$pNodes[position()>$vbatchLength]"/>
      </xsl:call-template>
    </xsl:otherwise>
   </xsl:choose>
  </xsl:if>
 </xsl:template>

 <!--
   getBatchLength: outputs the length of a run of identically named elements.

   pNodes: the nodes that follow the ones already counted.
   pName:  the element name every member of the run must have.
   pCount: how many nodes have been counted so far (processInner passes 1,
           accounting for the element that starts the run).

   Counting stops at the end of the list, at the first non-element node, or
   at the first element whose name differs from $pName.
 -->
 <xsl:template name="getBatchLength">
  <xsl:param name="pNodes"/>
  <xsl:param name="pName"/>
  <xsl:param name="pCount"/>

  <xsl:variable name="vNext" select="$pNodes[1]"/>

  <xsl:choose>
   <!-- The next node is an element with the wanted name: count it and
        look at the one after it. -->
   <xsl:when test="$vNext/self::* and name($vNext) = $pName">
    <xsl:call-template name="getBatchLength">
     <xsl:with-param name="pNodes" select="$pNodes[position() != 1]"/>
     <xsl:with-param name="pName" select="$pName"/>
     <xsl:with-param name="pCount" select="$pCount + 1"/>
    </xsl:call-template>
   </xsl:when>
   <!-- The run has ended: report the total. -->
   <xsl:otherwise>
    <xsl:value-of select="$pCount"/>
   </xsl:otherwise>
  </xsl:choose>
 </xsl:template>
</xsl:stylesheet>

When this transformation is applied to the following XML document (based on the one provided, but made more complicated to show how more edge cases are covered):

<w:p xmlns:w="w">
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t xml:space="preserve">This is a </w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t xml:space="preserve">bold </w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
            <w:i/>
        </w:rPr>
        <w:t>with a bit of italic</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
            <w:i/>
        </w:rPr>
        <w:t> and some more italic</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:i/>
        </w:rPr>
        <w:t> and just italic, no-bold</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t xml:space="preserve"></w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t>paragr</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t>a</w:t>
    </w:r>
    <w:r>
        <w:rPr>
            <w:b/>
        </w:rPr>
        <w:t>ph</w:t>
    </w:r>
    <w:r>
        <w:t xml:space="preserve"> with some non-bold in it too.</w:t>
    </w:r>
</w:p>

the wanted, correct result is produced:

<p><b>This is a bold <i>with a bit of italic and some more italic</i></b><i> and just italic, no-bold</i><b>paragraph</b> with some non-bold in it too.</p>

Explanation:

  1. This is a two-pass transformation. The first pass is relatively simple and converts the source XML document (in our specific case) to the following:

pass1 result (indented for readability):

<p>
   <b>This is a </b>
   <b>bold </b>
   <b>
      <i>with a bit of italic</i>
   </b>
   <b>
      <i> and some more italic</i>
   </b>
   <i> and just italic, no-bold</i>
   <b/>
   <b>paragr</b>
   <b>a</b>
   <b>ph</b> with some non-bold in it too.</p>

2. The second pass (executed in mode "pass2") merges any batch of consecutive and identically named elements into a single element with that name. It recursively calls itself on the children of the merged elements -- thus batches at any depth are merged.

3. Do note: We do not (and cannot) use the axes following-sibling:: or preceding-sibling::, because only the nodes (to be merged) at the top level are really siblings. For this reason we process all nodes just as a node-set.

4. This solution is completely generic -- it merges any batch of consecutive identically-named elements at any depth -- and no specific names are hardcoded.



回答2:

This isn't really a complete solution, but it's far simpler than trying to do it with pure XSLT. Depending on the complexity of your source it might not be ideal either, but it might be worth a try. These templates:

<!-- Paragraph: wrap whatever the child templates produce in a <p>. -->
<xsl:template match="w:p">
  <p><xsl:apply-templates select="node()"/></p>
</xsl:template>

<!-- Run whose properties include w:b: wrap its content in <b>. -->
<xsl:template match="w:r[w:rPr/w:b]">
  <b><xsl:apply-templates select="node()"/></b>
</xsl:template>

<!-- Run whose properties include w:i: wrap its content in <i>. -->
<xsl:template match="w:r[w:rPr/w:i]">
  <i><xsl:apply-templates select="node()"/></i>
</xsl:template>

<!-- Run that is both bold and italic: wrap its content in <b><i>.
     Such a run also matches the bold-only and the italic-only templates,
     and all three patterns have the same default priority (0.5). Without an
     explicit priority the match is ambiguous and the result depends on the
     processor's error recovery, so this more specific rule is ranked above
     the other two. -->
<xsl:template match="w:r[w:rPr/w:i and w:rPr/w:b]" priority="1">
  <b>
    <i>
      <xsl:apply-templates />
    </i>
  </b>
</xsl:template>

Will output <p><b>This is a </b><b>bold </b><b><i>with a bit of italic</i></b><b> </b><b>paragr</b><b>a</b><b>ph</b> with some non-bold in it too.</p>

You can then use simple text manipulation to remove any occurrences of </b><b>, and </i><i>, leaving you with:

<p><b>This is a bold <i>with a bit of italic</i> paragraph</b> with some non-bold in it too.</p>



回答3:

OOXML is a defined standard which has its own specification. To create a general transform from OOXML to HTML (that's interesting, even if I think there should be already existing implementations around the web) you should study at least a bit of the standard (and you need to study a bit of XSLT I think).

Generally (very generally), the content of a WordML document is mainly composed of w:p (paragraph) elements containing w:r runs (regions of text with the same properties). Inside each run, you can normally find the text properties of the region (w:rPr) and the text itself (w:t).

The model is much more intricate, but you can start working with this general structure.

For instance, you can start with the following (somewhat) general transform. Note that it handles only paragraphs with bold, italic and underlined text.


XSLT 2.0 tested under Saxon-HE 9.2.1.1J

<xsl:stylesheet version="2.0" 
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml"
    exclude-result-prefixes="w">
    <xsl:output method="html"/>
    <xsl:strip-space elements="*"/>

    <!-- Document body: emit the HTML skeleton and convert each paragraph
         that is a direct child of the body. -->
    <xsl:template match="w:document/w:body">
        <html>
            <body>
                <xsl:apply-templates select="child::w:p"/>
            </body>
        </html>
    </xsl:template>

    <!-- Paragraph: becomes an HTML <p>; only its runs are processed. -->
    <xsl:template match="w:p">
        <p><xsl:apply-templates select="child::w:r"/></p>
    </xsl:template>

    <!-- Run with properties: start the nesting at the first bold, italic or
         underline property. Dispatching on w:rPr/*[1] lost the run's text
         whenever the first property was some other element, because no
         template then reached w:t. When the run has none of the three
         properties its text is emitted directly. -->
    <xsl:template match="w:r[w:rPr]">
        <xsl:variable name="firstInline"
            select="(w:rPr/w:b | w:rPr/w:i | w:rPr/w:u)[1]"/>
        <xsl:choose>
            <xsl:when test="$firstInline">
                <xsl:apply-templates select="$firstInline"/>
            </xsl:when>
            <xsl:otherwise>
                <xsl:apply-templates select="w:t"/>
            </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <!-- Recursive template for bold, italic and underline properties applied
         to the same run: each property becomes an element named after its
         local name, nested inside the element of the property before it;
         the innermost element receives the run's text.

         Only the NEXT b/i/u sibling is visited. Testing for "any following
         sibling" and then applying templates to all following b/i/u siblings
         duplicated the text when a run had three such properties, and lost
         it when the last property was not one of the three. -->
    <xsl:template match="w:b | w:i | w:u">
        <xsl:variable name="nextInline"
            select="following-sibling::*
                [local-name(.)='i' or
                local-name(.)='b' or
                local-name(.)='u'][1]"/>
        <xsl:element name="{local-name(.)}">
            <xsl:choose>
                <!-- recurse to the next sibling property i, b or u -->
                <xsl:when test="$nextInline">
                    <xsl:apply-templates select="$nextInline"/>
                </xsl:when>
                <xsl:otherwise>
                    <!-- no further b/i/u property: escape to the text -->
                    <xsl:apply-templates select="parent::w:rPr/
                        following-sibling::w:t"/>
                </xsl:otherwise>
            </xsl:choose>
        </xsl:element>
    </xsl:template>

    <!-- Run without a w:rPr child: no wrapper element, just its text. -->
    <xsl:template match="w:r[not(w:rPr)]">
        <xsl:apply-templates select="child::w:t"/>
    </xsl:template>

    <!-- Text element: output its string value. -->
    <xsl:template match="w:t">
        <xsl:value-of select="self::node()"/>
    </xsl:template>

</xsl:stylesheet>

Applied on:

<w:document xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml">
    <w:body>
        <w:p>
            <w:r>
                <w:rPr>
                    <w:b/>
                </w:rPr>
                <w:t xml:space="preserve">This is a </w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:b/>
                </w:rPr>
                <w:t xml:space="preserve">bold </w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:b/>
                    <w:i/>
                </w:rPr>
                <w:t>with a bit of italic</w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:b/>
                </w:rPr>
                <w:t xml:space="preserve"> </w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:b/>
                </w:rPr>
                <w:t>paragr</w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:b/>
                </w:rPr>
                <w:t>a</w:t>
            </w:r>
            <w:r>
                <w:rPr>
                    <w:b/>
                </w:rPr>
                <w:t>ph</w:t>
            </w:r>
            <w:r>
                <w:t xml:space="preserve"> with some non-bold in it too.</w:t>
            </w:r>
        </w:p>
    </w:body>
</w:document>

produces:

<html>
   <body>
      <p><b>This is a </b><b>bold </b><b><i>with a bit of italic</i></b><b> </b><b>paragr</b><b>a</b><b>ph</b> with some non-bold in it too.
      </p>
   </body>
</html>

The side effect of producing grotesque HTML code is unavoidable, due to the underlying WordML schema. Perhaps the task of making the final HTML more legible could be deferred to some user-friendly (and powerful) utility like HTML Tidy.



回答4:

Another approach, similar to Flynn's but staying with XSLT instead of adding a separate text processing layer, would be to transform the initial HTML output in the same stylesheet to collapse the adjacent elements of <b> or <i> into single elements.

In other words, the stylesheet would first generate the initial HTML result tree, then pass that as input to a set of templates (using a special mode) that performed the collapsing operation.

Updated: Here is a working, 2-stage stylesheet, built on @empo's stage-1 stylesheet:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
   xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs w"
   xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" version="2.0">

   <xsl:output method="html"/>
   <xsl:strip-space elements="*"/>
   <xsl:variable name="collapsibles" select="('i', 'b', 'u')"/>      

   <!-- Copies an element and its attributes, then walks its child nodes:
        a child whose local name is listed in $collapsibles and that starts
        a run of same-named neighbours is emitted once, with the content of
        the whole run gathered by process-niblings; later members of the run
        are skipped; every other child is copied and its own children are
        processed in this mode. -->
   <xsl:template match="*" mode="collapse-adjacent">
      <xsl:copy>
         <xsl:copy-of select="@*"/>
         <xsl:for-each select="node()">
            <xsl:variable name="is-collapsible"
               select="local-name() = $collapsibles"/>
            <xsl:variable name="continues-run"
               select="name(preceding-sibling::node()[1]) = name()"/>
            <xsl:choose>
               <!-- not one of the collapsible elements: plain copy -->
               <xsl:when test="not($is-collapsible)">
                  <xsl:copy>
                     <xsl:copy-of select="@*"/>
                     <xsl:apply-templates mode="collapse-adjacent"/>
                  </xsl:copy>
               </xsl:when>
               <!-- first element of a run: emit it with the run's content -->
               <xsl:when test="not($continues-run)">
                  <xsl:copy>
                     <xsl:copy-of select="@*"/>
                     <xsl:call-template name="process-niblings"/>
                  </xsl:copy>
               </xsl:when>
               <!-- later element of a run: already covered, emit nothing -->
            </xsl:choose>
         </xsl:for-each>
      </xsl:copy>
   </xsl:template>

   <!-- Processes, in mode collapse-adjacent, the children of the current
        element and then those of each immediately following sibling, for as
        long as the sibling has the same name as the element before it. -->
   <xsl:template name="process-niblings">
      <xsl:variable name="run-name" select="name()"/>

      <xsl:apply-templates select="node()" mode="collapse-adjacent"/>

      <!-- The for-each selects at most one node; its purpose is to make that
           sibling the context node for the recursive call. -->
      <xsl:for-each
         select="following-sibling::node()[1][name() = $run-name]">
         <xsl:call-template name="process-niblings"/>
      </xsl:for-each>
   </xsl:template>

   <!-- @empo's stylesheet (modified) follows. --> 
   <!-- Entry point. Stage 1 renders the document into the temporary tree
        $raw-html; stage 2 re-processes that tree in mode collapse-adjacent
        and places the result inside the html/body skeleton. -->
   <xsl:template match="/">
      <xsl:variable name="raw-html">
         <xsl:apply-templates select="node()"/>
      </xsl:variable>
      <html>
         <body>
            <xsl:apply-templates select="$raw-html" mode="collapse-adjacent"/>
         </body>
      </html>
   </xsl:template>

   <!-- Structural wrappers: produce nothing themselves, just descend. -->
   <xsl:template match="w:document | w:body">
      <xsl:apply-templates select="node()"/>
   </xsl:template>

   <!-- Paragraph: becomes an HTML <p>; only its runs are processed. -->
   <xsl:template match="w:p">
      <p><xsl:apply-templates select="child::w:r"/></p>
   </xsl:template>

   <!-- Run with properties: start the nesting at the first property whose
        local name is listed in $collapsibles. Dispatching on w:rPr/*[1]
        lost the run's text whenever the first property was some other
        element, because no template then reached w:t. When the run has no
        such property its text is emitted directly. -->
   <xsl:template match="w:r[w:rPr]">
      <xsl:variable name="first-inline"
         select="(w:rPr/*[local-name(.) = $collapsibles])[1]"/>
      <xsl:choose>
         <xsl:when test="$first-inline">
            <xsl:apply-templates select="$first-inline"/>
         </xsl:when>
         <xsl:otherwise>
            <xsl:apply-templates select="w:t"/>
         </xsl:otherwise>
      </xsl:choose>
   </xsl:template>

   <!-- Recursive template for bold, italic and underline properties applied
        to the same run: each property becomes an element named after its
        local name, nested inside the element of the property before it; the
        innermost element receives the run's text.

        Only the NEXT collapsible sibling is visited. Testing for "any
        following sibling" and then applying templates to all following
        collapsible siblings duplicated the text when a run had three such
        properties, and lost it when the last property was not collapsible. -->
   <xsl:template match="w:b | w:i | w:u">
      <xsl:variable name="next-inline"
         select="following-sibling::*
            [local-name(.) = $collapsibles][1]"/>
      <xsl:element name="{local-name(.)}">
         <xsl:choose>
            <!-- recurse to the next sibling property i, b or u -->
            <xsl:when test="$next-inline">
               <xsl:apply-templates select="$next-inline"/>
            </xsl:when>
            <xsl:otherwise>
               <!-- no further collapsible property: escape to the text -->
               <xsl:apply-templates select="parent::w:rPr/
                  following-sibling::w:t"/>
            </xsl:otherwise>
         </xsl:choose>
      </xsl:element>
   </xsl:template>

   <!-- Run without a w:rPr child: no wrapper element, just its text. -->
   <xsl:template match="w:r[not(w:rPr)]">
      <xsl:apply-templates select="child::w:t"/>
   </xsl:template>

   <!-- Text element: output its string value. -->
   <xsl:template match="w:t">
      <xsl:value-of select="self::node()"/>
   </xsl:template>

</xsl:stylesheet>

When tested against the sample input you gave, the above stylesheet yields

<html>
   <body>
      <p><b>This is a bold <i>with a bit of italic</i> paragraph</b> with some non-bold in it too.
      </p>
   </body>
</html>

which looks like what you wanted.



标签: xslt