How to read the values of blocks of cells from an

2019-08-21 17:29发布

I want to get the values in the cells which are in an .ods file's /content.xml using an XSLT 2.0 transform. Regardless of blanks cells I want to get the values of column D for example.

It could be any column to iterate over.

The same thing for all the other columns and I want to put them into different nodes in XML output.

For example, this content.xml file:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<office:document-content
xmlns:table="urn:oasis:names:tc:opendocument:xmlns:table:1.0" 
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0" 
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0" xmlns:style="urn:oasis:names:tc:opendocument:xmlns:style:1.0" xmlns:draw="urn:oasis:names:tc:opendocument:xmlns:drawing:1.0" xmlns:fo="urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:number="urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0" xmlns:svg="urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0" xmlns:msoxl="http://schemas.microsoft.com/office/excel/formula">
<office:font-face-decls>
    <style:font-face style:name="Calibri" svg:font-family="Calibri"/>
</office:font-face-decls>
<office:automatic-styles>
    <style:style style:name="ce1" style:family="table-cell" style:parent-style-name="Default" style:data-style-name="N0"/>
    <style:style style:name="co1" style:family="table-column">
        <style:table-column-properties fo:break-before="auto" style:column-width="1.69333333333333cm"/>
    </style:style>
    <style:style style:name="ro1" style:family="table-row">
        <style:table-row-properties style:row-height="15pt" style:use-optimal-row-height="true" fo:break-before="auto"/>
    </style:style>
    <style:style style:name="ta1" style:family="table" style:master-page-name="mp1">
        <style:table-properties table:display="true" style:writing-mode="lr-tb"/>
    </style:style>
</office:automatic-styles>
<office:body>
    <office:spreadsheet>
        <table:calculation-settings table:case-sensitive="false" table:search-criteria-must-apply-to-whole-cell="false"/>
        <table:table table:name="Folha1" table:style-name="ta1">
            <table:table-column table:style-name="co1" table:number-columns-repeated="16384" table:default-cell-style-name="ce1"/>
            <table:table-row table:style-name="ro1">
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item1</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="2" table:style-name="ce1"/>
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item3</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="16380" table:style-name="ce1"/>
            </table:table-row>
            <table:table-row table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
            <table:table-row table:style-name="ro1">
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item2</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="16383" table:style-name="ce1"/>
            </table:table-row>
            <table:table-row table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
            <table:table-row table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="3" table:style-name="ce1"/>
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item4</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="3" table:style-name="ce1"/>
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item5</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="16376"/>
            </table:table-row>
            <table:table-row table:number-rows-repeated="5" table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
            <table:table-row table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="7" table:style-name="ce1"/>
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item5</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="16376"/>
            </table:table-row>
            <table:table-row table:number-rows-repeated="3" table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
            <table:table-row table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="7" table:style-name="ce1"/>
                <table:table-cell office:value-type="string" table:style-name="ce1">
                    <text:p>item6</text:p>
                </table:table-cell>
                <table:table-cell table:number-columns-repeated="16376"/>
            </table:table-row>
            <table:table-row table:number-rows-repeated="1048561" table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
        </table:table>
        <table:table table:name="Folha2" table:style-name="ta1">
            <table:table-column table:style-name="co1" table:number-columns-repeated="16384" table:default-cell-style-name="ce1"/>
            <table:table-row table:number-rows-repeated="1048576" table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
        </table:table>
        <table:table table:name="Folha3" table:style-name="ta1">
            <table:table-column table:style-name="co1" table:number-columns-repeated="16384" table:default-cell-style-name="ce1"/>
            <table:table-row table:number-rows-repeated="1048576" table:style-name="ro1">
                <table:table-cell table:number-columns-repeated="16384"/>
            </table:table-row>
        </table:table>
    </office:spreadsheet>
</office:body>
</office:document-content>


<?xml version="1.0" encoding="UTF-8"?>

Basic XSLT Example:

<xsl:template match="office:document-content/office:body/office:spreadsheet//table:table[@table:name='Folha1']">
    <xsl:variable name="description" select="table:table-row/table:table-cell[4]/text:p"/>
    <!--xsl:if test="$description != '.'"-->
    <Description>
        <xsl:value-of select="$description"/>
    </Description>
</xsl:template>

The output of this example gives me "item5", okay, but I really don't know the logic behind this - how it is counting etc.

As far as I know in XSLT empty cells are not really counted!
Am I right?
But somehow an .ods file does save these empty cells and the structure of the document as well so that it can retrieve it the next time you open the file.
But how?

So what I want to know is:
how do we really select the items with counting these "empty" cells as well and find the certain place of each item in the sheet.

The desired output would be like:

<group1>
    <Description>item</Description>
    <Description>item2</Description>
<group4>
    <Description>item3</Description>
    <Description>item4</Description>
<group8>
    <Description>item5</Description>
    <Description>item6</Description>
    <Description>item7</Description>

标签: xml xslt
1条回答
Juvenile、少年°
2楼-- · 2019-08-21 17:43

I extended my code for the solution to this problem. In this SO answer I explained a small part of how .ods files are structured and how to find a specific cell. The new code extends that to blocks of cells in the common spreadsheet notation like

A1:A5
B12:C14,E5:E7

and makes use of XSLT-2.0 features. I do not claim this to be the most elegant solution, but the tests so far look promising.

Here is the code to retrieve the content of a block of cells:

<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0" 
  xmlns:xsl   ="http://www.w3.org/1999/XSL/Transform" 
  xmlns:xs    ="http://www.w3.org/2001/XMLSchema" 
  xmlns:fn    ="http://www.w3.org/2005/xpath-functions"
  xmlns:udf   ="http://user.defined.functions"
  xmlns:table ="urn:oasis:names:tc:opendocument:xmlns:table:1.0" 
  xmlns:text  ="urn:oasis:names:tc:opendocument:xmlns:text:1.0" 
  xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"> 

    <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
    <!-- Name and location of the content.xml file from the ODS file -->
    <!-- Unzip ODS file to get content.xml -->
    <xsl:variable name="ODSpath" select="'content.xml'" />
    <xsl:variable name="doc" select="document($ODSpath)/office:document-content/office:body/office:spreadsheet/table:table" /> 

    <!-- Alphabet as a sequence -->
    <xsl:variable name="upperCaseLetters" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'" />
    <xsl:variable name="lowerCaseLetters" select="'abcdefghijklmnopqrstuvwxyz'" />
    <xsl:variable name="alphabet" select="'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z'" />

    <!-- Helper function: Gets the number of a letter of the alphabet: A=1,B=2,C=3... -->
    <xsl:function name="udf:letter-idx" as="xs:integer">
      <xsl:param    name="coord" as="xs:string"/>
      <xsl:variable name="cnt"   select="string-length($coord)" />
      <xsl:variable name="cur"   select="substring($coord,$cnt,1)" />
      <xsl:value-of select="if ($cnt = 1) then index-of($alphabet,$cur) else udf:letter-idx(substring($coord,1,$cnt - 1))*26 + index-of($alphabet,$cur)"/>
    </xsl:function>

    <!-- Convert coordinate string to separate squares in XML format -->
    <xsl:template name="ConvertCoordinates">
      <xsl:param name="cellStr" as="xs:string" />
      <xsl:analyze-string select="$cellStr" regex="([A-Z]+)([0-9]+):([A-Z]+)([0-9]+),?">
        <xsl:matching-substring>
          <xsl:element name="Item">
            <xsl:element name="FromX"><xsl:value-of select="udf:letter-idx(regex-group(1))" /></xsl:element>
            <xsl:element name="FromY"><xsl:value-of select="regex-group(2)" /></xsl:element>
            <xsl:element name="ToX"><xsl:value-of select="udf:letter-idx(regex-group(3))" /></xsl:element>
            <xsl:element name="ToY"><xsl:value-of select="regex-group(4)" /></xsl:element>
          </xsl:element>
        </xsl:matching-substring>
      </xsl:analyze-string>
    </xsl:template>

    <!-- Get values of every cell in a square -->
    <xsl:template name="VisitAll">    
        <xsl:param name="result" as="element()*" />
        <xsl:param name="xStart" as="xs:integer" />
        <xsl:param name="yStart" as="xs:integer" />
        <xsl:param name="xEnd"   as="xs:integer" />
        <xsl:param name="yEnd"   as="xs:integer" />
        <xsl:param name="curX"   as="xs:integer" select="$xStart" />
        <xsl:param name="curY"   as="xs:integer" select="$yStart" />
        <xsl:choose>
          <!-- Check if xEnd/yEnd has been reached -->
          <xsl:when test="$curY le $yEnd">
            <xsl:variable name="cellNode" as="element()">
              <xsl:call-template name="GetCellValue">
                <xsl:with-param name="x" select="$curX" />
                <xsl:with-param name="y" select="$curY" />
              </xsl:call-template>
            </xsl:variable>    
            <!-- Recurse to next cell x+1/y (next column) or x=xStart/y+1 (next row) -->
            <xsl:call-template name="VisitAll">
              <xsl:with-param name="result" select="if ($cellNode/text()!='') then ($result,$cellNode) else $result" />
              <xsl:with-param name="xStart" select="$xStart" />
              <xsl:with-param name="yStart" select="$yStart" />
              <xsl:with-param name="xEnd" select="$xEnd" />
              <xsl:with-param name="yEnd" select="$yEnd" />
              <xsl:with-param name="curX" select="if ($curX = $xEnd) then $xStart else $curX + 1" />
              <xsl:with-param name="curY" select="if ($curX = $xEnd) then $curY + 1 else $curY" />
            </xsl:call-template>
          </xsl:when>
          <xsl:otherwise>
            <!-- Recursion end - return result -->
            <xsl:sequence select="$result" />
          </xsl:otherwise>
        </xsl:choose>
    </xsl:template>

    <!-- Get the value of one cell at pos x/y -->
    <xsl:template name="GetCellValue">
      <xsl:param name="x" as="xs:integer" />
      <xsl:param name="y" as="xs:integer" />
      <xsl:variable name="targetRow" select="$doc/table:table-row[sum(preceding-sibling::*/@table:number-rows-repeated) + position() - count(preceding-sibling::*/@table:number-rows-repeated)= $y]" />
      <xsl:variable name="targetCell" select="$targetRow/table:table-cell[sum(preceding-sibling::*/@table:number-columns-repeated) + position() - count(preceding-sibling::*/@table:number-columns-repeated) le $x]" />
      <xsl:variable name="targetCellValue" select="$targetCell[last()]/text:p/text()" />
      <!-- create namespace-free element with result value -->
      <xsl:element name="Text">
        <xsl:value-of select="$targetCellValue" />
      </xsl:element>
    </xsl:template>

    <!-- Gets arbitrary number of cells specified in squares (typical for Excel/Office) --> 
    <!-- For example: A4:B7,E4:E7 -->
    <xsl:template name="GetMultipleCellsNodes">
      <xsl:param name="cells" />
      <!-- Convert cell coord string to coordinate element sets and convert lowercase to uppercase -->
      <xsl:variable name="squares">
        <xsl:call-template name="ConvertCoordinates">
          <xsl:with-param name="cellStr" select="translate($cells,$lowerCaseLetters,$upperCaseLetters)" />
        </xsl:call-template>       
      </xsl:variable>
      <!-- Call VisitAll on all squares in each coordinate element set and concatenate to sequence -->
      <xsl:variable name="itemSequence" as="element()*">
        <xsl:for-each select="$squares/Item">
          <!-- Visit every cell in each square -->
          <xsl:call-template name="VisitAll">
            <xsl:with-param name="result" select="()" />
            <xsl:with-param name="xStart" select="FromX" />
            <xsl:with-param name="yStart" select="FromY" />
            <xsl:with-param name="xEnd" select="ToX" />
            <xsl:with-param name="yEnd" select="ToY" />
          </xsl:call-template>
        </xsl:for-each>
      </xsl:variable>
      <xsl:sequence select="$itemSequence" />
    </xsl:template>

    <!-- Demo template -->
    <xsl:template match="/">
      <xsl:variable name="selectedElements">
        <xsl:call-template name="GetMultipleCellsNodes">
          <xsl:with-param name="cells" select="'A1:A20,D1:D20,H1:h20'" />
        </xsl:call-template>       
      </xsl:variable>
      <xsl:element name="Cells">
        <xsl:for-each select="$selectedElements">
          <xsl:copy-of select="." />
        </xsl:for-each>
      </xsl:element>
    </xsl:template>

</xsl:stylesheet>

The name of the content.xml file is specified in the ODSpath variable at the beginning. So the source file is not passed to the XSLT processor but rather via a variable containing its name. The desired cells/columns are specified with the string A1:A20,D1:D20,H1:h20 in the demo template at the bottom.

I tested this with your above content.xml which failed because LibreOffice said that the file is damaged(???). But (re)creating the file with the content from the image succeeded and resulted in the following output:

<?xml version="1.0" encoding="UTF-8"?>
<Cells>
   <Text>item1</Text>
   <Text>item2</Text>
   <Text>item3</Text>
   <Text>item4</Text>
   <Text>item5</Text>
   <Text>item6</Text>
   <Text>item7</Text>
</Cells>

To get the exact result you described above just split the three columns to separate calls to GetMultipleCellsNodes:

<!-- Demo template 2 -->
<xsl:template match="/">
  <xsl:variable name="group1">
    <xsl:call-template name="GetMultipleCellsNodes">
      <xsl:with-param name="cells" select="'A1:A20'" />
    </xsl:call-template>       
  </xsl:variable>
  <xsl:variable name="group4">
    <xsl:call-template name="GetMultipleCellsNodes">
      <xsl:with-param name="cells" select="'D1:D20'" />
    </xsl:call-template>       
  </xsl:variable>
  <xsl:variable name="group8">
    <xsl:call-template name="GetMultipleCellsNodes">
      <xsl:with-param name="cells" select="'H1:h20'" />
    </xsl:call-template>       
  </xsl:variable>
  <xsl:element name="group1">
    <xsl:for-each select="$group1/Text">
      <xsl:element name="Description"><xsl:value-of select="." /></xsl:element>
    </xsl:for-each>
  </xsl:element>
  <xsl:element name="group4">
    <xsl:for-each select="$group4/Text">
      <xsl:element name="Description"><xsl:value-of select="." /></xsl:element>
    </xsl:for-each>
  </xsl:element>
  <xsl:element name="group8">
    <xsl:for-each select="$group8/Text">
      <xsl:element name="Description"><xsl:value-of select="." /></xsl:element>
    </xsl:for-each>
  </xsl:element>
</xsl:template>    

The output of this demo template is:

<?xml version="1.0" encoding="UTF-8"?>
<group1>
   <Description>item1</Description>
   <Description>item2</Description>
</group1>
<group4>
   <Description>item3</Description>
   <Description>item4</Description>
</group4>
<group8>
   <Description>item5</Description>
   <Description>item6</Description>
   <Description>item7</Description>
</group8>
查看更多
登录 后发表回答