XSLT: Transform nodes by comparing their child nodes

2019-09-11 12:32发布

问题:

To start off, I'm aware of this SO question which is a bit different.

I have an XML file which looks like this:

<?xml version="1.0" encoding="UTF-8"?>
<Document xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03">
    <CstmrCdtTrfInitn>
        <GrpHdr>
            <MsgId>123</MsgId>
            <CreDtTm>321</CreDtTm>
            <NbOfTxs>10</NbOfTxs>
            <CtrlSum>18700.68</CtrlSum>
            <InitgPty>
                <Nm>some info</Nm>
            </InitgPty>
        </GrpHdr>

        <PmtInf>
            <!-- start -->
            <PmtInfId>asd</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>NORM</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>
        </PmtInf>

        <PmtInf>
            <!-- start -->
            <PmtInfId>asd</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>NORM</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>
        </PmtInf>

        <PmtInf>
            <!-- start -->
            <PmtInfId>qwe</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>HIGH</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>
        </PmtInf>

        <PmtInf>
            <!-- start -->
            <PmtInfId>asd</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>NORM</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>
        </PmtInf>
    </CstmrCdtTrfInitn>
</Document>
  • as you can see, I have multiple (4) <PmtInf></PmtInf> sections which have almost the same structure.
  • what I'd like to do, is:

    1. compare <PmtInfId>asd</PmtInfId> from the first PmtInf with <PmtInfId>asd</PmtInfId> from the second PmtInf. If there's a perfect match (i.e. the same tag and the same text), move on to the next elements and compare them (<PmtMtd>TRF</PmtMtd> from the first PmtInf with <PmtMtd>TRF</PmtMtd> from the second PmtInf, and so on). As long as every pair matches perfectly, keep going until we reach the <CdtTrfTxInf> tag.
    2. once we reach <CdtTrfTxInf>, it means the first part of the first PmtInf is the same as the first part of the second PmtInf. At this point, move the <CdtTrfTxInf></CdtTrfTxInf> section from the second PmtInf right after the <CdtTrfTxInf></CdtTrfTxInf> section of the first PmtInf. Then, remove the second PmtInf section.

So, at this moment, the xml would look like this:

<?xml version="1.0" encoding="UTF-8"?>
<Document xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03">
    <CstmrCdtTrfInitn>
        <GrpHdr>
            <MsgId>123</MsgId>
            <CreDtTm>321</CreDtTm>
            <NbOfTxs>10</NbOfTxs>
            <CtrlSum>18700.68</CtrlSum>
            <InitgPty>
                <Nm>some info</Nm>
            </InitgPty>
        </GrpHdr>

        <PmtInf>
            <!-- start -->
            <PmtInfId>asd</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>NORM</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>


        </PmtInf>

        <PmtInf>
            <!-- start -->
            <PmtInfId>qwe</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>HIGH</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>
        </PmtInf>

        <PmtInf>
            <!-- start -->
            <PmtInfId>asd</PmtInfId>
            <PmtMtd>TRF</PmtMtd>
            <BtchBookg>false</BtchBookg>
            <PmtTpInf>
                <InstrPrty>NORM</InstrPrty>
                <SvcLvl>
                    <Prtry>test</Prtry>
                </SvcLvl>
            </PmtTpInf>
            <ReqdExctnDt>date</ReqdExctnDt>
            <Dbtr>
                <Nm>something</Nm>
                <PstlAdr>
                    <AdrLine>addr 1</AdrLine>
                </PstlAdr>
            </Dbtr>
            <!-- end -->

            <CdtTrfTxInf>
                <PmtId>
                    <InstrId>16082672122</InstrId>
                    <EndToEndId>16082672122</EndToEndId>
                </PmtId>
                <Amt>
                    <InstdAmt Ccy="RON">2159.41</InstdAmt>
                </Amt>
                <CdtrAgt>
                    <FinInstnId>
                        <BIC>some bic</BIC>
                    </FinInstnId>
                </CdtrAgt>
            </CdtTrfTxInf>
        </PmtInf>
    </CstmrCdtTrfInitn>
</Document>
  1. now repeat the process with the first PmtInf section and the third one, and then with the fourth one. If all of them match, we should end up with only one PmtInf tag with 4 CdtTrfTxInf tags inside it.
  2. if, at some point, there's a mismatch (say, when comparing <InstrPrty>NORM</InstrPrty> from the first PmtInf with <InstrPrty>HIGH</InstrPrty> from the third PmtInf), leave that PmtInf section as it is and go to the next one.
  3. after we have finished comparing the first PmtInf with all the PmtInfs that follow it, compare the second PmtInf with the third one and apply the same rules, then the third one with the fourth one... and so on.

Now I might be asking too much, but can this be done with XSLT? I know I haven't tried anything yet, but I've already spent too much time trying to achieve this with simple Python string manipulation, and it looks like the XSLT documentation requires some time to get used to the syntax.


I'm calling the script like this:

def parse_xml(file, output_path):
    """Apply the stylesheet in TEMPLATE_XSLT to ``file`` and save the result.

    The input is read with a UTF-8 parser created with ``recover=True``,
    transformed, and the transformation result is written to ``output_path``
    opened in binary mode.

    NOTE(review): ``ET`` is presumably ``lxml.etree`` (``XSLT`` and the
    ``recover`` option are lxml features) -- confirm against the imports.

    :param file: source XML, passed straight to ``ET.parse``.
    :param output_path: path of the file to (over)write with the result.
    """
    lenient_parser = ET.XMLParser(encoding='utf-8', recover=True)
    source_tree = ET.parse(file, parser=lenient_parser)

    # TEMPLATE_XSLT contains the transformation
    run_stylesheet = ET.XSLT(ET.fromstring(TEMPLATE_XSLT))
    result_tree = run_stylesheet(source_tree)

    with open(output_path, 'wb') as out_handle:
        out_handle.write(result_tree)

回答1:

I hesitate whether to post this at all, because you seem to be a beginner at XSLT and this is a complex solution to a complex problem. It may take you quite some time to wrap your head around this.

How this works:

In the first pass, we generate a pmt element for each PmtInf element in your input XML, and populate it with a string that contains a name/value pair for each node (element or attribute) descendant of the current PmtInf, skipping the CdtTrfTxInf subtrees (they are what gets merged, so they must not take part in the comparison). In the given example, each such pmt element would look similar to:

<pmt id="idp1696">[PmtInfId:asd][PmtMtd:TRF][BtchBookg:false][PmtTpInf:NORMtest][InstrPrty:NORM][SvcLvl:test][Prtry:test][ReqdExctnDt:date][Dbtr:somethingaddr 1][Nm:something][PstlAdr:addr 1][AdrLine:addr 1]</pmt>

In the next step we apply Muenchian grouping to the pmt nodes generated in the first pass. For each distinct pmt node, we create a PmtInf element and populate it with:

  1. the contents of the corresponding PmtInf element, except CdtTrfTxInf ;

  2. a copy of all CdtTrfTxInf elements from all the members of the group.

XSLT 1.0

<xsl:stylesheet version="1.0" 
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:ns0="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03"
xmlns="urn:iso:std:iso:20022:tech:xsd:pain.001.001.03"
xmlns:exsl="http://exslt.org/common"
exclude-result-prefixes="ns0"
extension-element-prefixes="exsl">
<!-- Serialise the result as indented UTF-8 XML. -->
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<!-- Drop whitespace-only text nodes from every element of the input. -->
<xsl:strip-space elements="*"/>

<!-- Groups the generated pmt elements by their text (the fingerprint string); used for Muenchian grouping. -->
<xsl:key name="pmt" match="ns0:pmt" use="." />
<!-- Looks up an input PmtInf element by its generate-id() value, which is what each pmt stores in @id. -->
<xsl:key name="PmtInf" match="ns0:PmtInf" use="generate-id()" />

<!-- Root of the input document, kept so a template can switch context back to it while iterating over generated nodes. -->
<xsl:variable name="input" select="/" />

<!-- Fallback rule: copy attributes, elements, text, comments and processing
     instructions as they are, recursing into their content, whenever no more
     specific template applies. -->
<xsl:template match="node() | @*">
    <xsl:copy>
        <xsl:apply-templates select="node() | @*"/>
    </xsl:copy>
</xsl:template>

<!--
    Rebuilds CstmrCdtTrfInitn with its PmtInf children merged.

    1. First pass: every PmtInf is turned into a pmt element (gen-key mode)
       whose text is a fingerprint of everything except its CdtTrfTxInf
       children and whose @id is the generate-id() of the source PmtInf.
    2. Output: GrpHdr is copied unchanged; then, for each distinct fingerprint,
       one PmtInf is written holding the non-CdtTrfTxInf content of the first
       PmtInf of the group followed by the CdtTrfTxInf elements of all
       PmtInf elements in that group.

    Attributes of CstmrCdtTrfInitn and of the group's first PmtInf are copied
    explicitly: neither xsl:copy nor a literal result element carries the
    source attributes over, and PmtInf attributes take part in the fingerprint.
-->
<xsl:template match="ns0:CstmrCdtTrfInitn">
    <!-- first pass -->
    <xsl:variable name="first-pass-rtf">
        <xsl:apply-templates select="ns0:PmtInf" mode="gen-key"/>
    </xsl:variable>
    <!-- an XSLT 1.0 result tree fragment cannot be navigated directly, so convert it to a node-set -->
    <xsl:variable name="first-pass" select="exsl:node-set($first-pass-rtf)" />
    <!-- output -->
    <xsl:copy>
        <!-- attributes must be added before any child element -->
        <xsl:copy-of select="@*"/>
        <xsl:copy-of select="ns0:GrpHdr"/>
        <!-- for each distinct pmt: keep only the first pmt of every group of equal fingerprints -->
        <xsl:for-each select="$first-pass/ns0:pmt[count(. | key('pmt', .)[1]) = 1]">
            <!-- id of the PmtInf that represents the group -->
            <xsl:variable name="id" select="@id" />
            <!-- ids of every PmtInf that shares this fingerprint -->
            <xsl:variable name="ids" select="key('pmt', .)/@id" />
            <PmtInf>
                <!-- switch context back to XML input, because key() searches the document of the context node -->
                <xsl:for-each select="$input">
                    <xsl:copy-of select="key('PmtInf', $id)/@*"/>
                    <xsl:copy-of select="key('PmtInf', $id)/*[not(self::ns0:CdtTrfTxInf)]"/>
                    <xsl:copy-of select="key('PmtInf', $ids)/ns0:CdtTrfTxInf"/>
                </xsl:for-each>
            </PmtInf>
        </xsl:for-each>
    </xsl:copy>
</xsl:template>

<!-- First pass, per PmtInf: emit a pmt element whose @id is the generate-id()
     of this PmtInf and whose content is produced by the gen-key templates
     applied to its attributes and child elements. -->
<xsl:template match="ns0:PmtInf" mode="gen-key">
    <pmt>
        <xsl:attribute name="id">
            <xsl:value-of select="generate-id()"/>
        </xsl:attribute>
        <xsl:apply-templates select="@* | *" mode="gen-key"/>
    </pmt>
</xsl:template>

<!-- First pass, per attribute or element: write "[name:string-value]" for the
     current node, then do the same for its attributes and child elements. -->
<xsl:template match="node() | @*" mode="gen-key">
    <xsl:value-of select="concat('[', name(), ':', ., ']')"/>
    <xsl:apply-templates select="@* | *" mode="gen-key"/>
</xsl:template>

<!-- CdtTrfTxInf subtrees produce nothing in gen-key mode, so they never
     influence the fingerprint used for grouping. -->
<xsl:template mode="gen-key" match="ns0:CdtTrfTxInf"/>

</xsl:stylesheet>