Parse Nested XML (with namespaces) in R

2019-08-02 11:02发布

问题:

I am trying to parse an xml response from a web API.

For a simple XML document such as the one below, I am able to use xpathSApply and get the relevant data out very easily.

The following is example.xml:

<?xml version="1.0" encoding="UTF-8"?>
<CATALOG>
    <PLANT>
        <COMMON>Bloodroot</COMMON>
        <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
        <ZONE>4</ZONE>
        <LIGHT>Mostly Shady</LIGHT>
        <PRICE>$2.44</PRICE>
        <AVAILABILITY>031599</AVAILABILITY>
    </PLANT>
    <PLANT>
        <COMMON>Columbine</COMMON>
        <BOTANICAL>Aquilegia canadensis</BOTANICAL>
        <ZONE>3</ZONE>
        <LIGHT>Mostly Shady</LIGHT>
        <PRICE>$9.37</PRICE>
        <AVAILABILITY>030699</AVAILABILITY>
    </PLANT>
</CATALOG>

>library(XML)
>doc<-xmlTreeParse("example.xml",useInternal=TRUE) 
>rootNode<-xmlRoot(doc)
>xpathSApply(rootNode,"//COMMON",xmlValue)
[1] "Bloodroot" "Columbine"

> getNodeSet(doc,"//PLANT")
[[1]]
<PLANT>
  <COMMON>Bloodroot</COMMON>
  <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
  <ZONE>4</ZONE>
  <LIGHT>Mostly Shady</LIGHT>
  <PRICE>$2.44</PRICE>
  <AVAILABILITY>031599</AVAILABILITY>
</PLANT> 

[[2]]
<PLANT>
  <COMMON>Columbine</COMMON>
  <BOTANICAL>Aquilegia canadensis</BOTANICAL>
  <ZONE>3</ZONE>
  <LIGHT>Mostly Shady</LIGHT>
  <PRICE>$9.37</PRICE>
  <AVAILABILITY>030699</AVAILABILITY>
</PLANT> 

attr(,"class")
[1] "XMLNodeSet"

> xmlSApply(getNodeSet(rootNode,"//PRICE"),xmlValue) #provides a list of all PRICE values in the xml
[1] "$2.44" "$9.37"

However, the same commands do not work for the XML below, which contains namespace declarations. Is there any way that I can fetch the data in the nodes / tags?

The following is example1.xml:

<?xml version="1.0" encoding="UTF-8"?>
<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/" xmlns:u="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd"><s:Body><GetByFilterTradeResponse xmlns="http://entrader.contigoenergy.com/Contigo.Entrader.Service"><GetByFilterTradeResult xmlns:i="http://www.w3.org/2001/XMLSchema-instance">
<CATALOG>
    <CATEGORY>
        <FAMILY>
            <PLANT>
                <COMMON>Bloodroot</COMMON>
                <BOTANICAL>Sanguinaria canadensis</BOTANICAL>
                <ZONE>4</ZONE>
                <DETAILS>
                    <PRICEINBULK>2.3</PRICEINBULK>
                    <MINVOLUME>100</MINVOLUME>
                </DETAILS>
                <LIGHT>Mostly Shady</LIGHT>
                <PRICE>$2.44</PRICE>
                <AVAILABILITY>031599</AVAILABILITY>
            </PLANT>
            <PLANT>
                <COMMON>Columbine</COMMON>
                <BOTANICAL>Aquilegia canadensis</BOTANICAL>
                <ZONE>3</ZONE>
                <DETAILS>
                    <PRICEINBULK>9.00</PRICEINBULK>
                    <MINVOLUME>100</MINVOLUME>
                </DETAILS>
                <LIGHT>Mostly Shady</LIGHT>
                <PRICE>$9.37</PRICE>
                <AVAILABILITY>030699</AVAILABILITY>
            </PLANT>
        </FAMILY>
    </CATEGORY> 
</CATALOG>
</GetByFilterTradeResult></GetByFilterTradeResponse></s:Body></s:Envelope>

The following commands do not extract the node values from the above XML:

>doc<-xmlTreeParse("example1.xml",useInternal=TRUE) 
>rootNode<-xmlRoot(doc) 
> xpathSApply(rootNode,"//COMMON",xmlValue) 
list()

> getNodeSet(doc,"//PLANT")
list()
attr(,"class")
[1] "XMLNodeSet"

> xmlSApply(getNodeSet(rootNode,"//PRICE"),xmlValue) 
list()

回答1:

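Use name() or local-name() in your XPath. The GetByFilterTradeResponse element declares a default namespace (xmlns="http://entrader.contigoenergy.com/Contigo.Entrader.Service"), which all of its descendants inherit; an unprefixed XPath name test such as //COMMON only matches elements in no namespace, so it returns nothing. Matching on the element name inside a predicate sidesteps the namespace: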

library(XML)

appText <- '<?xml version="1.0" encoding="UTF-8"?>
<s:Envelope xmlns:s="http://schemas.xmlsoap.org/soap/envelope/" xmlns:u="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd">
<s:Body><GetByFilterTradeResponse xmlns="http://entrader.contigoenergy.com/Contigo.Entrader.Service">
<GetByFilterTradeResult xmlns:i="http://www.w3.org/2001/XMLSchema-instance">
<CATALOG>
<CATEGORY>
<FAMILY>
<PLANT>
<COMMON>Bloodroot</COMMON>
<BOTANICAL>Sanguinaria canadensis</BOTANICAL>
<ZONE>4</ZONE>
<DETAILS>
<PRICEINBULK>2.3</PRICEINBULK>
<MINVOLUME>100</MINVOLUME>
</DETAILS>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$2.44</PRICE>
<AVAILABILITY>031599</AVAILABILITY>
</PLANT>
<PLANT>
<COMMON>Columbine</COMMON>
<BOTANICAL>Aquilegia canadensis</BOTANICAL>
<ZONE>3</ZONE>
<DETAILS>
<PRICEINBULK>9.00</PRICEINBULK>
<MINVOLUME>100</MINVOLUME>
</DETAILS>
<LIGHT>Mostly Shady</LIGHT>
<PRICE>$9.37</PRICE>
<AVAILABILITY>030699</AVAILABILITY>
</PLANT>
</FAMILY>
</CATEGORY> 
</CATALOG>
</GetByFilterTradeResult></GetByFilterTradeResponse></s:Body></s:Envelope>'
doc <- xmlParse(appText)
> xpathSApply(doc,"//*[name()='COMMON']", xmlValue)
[1] "Bloodroot" "Columbine"

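Alternatively, explicitly define the namespaces and bind the default namespace to a prefix of your choosing (here n) so that it can be used in the XPath: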

> xpathSApply(doc,"//n:COMMON",xmlValue, namespaces = 
+                 c(s = "http://schemas.xmlsoap.org/soap/envelope/", 
+                   n = "http://entrader.contigoenergy.com/Contigo.Entrader.Service", 
+                   i = "http://www.w3.org/2001/XMLSchema-instance")) 
[1] "Bloodroot" "Columbine"

Or use the xml2 package, which automatically assigns the prefix d1 to the default namespace:

library(xml2)
doc <- read_xml(appText)
# check namespaces
> xml_ns(doc)
d1 <-> http://entrader.contigoenergy.com/Contigo.Entrader.Service
i  <-> http://www.w3.org/2001/XMLSchema-instance
s  <-> http://schemas.xmlsoap.org/soap/envelope/
u  <-> http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd

> xml_text(xml_find_all(doc, "//d1:COMMON"))
[1] "Bloodroot" "Columbine"