Parsing RDF recursive with Jena

2019-07-13 12:35发布

I'm trying to parse a RDF document recursive using Apache Jena. It consists out of datasets like this:

<dcat:dataset>
    <dcat:Dataset rdf:about="http://url/" > 
        <dct:description xml:lang="ca">Description</dct:description>
        <dct:license rdf:resource="http://creativecommons.org/licenses/by/3.0/"/>
        <dcat:keyword xml:lang="ca">Keyword1</dcat:keyword>
        <dcat:distribution>
            <dcat:Download>
                <dcat:accessURL>http:/url/</dcat:accessURL>
                <dct:format>
                    <dct:IMT>
                        <rdf:value>application/pdf</rdf:value>
                        <rdfs:label>pdf</rdfs:label>
                    </dct:IMT>
                </dct:format>
                <dct:modified rdf:datatype="http://www.w3.or/2001/XMLSchema#date">2012-11-09T16:23:22</dct:modified>
           </dcat:Download>
        </dcat:distribution>
        <dct:publisher>
           <foaf:Organization>
              <dct:title xml:lang="en">Company</dct:title>
              <foaf:homepage rdf:resource="http://url/"/>
           </foaf:Organization>
        </dct:publisher>
    </dcat:Dataset>
</dcat:dataset>

I'm so far to get every statement, which is directly beneath dcat:Dataset (Iterate over specific resource in RDF file with Jena), but I want to find every triple in every level. My output should look like this:

description: Description
license: http://creativecommons.org/licenses/by/3.0/
keyword: Keyword1
distribution -> Download -> accessurl: http:/url/
distribution -> Download -> format -> IMT -> value: application/pdf
distribution -> Download -> format -> IMT -> label: pdf
...

I've tried it with a recursive function, which iterates over the statements and when a statement is not a literal it follows the object to the next node. Like this:

private String recursiveQuery(Statement stmt) {
    Resource subject = stmt.getSubject();
    Property predicate = stmt.getPredicate();
    RDFNode object = stmt.getObject();

    if(object.isLiteral()) {
        out.println("LIT: " + predicate.getLocalName());
        return object.toString();

    } else {
        out.println(predicate.getLocalName());
        Resource r = stmt.getResource();
        StmtIterator stmts = r.listProperties();
        while (stmts.hasNext()) {
            Statement s = stmts.next();
            out.println(s.getPredicate().getLocalName());
            return recursiveQuery(s);
        }
    }
    return null;

}

But somehow I'm getting nowhere with this method. Thank you very much for every insight.

标签: rdf jena
1条回答
叛逆
2楼-- · 2019-07-13 12:59

Based on the earlier question that you linked to, I completed your data so that we have some working data to use. Here is the completed data:

<rdf:RDF
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:dcat="http://www.w3.org/ns/dcat#"
    xmlns:skos="http://www.w3.org/2004/02/skos/core#"
    xmlns:foaf="http://xmlns.com/foaf/0.1/"
    xmlns:owl="http://www.w3.org/2002/07/owl#"
    xmlns:dct="http://purl.org/dc/terms/"
    xmlns:dctypes="http://purl.org/dc/dcmitype/"
    xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
  <dcat:Catalog rdf:about="http://uri/">
    <dcat:dataset>
    <dcat:Dataset rdf:about="http://url/" > 
        <dct:description xml:lang="ca">Description</dct:description>
        <dct:license rdf:resource="http://creativecommons.org/licenses/by/3.0/"/>
        <dcat:keyword xml:lang="ca">Keyword1</dcat:keyword>
        <dcat:distribution>
            <dcat:Download>
                <dcat:accessURL>http:/url/</dcat:accessURL>
                <dct:format>
                    <dct:IMT>
                        <rdf:value>application/pdf</rdf:value>
                        <rdfs:label>pdf</rdfs:label>
                    </dct:IMT>
                </dct:format>
                <dct:modified rdf:datatype="http://www.w3.or/2001/XMLSchema#date">2012-11-09T16:23:22</dct:modified>
           </dcat:Download>
        </dcat:distribution>
        <dct:publisher>
           <foaf:Organization>
              <dct:title xml:lang="en">Company</dct:title>
              <foaf:homepage rdf:resource="http://url/"/>
           </foaf:Organization>
        </dct:publisher>
    </dcat:Dataset>
    </dcat:dataset>
  </dcat:Catalog>
 </rdf:RDF>

It sounds like you are just trying to do a depth first search on each element of type dcat:Dataset. That's easy enough to do. We just select each element of type dcat:Dataset and then start a depth first search from that RDFNode.

import java.util.HashSet;
import java.util.Set;

import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.Statement;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.vocabulary.RDF;


public class DFSinRDFwithJena {
    public static void main(String[] args) {
        Model model = ModelFactory.createDefaultModel();
        model.read( "rdfdfs.rdf" );

        StmtIterator stmts = model.listStatements( null, RDF.type, model.getResource( "http://www.w3.org/ns/dcat#" + "Dataset" ));
        while ( stmts.hasNext() ) {
            rdfDFS( stmts.next().getSubject(), new HashSet<RDFNode>(), "" );
        }
        model.write( System.out, "N3" );
    }

    public static void rdfDFS( RDFNode node, Set<RDFNode> visited, String prefix ) {
        if ( visited.contains( node )) {
            return;
        }
        else {
            visited.add( node );
            System.out.println( prefix + node );
            if ( node.isResource() ) {
                StmtIterator stmts = node.asResource().listProperties();
                while ( stmts.hasNext() ) {
                    Statement stmt = stmts.next();
                    rdfDFS( stmt.getObject(), visited, prefix + node + " =[" + stmt.getPredicate() + "]=> " );
                }
            }
        }
    }
}

This produces the output:

http://url/
http://url/ =[http://purl.org/dc/terms/publisher]=> -f6d9b42:13f2e8dc5fb:-7ffd
http://url/ =[http://purl.org/dc/terms/publisher]=> -f6d9b42:13f2e8dc5fb:-7ffd =[http://purl.org/dc/terms/title]=> Company@en
http://url/ =[http://purl.org/dc/terms/publisher]=> -f6d9b42:13f2e8dc5fb:-7ffd =[http://www.w3.org/1999/02/22-rdf-syntax-ns#type]=> http://xmlns.com/foaf/0.1/Organization
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://purl.org/dc/terms/modified]=> 2012-11-09T16:23:22^^http://www.w3.or/2001/XMLSchema#date
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://purl.org/dc/terms/format]=> -f6d9b42:13f2e8dc5fb:-7ffe
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://purl.org/dc/terms/format]=> -f6d9b42:13f2e8dc5fb:-7ffe =[http://www.w3.org/2000/01/rdf-schema#label]=> pdf
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://purl.org/dc/terms/format]=> -f6d9b42:13f2e8dc5fb:-7ffe =[http://www.w3.org/1999/02/22-rdf-syntax-ns#value]=> application/pdf
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://purl.org/dc/terms/format]=> -f6d9b42:13f2e8dc5fb:-7ffe =[http://www.w3.org/1999/02/22-rdf-syntax-ns#type]=> http://purl.org/dc/terms/IMT
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://www.w3.org/ns/dcat#accessURL]=> http:/url/
http://url/ =[http://www.w3.org/ns/dcat#distribution]=> -f6d9b42:13f2e8dc5fb:-7fff =[http://www.w3.org/1999/02/22-rdf-syntax-ns#type]=> http://www.w3.org/ns/dcat#Download
http://url/ =[http://www.w3.org/ns/dcat#keyword]=> Keyword1@ca
http://url/ =[http://purl.org/dc/terms/license]=> http://creativecommons.org/licenses/by/3.0/
http://url/ =[http://purl.org/dc/terms/description]=> Description@ca
http://url/ =[http://www.w3.org/1999/02/22-rdf-syntax-ns#type]=> http://www.w3.org/ns/dcat#Dataset

which is less pretty than the output you described, but seems to be what you want.

Note on RDF as a Graph Representation

The question used the notation “every statement, which is directly beneath dcat:Dataset,” and I think that it is worth pointing out, just in case there is any confusion, that RDF is a graph-based representation. It is true that the RDF/XML serialization can be used to provide some nicely structured XML that is human readable, but there is nothing that requires that that XML representation has that sort of structure. To see this difference, note that the following RDF/XML represents the same graph as the one posted earlier in this answer.

<rdf:RDF
    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns:dcat="http://www.w3.org/ns/dcat#"
    xmlns:skos="http://www.w3.org/2004/02/skos/core#"
    xmlns:foaf="http://xmlns.com/foaf/0.1/"
    xmlns:owl="http://www.w3.org/2002/07/owl#"
    xmlns:dct="http://purl.org/dc/terms/"
    xmlns:dctypes="http://purl.org/dc/dcmitype/"
    xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" > 
  <rdf:Description rdf:nodeID="A0">
    <dct:modified rdf:datatype="http://www.w3.or/2001/XMLSchema#date">2012-11-09T16:23:22</dct:modified>
    <dct:format rdf:nodeID="A1"/>
    <dcat:accessURL>http:/url/</dcat:accessURL>
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Download"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://uri/">
    <dcat:dataset rdf:resource="http://url/"/>
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Catalog"/>
  </rdf:Description>
  <rdf:Description rdf:about="http://url/">
    <dct:publisher rdf:nodeID="A2"/>
    <dcat:distribution rdf:nodeID="A0"/>
    <dcat:keyword xml:lang="ca">Keyword1</dcat:keyword>
    <dct:license rdf:resource="http://creativecommons.org/licenses/by/3.0/"/>
    <dct:description xml:lang="ca">Description</dct:description>
    <rdf:type rdf:resource="http://www.w3.org/ns/dcat#Dataset"/>
  </rdf:Description>
  <rdf:Description rdf:nodeID="A2">
    <foaf:homepage rdf:resource="http://url/"/>
    <dct:title xml:lang="en">Company</dct:title>
    <rdf:type rdf:resource="http://xmlns.com/foaf/0.1/Organization"/>
  </rdf:Description>
  <rdf:Description rdf:nodeID="A1">
    <rdfs:label>pdf</rdfs:label>
    <rdf:value>application/pdf</rdf:value>
    <rdf:type rdf:resource="http://purl.org/dc/terms/IMT"/>
  </rdf:Description>
</rdf:RDF>

The RDF graph is exactly the same, even though the XML structure is very different. I only bring this up to highlight the fact that it really is important to work with RDF as a graph, not as hierarchical XML, even if a particular serialization might suggest that we could work with the latter.

查看更多
登录 后发表回答