Sparql query getting duplicate or not I dont under

2019-06-03 20:16发布

问题:

I am using this query to get all programming languages and their details. This is my test class. I have used it in java and it works fine. The problem I am facing is, there is a language named "ML (programming language)"

It prints multiple times with different abstract, different influenced. Not only ML but also some other languages doing this. I don't know is there any problem in my query or its getting exact data as it is.

package io.naztech.dbpedia;

import java.io.ByteArrayOutputStream;
import java.util.List;

import org.apache.jena.query.ResultSet;
import org.apache.jena.query.ResultSetFormatter;
import org.apache.jena.sparql.engine.http.QueryEngineHTTP;
import org.junit.BeforeClass;
import org.junit.Test;

import io.naztech.talent.model.PediaTag;

public class testDataFetching {

    @Test
    public void testAllDataFetching() {

        String q =  "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> \n"+
                    "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n"+
                    "PREFIX dbo: <http://dbpedia.org/ontology/> \n"+
                    "PREFIX dbp: <http://dbpedia.org/property/> \n"+
                    "PREFIX owl: <http://www.w3.org/2002/07/owl#> \n"+
                    "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> \n" +
                    "PREFIX foaf: <http://xmlns.com/foaf/0.1/> \n" +
                    "PREFIX dc: <http://purl.org/dc/elements/1.1/> \n" +
                    "PREFIX : <http://dbpedia.org/resource/> \n" +
                    "PREFIX dbpedia2: <http://dbpedia.org/property/> \n" +
                    "PREFIX dbpedia: <http://dbpedia.org/> \n" +
                    "PREFIX skos: <http://www.w3.org/2004/02/skos/core#> \n" +

                    "SELECT DISTINCT ?pl ?pl_label ?abstract ?_thumbnail \n" +
                    "( Group_concat ( DISTINCT ?_influenced_label; separator= \", \")   AS ?influenced ) \n" + 
                    "( Group_concat ( DISTINCT ?_influencedBy_label; separator= \", \") AS ?influencedBy ) \n" + 
                    "( group_concat ( DISTINCT ?_sameAs; separator=\", \" ) AS ?sameAs ) \n" +
                    "( group_concat ( DISTINCT ?_paradigm_label; separator=\", \" ) AS ?paradigm ) \n" +

                    "WHERE  {\n" +

                    "       ?pl rdf:type dbo:ProgrammingLanguage .\n" + 

                    "       OPTIONAL { ?pl dbo:abstract ?abstract .\n" + 

                    "       FILTER ( LANG ( ?abstract ) = 'en' ) . } \n" + 

                    "       ?pl rdfs:label ?pl_label .\n" + 

                    "       FILTER ( LANG ( ?pl_label ) = 'en' ) .\n" + 

                    "       OPTIONAL { ?pl dbo:influenced ?_influenced . \n" + 

                    "       ?_influenced rdfs:label ?_influenced_label . \n" + 

                    "       FILTER ( LANG ( ?_influenced_label ) = 'en' ) . } \n" + 

                    "       OPTIONAL { ?pl dbo:influencedBy  ?_influencedBy . \n" + 

                    "       ?_influencedBy  rdfs:label ?_influencedBy_label . \n" + 

                    "       FILTER ( LANG ( ?_influencedBy_label ) = 'en' ) . } \n" +

                    "       OPTIONAL { ?pl owl:sameAs ?_sameAs . } \n" +

                    "       OPTIONAL { ?pl dbp:paradigm ?_paradigm . \n" +

                    "       ?_paradigm rdfs:label ?_paradigm_label . } \n" + 

                    "       OPTIONAL { ?pl dbo:thumbnail ?_thumbnail . } \n" +

                    "       }"+

                    "       GROUP BY ?pl ?pl_label ?abstract ?_thumbnail ?influenced ?influencedBy ?sameAs ?paradigm";

        @SuppressWarnings("resource")
        QueryEngineHTTP queryEngine = new QueryEngineHTTP("http://live.dbpedia.org/sparql", q);
        ResultSet results = queryEngine.execSelect();

        int count = 0;

        while (results.hasNext()) 
        {
            QuerySolution qs =  results.next();
            System.out.println("NAME-->\n"+qs.get("pl_label").toString()+"\n");

            if(qs.get("influenced") != null)
            {
            System.out.println("INFLUENCED-->\n"+qs.get("influenced").toString()+"\n"); 
            }
           if(qs.get("influencedBy") != null)
            {
                System.out.println("INFLUENCED BY-->\n"+qs.get("influencedBy").toString()+"\n"); 
            }
           if(qs.get("abstract") != null)
            {
                System.out.println("ABSTRACT-->\n"+qs.get("abstract").toString()+"\n");  
            }

            if(qs.get("sameAs") != null)
            {
                System.out.println("SAME AS-->\n"+qs.get("sameAs").toString()+"\n");  
            }

            if(qs.get("paradigm") != null)
            {
            System.out.println("PARADIGM-->\n"+qs.get("paradigm").toString()+"\n");  
            }

            if(qs.get("_thumbnail") != null)
            {
                System.out.println("THUMBNAIL-->\n"+qs.get("_thumbnail").toString()+"\n");  
            }

            System.out.println("\n");

            count++;
        }

        System.out.println(count);



    }

}

回答1:

There are 3 English abstracts in the dataset, look at the DBpedia Live resource.

You can workaround this by removing the ?abstract variable from the group by ... part and instead using an aggregate function (sample, min, max) to get any of the abstracts:

SELECT  ?pl ?pl_label 
        (MIN(?_abstract) AS ?abstract) # <- used MIN here to ensure stable result
        ?_thumbnail 
        (GROUP_CONCAT(DISTINCT ?_influenced_label ; separator='; ') AS ?influenced) 
        (GROUP_CONCAT(DISTINCT ?_influencedBy_label ; separator='; ') AS ?influencedBy) 
        (GROUP_CONCAT(DISTINCT ?_sameAs ; separator=', ') AS ?sameAs) 
        (GROUP_CONCAT(DISTINCT ?_paradigm_label ; separator=', ') AS ?paradigm)
WHERE
  { ?pl  a  dbo:ProgrammingLanguage ;
         rdfs:label  ?pl_label
    FILTER ( lang(?pl_label) = "en" )

    OPTIONAL
      { ?pl  dbo:abstract  ?_abstract
        FILTER ( lang(?_abstract) = "en" )
      }
    OPTIONAL
      { ?pl       dbo:influenced/rdfs:label  ?_influenced_label
        FILTER ( lang(?_influenced_label) = "en" )
      }
    OPTIONAL
      { ?pl       dbo:influencedBy/rdfs:label  ?_influencedBy_label
        FILTER ( lang(?_influencedBy_label) = "en" )
      }
    OPTIONAL
      { ?pl  owl:sameAs  ?_sameAs }
    OPTIONAL
      { ?pl       dbp:paradigm/rdfs:label  ?_paradigm_label
        FILTER ( lang(?_paradigm_label) = "en" )
      }
    OPTIONAL
      { ?pl  dbo:thumbnail  ?_thumbnail }
  }
GROUP BY ?pl ?pl_label ?_thumbnail

Update

I'll add the comment from @TallTed here, he's one of the people behind Virtuoso and knows it better than me:

Be aware that while the recommended aggregate functions (MIN, MAX, SAMPLE) will get a value of the predicate, there is no assurance that this value will be the latest ingested to the dataset.



标签: java sparql