Extracting data from nested JSON document using tidyjson

2019-07-17 13:05发布

问题:

I am querying a research publication database. Articles have different numbers of authors (ranging from 1 to more than 20). My goal is to create an edge list of co-authors for social network analysis using igraph. Below is a snippet of the JSON:

{
"format": "linked-data-api",
"version": "0.2",
"result": {
"_about": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1",
"definition": "http://network.csiro.au:9500/standalone/meta/publications.json",
"extendedMetadataVersion": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1&_metadata=all",
"first": "http://network.csiro.au:9500/standalone/publications.json?_page=0",
"isPartOf": {
"_about": "http://network.csiro.au:9500/standalone/publications.json",
"definition": "http://network.csiro.au:9500/standalone/meta/publications.json",
"hasPart": "http://network.csiro.au:9500/standalone/publications.json?_pageSize=5&_page=1",
"type": [
"http://purl.org/linked-data/api/vocab#ListEndpoint"
]
},
"items": [
{
"_about": "http://network.csiro.au/data/pub_EP1312922",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/aimee.slangen",
"hasName": {
"_about": "http://network.csiro.au/data/aimee.slangen_name",
"firstName": "Aimee",
"lastName": "Slangen",
"title": "Ms"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP1312922_author_0",
"author": {
"_about": "http://network.csiro.au/data/aimee.slangen",
"hasName": {
"_about": "http://network.csiro.au/data/aimee.slangen_name",
"firstName": "Aimee",
"lastName": "Slangen",
"title": "Ms"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_040104",
"name": "Climate Change Processes"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"journalTitle": "Journal of Geophysical Research-Oceans",
"keyword": " ",
"outcome": "Approved",
"pages": "156-164",
"project": "http://network.csiro.au/data/project_PD00003609",
"publicationVolume": "119",
"publishedDate": "9-Jan-2014",
"publisher": "American Geophysical Union",
"title": "Regional Differences of Relative Sea Level Changes in the Northwest Atlantic: Historical Trends and Future Projections",
"wbscode": "R-03426-01-003",
"yearOfPublication": "2014"
},
{
"_about": "http://network.csiro.au/data/pub_EP112347",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/roland.pitcher",
"hasName": {
"_about": "http://network.csiro.au/data/roland.pitcher_name",
"firstName": "Roland",
"lastName": "Pitcher",
"title": "Dr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP112347_author_0",
"author": {
"_about": "http://network.csiro.au/data/roland.pitcher",
"hasName": {
"_about": "http://network.csiro.au/data/roland.pitcher_name",
"firstName": "Roland",
"lastName": "Pitcher",
"title": "Dr"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_050209",
"name": "Natural Resource Management"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"keyword": " ",
"outcome": "Approved",
"project": "http://network.csiro.au/data/project_PD00000752",
"publisher": "Queensland Department of Environment and Resource Management",
"title": "Understanding and Managing the Effects of Trawling on the Seabed in the Great Barrier Reef",
"wbscode": "R-00654-03-003",
"yearOfPublication": " "
},
{
"_about": "http://network.csiro.au/data/pub_EP148991",
"access": "CSIRO Only",
"author": {
"_about": "http://network.csiro.au/data/rob.bramley",
"hasName": {
"_about": "http://network.csiro.au/data/rob.bramley_name",
"firstName": "Rob",
"lastName": "Bramley",
"title": "Dr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP148991_author_0",
"author": {
"_about": "http://network.csiro.au/data/rob.bramley",
"hasName": {
"_about": "http://network.csiro.au/data/rob.bramley_name",
"firstName": "Rob",
"lastName": "Bramley",
"title": "Dr"
}
},
"sequenceNumber": 0
},
"classification": {
"_about": "http://network.csiro.au/data/classification_code_070107",
"name": "Farming Systems Research"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"keyword": " ",
"outcome": "Approved",
"pages": "26 + appendices",
"project": "http://network.csiro.au/data/project_PD00002886",
"publishedDate": "17-Sep-2014",
"publisher": "SRA",
"title": "A collaborative approach to Precision Agriculture RDE for the Australian Sugar Industry",
"wbscode": "R-02709-01",
"yearOfPublication": "2014"
},
{
"_about": "http://network.csiro.au/data/pub_EP151976",
"access": "Public",
"author": {
"_about": "http://network.csiro.au/data/paul.krummel",
"hasName": {
"_about": "http://network.csiro.au/data/paul.krummel_name",
"firstName": "Paul",
"lastName": "Krummel",
"title": "Mr"
}
},
"authorSeq": {
"_about": "http://network.csiro.au/data/pub_EP151976_author_0",
"author": {
"_about": "http://network.csiro.au/data/paul.krummel",
"hasName": {
"_about": "http://network.csiro.au/data/paul.krummel_name",
"firstName": "Paul",
"lastName": "Krummel",
"title": "Mr"
}
},
"sequenceNumber": 0
},
"classification": [
{
"_about": "http://network.csiro.au/data/classification_code_040104",
"name": "Climate Change Processes"
},
{
"_about": "http://network.csiro.au/data/classification_code_040199",
"name": "Atmospheric Sciences not elsewhere classified"
}
],
"classificationLevel": "http://network.csiro.au/data/unclassified",
"journalTitle": "Atmospheric Chemistry and Physics",
"keyword": [
"CH4",
"OH",
"hydroxyl radical",
"methane"
],
"outcome": "Approved",
"pages": "7943\u20137956",
"project": "http://network.csiro.au/data/project_PD00009165",
"publicationVolume": "16",
"publishedDate": "30-Jun-2016",
"publisher": "Copernicus GmbH",
"title": "Role of OH variability in the stalling of the global atmospheric CH4 growth rate from 1999 to 2006",
"wbscode": "R-07848; R-06420; R-07768",
"yearOfPublication": "2016"
},
{
"_about": "http://network.csiro.au/data/pub_EP152677",
"access": "CSIRO Only",
"author": [
{
"_about": "http://network.csiro.au/data/andrew.george",
"hasName": {
"_about": "http://network.csiro.au/data/andrew.george_name",
"firstName": "Andrew",
"lastName": "George",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/sigrid.lehnert",
"hasName": {
"_about": "http://network.csiro.au/data/sigrid.lehnert_name",
"firstName": "Sigrid",
"lastName": "Lehnert",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/toni.reverter-gomez",
"hasName": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez_name",
"firstName": "Toni",
"lastName": "Reverter-Gomez",
"title": "Dr"
}
},
{
"_about": "http://network.csiro.au/data/yutao.li",
"hasName": {
"_about": "http://network.csiro.au/data/yutao.li_name",
"firstName": "Yutao",
"lastName": "Li",
"title": "Dr"
}
}
],
"authorSeq": [
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_0",
"author": {
"_about": "http://network.csiro.au/data/yutao.li",
"hasName": {
"_about": "http://network.csiro.au/data/yutao.li_name",
"firstName": "Yutao",
"lastName": "Li",
"title": "Dr"
}
},
"sequenceNumber": 0
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_1",
"author": {
"_about": "http://network.csiro.au/data/andrew.george",
"hasName": {
"_about": "http://network.csiro.au/data/andrew.george_name",
"firstName": "Andrew",
"lastName": "George",
"title": "Dr"
}
},
"sequenceNumber": 1
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_2",
"author": {
"_about": "http://network.csiro.au/data/sigrid.lehnert",
"hasName": {
"_about": "http://network.csiro.au/data/sigrid.lehnert_name",
"firstName": "Sigrid",
"lastName": "Lehnert",
"title": "Dr"
}
},
"sequenceNumber": 2
},
{
"_about": "http://network.csiro.au/data/pub_EP152677_author_3",
"author": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez",
"hasName": {
"_about": "http://network.csiro.au/data/toni.reverter-gomez_name",
"firstName": "Toni",
"lastName": "Reverter-Gomez",
"title": "Dr"
}
},
"sequenceNumber": 3
}
],
"classification": {
"_about": "http://network.csiro.au/data/classification_code_070201",
"name": "Animal Breeding"
},
"classificationLevel": "http://network.csiro.au/data/unclassified",
"conferenceDate": "28th-30th September 2015",
"conferenceLocation": "Lorne, Victoria",
"conferenceName": "21st AAABG",
"keyword": " ",
"outcome": "Approved",
"pages": "433-436",
"project": "http://network.csiro.au/data/project_PD00005603",
"publicationVolume": "21",
"publishedDate": "25-Sep-2015",
"publisher": "Association for the Advancement of Animal Breeding and Genetics",
"title": "Using Random Forests to Identify SNP Associated With Leg Defect in Broiler Chicken: Impact of Correcting For Population Structures",
"wbscode": "R-05156",
"yearOfPublication": "2015"
}
],
"itemsPerPage": 5,
"next": "http://network.csiro.au:9500/standalone/publications.json?_page=2",
"page": 1,
"prev": "http://network.csiro.au:9500/standalone/publications.json?_page=0",
"startIndex": 6,
"totalResults": 47023,
"type": [
"http://purl.org/linked-data/api/vocab#Page"
]
}
}

I am reading the data in as follows:

# jsonlite provides fromJSON(); tidyjson provides the as.tbl_json() and
# enter_object() verbs used in the next snippet.
library(jsonlite)
library(tidyjson)
# Fetch one page of five publications. fromJSON() parses the response, so
# `pubs` holds an R list rather than the raw JSON text.
pubs <- fromJSON("http://network.csiro.au:9500/standalone/publications.json?_page=1&_pageSize=5")

When trying to extract meaningful data using tidyjson, I get this error:

# Attempt to step into the "items" element with tidyjson.
# This stops at as.tbl_json because `pubs` is an already-parsed list, for
# which as.tbl_json has no method (see the error message below).
pubs %>%
  as.tbl_json %>%
  enter_object("items")

Error in UseMethod("as.tbl_json") : 
  no applicable method for 'as.tbl_json' applied to an object of class "list"

I am not an expert in R or JSON so would appreciate some guidance. Using the above example, I want to create for each publication an edge list of co-authors like this:

_about                                    yearOfPublication from            to
http://network.url.com/data/pub_EP16079   2011              Colin Jackson   Holly Trueman
http://network.url.com/data/pub_EP16079   2011              Colin Jackson   Tara Sutherland
http://network.url.com/data/pub_EP16079   2011              Colin Jackson   Trevor Rapson
http://network.url.com/data/pub_EP16079   2011              Holly Trueman   Tara Sutherland
http://network.url.com/data/pub_EP16079   2011              Holly Trueman   Trevor Rapson
http://network.url.com/data/pub_EP16079   2011              Tara Sutherland Trevor Rapson

I hope someone can help me! Thanks in advance.

回答1:

This is a bit of a tricky example. See this issue for discussion on how to improve how tidyjson handles objects that are sometimes arrays.

While not the cleanest solution, I think this does get the job done - you could probably functionalize some of these groups of steps to optimize code-reuse.

The basic aim is to parse enough of the object to get to the authors, then use a separate workflow for author objects and author arrays. The arrays require tidyr::expand to complete the combinations of all authors within a publication (since those combinations are not represented in the data).

library(dplyr)
library(tidyjson)
library(tidyr)

## Read the raw JSON text into a single string. The tidyjson verbs below are
## applied to this string, not to a list parsed by jsonlite::fromJSON().
json <- paste(readLines("ex.json"), collapse = " ")

## Walk down to each publication's "author" element and record its JSON type.
## Notice some publications have an object representing a single author,
## others have an array of many authors.
prep <- json %>%
  enter_object("result") %>%
  enter_object("items") %>%
  gather_array() %>%
  spread_values(
    about = jstring("_about"),
    yearOfPublication = jstring("yearOfPublication")
  ) %>%
  enter_object("author") %>%
  json_types()

## Object types (single author): the name fields can be read directly.
## The lone author is paired with themselves so the publication still
## appears in the result.
authorobj <- prep %>%
  filter(as.character(type) == "object") %>%
  spread_values(
    authorFirst = jstring("hasName", "firstName"),
    authorLast = jstring("hasName", "lastName")
  ) %>%
  mutate(from = paste(authorFirst, authorLast), to = from) %>%
  select(-authorFirst, -authorLast) %>%
  as_tibble()

## Array types (several authors): one row per author gives the 'from' side.
authorarr <- prep %>%
  filter(as.character(type) == "array") %>%
  gather_array("authorid") %>%
  spread_values(
    authorFirst = jstring("hasName", "firstName"),
    authorLast = jstring("hasName", "lastName")
  ) %>%
  mutate(from = paste(authorFirst, authorLast)) %>%
  select(-authorFirst, -authorLast) %>%
  as_tibble()

## Use tidyr::expand to complete the from/to combinations. The columns are
## referenced by bare name so that expand() is evaluated inside each
## array.index group; authors are only paired with co-authors of the same
## publication, never with authors of another publication.
author_pairs <- authorarr %>%
  group_by(array.index) %>%
  expand(from, to = from) %>%
  ungroup()

authorarr <- authorarr %>%
  left_join(author_pairs, by = c("array.index", "from"))

## Stack both cases (select only a few columns for display).
## To keep each co-author pair once and drop the self-pairs, insert
## `filter(from < to) %>%` before the select().
dplyr::bind_rows(authorobj, authorarr) %>%
  select(array.index, from, to)
#> # A tibble: 20 x 3
#>    array.index                from                  to
#>          <int>               <chr>               <chr>
#>  1           1       Aimee Slangen       Aimee Slangen
#>  2           2      Roland Pitcher      Roland Pitcher
#>  3           3         Rob Bramley         Rob Bramley
#>  4           4        Paul Krummel        Paul Krummel
#>  5           5       Andrew George       Andrew George
#>  6           5       Andrew George      Sigrid Lehnert
#>  7           5       Andrew George Toni Reverter-Gomez
#>  8           5       Andrew George            Yutao Li
#>  9           5      Sigrid Lehnert       Andrew George
#> 10           5      Sigrid Lehnert      Sigrid Lehnert
#> 11           5      Sigrid Lehnert Toni Reverter-Gomez
#> 12           5      Sigrid Lehnert            Yutao Li
#> 13           5 Toni Reverter-Gomez       Andrew George
#> 14           5 Toni Reverter-Gomez      Sigrid Lehnert
#> 15           5 Toni Reverter-Gomez Toni Reverter-Gomez
#> 16           5 Toni Reverter-Gomez            Yutao Li
#> 17           5            Yutao Li       Andrew George
#> 18           5            Yutao Li      Sigrid Lehnert
#> 19           5            Yutao Li Toni Reverter-Gomez
#> 20           5            Yutao Li            Yutao Li


标签: r jsonlite