Getting imported json data into a data frame

2020-01-23 03:24发布

问题:

I have a file containing over 1500 json objects that I want to work with in R. I've been able to import the data as a list, but am having trouble coercing it into a useful structure. I want to create a data frame containing a row for each json object and a column for each key:value pair.

I've recreated my situation with this small, fake data set:

[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]

Some features of the data:

  • The objects all contain the same number of key:value pairs although some of the values are null
  • There are two non-numeric columns per object (name and group)
  • name is the unique identifier, there are 10 or so groups
  • many of the name and group entires contain spaces, commas and other punctuation.

Based on this question: R list(structure(list())) to data frame, I tried the following:

json_file <- "test.json"
json_data <- fromJSON(json_file)
asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame))

With both my real data and this fake data, the last line give me this error:

Error in data.frame(name = "Doe, John", group = "Red", `age (y)` = 24,  : 
  arguments imply differing number of rows: 1, 0

回答1:

You just need to replace your NULLs with NAs:

require(RJSONIO)    

json_file <-  '[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
    {"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
    {"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
    {"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
    {"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
    {"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
    {"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]'


json_file <- fromJSON(json_file)

json_file <- lapply(json_file, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

Once you have a non-null value for each element, you can call rbind without getting an error:

do.call("rbind", json_file)
     name           group    age (y) height (cm) wieght (kg) score
[1,] "Doe, John"    "Red"    "24"    "182"       "74.8"      NA   
[2,] "Doe, Jane"    "Green"  "30"    "170"       "70.1"      "500"
[3,] "Smith, Joan"  "Yellow" "41"    "169"       "60"        NA   
[4,] "Brown, Sam"   "Green"  "22"    "183"       "75"        "865"
[5,] "Jones, Larry" "Green"  "31"    "178"       "83.9"      "221"
[6,] "Murray, Seth" "Red"    "35"    "172"       "76.2"      "413"
[7,] "Doe, Jane"    "Yellow" "22"    "164"       "68"        "902"


回答2:

This is very simple if you use either library(jsonlite) or library(jsonify)

Both of these handle the null values and converts them to NA, and they preserve the data types.

Data

json_file <-  '[{"name":"Doe, John","group":"Red","age (y)":24,"height (cm)":182,"wieght (kg)":74.8,"score":null},
{"name":"Doe, Jane","group":"Green","age (y)":30,"height (cm)":170,"wieght (kg)":70.1,"score":500},
{"name":"Smith, Joan","group":"Yellow","age (y)":41,"height (cm)":169,"wieght (kg)":60,"score":null},
{"name":"Brown, Sam","group":"Green","age (y)":22,"height (cm)":183,"wieght (kg)":75,"score":865},
{"name":"Jones, Larry","group":"Green","age (y)":31,"height (cm)":178,"wieght (kg)":83.9,"score":221},
{"name":"Murray, Seth","group":"Red","age (y)":35,"height (cm)":172,"wieght (kg)":76.2,"score":413},
{"name":"Doe, Jane","group":"Yellow","age (y)":22,"height (cm)":164,"wieght (kg)":68,"score":902}]'

jsonlite

library(jsonlite)
jsonlite::fromJSON( json_file )
#           name  group age (y) height (cm) wieght (kg) score
# 1    Doe, John    Red      24         182        74.8    NA
# 2    Doe, Jane  Green      30         170        70.1   500
# 3  Smith, Joan Yellow      41         169        60.0    NA
# 4   Brown, Sam  Green      22         183        75.0   865
# 5 Jones, Larry  Green      31         178        83.9   221
# 6 Murray, Seth    Red      35         172        76.2   413
# 7    Doe, Jane Yellow      22         164        68.0   902

str( jsonlite::fromJSON( json_file ) )
# 'data.frame': 7 obs. of  6 variables:
# $ name       : chr  "Doe, John" "Doe, Jane" "Smith, Joan" "Brown, Sam" ...
# $ group      : chr  "Red" "Green" "Yellow" "Green" ...
# $ age (y)    : int  24 30 41 22 31 35 22
# $ height (cm): int  182 170 169 183 178 172 164
# $ wieght (kg): num  74.8 70.1 60 75 83.9 76.2 68
# $ score      : int  NA 500 NA 865 221 413 902

jsonify

library(jsonify)
jsonify::from_json( json_file )
#           name  group age (y) height (cm) wieght (kg) score
# 1    Doe, John    Red      24         182        74.8    NA
# 2    Doe, Jane  Green      30         170        70.1   500
# 3  Smith, Joan Yellow      41         169        60.0    NA
# 4   Brown, Sam  Green      22         183        75.0   865
# 5 Jones, Larry  Green      31         178        83.9   221
# 6 Murray, Seth    Red      35         172        76.2   413
# 7    Doe, Jane Yellow      22         164        68.0   90


str( jsonify::from_json( json_file ) )
# 'data.frame': 7 obs. of  6 variables:
# $ name       : chr  "Doe, John" "Doe, Jane" "Smith, Joan" "Brown, Sam" ...
# $ group      : chr  "Red" "Green" "Yellow" "Green" ...
# $ age (y)    : int  24 30 41 22 31 35 22
# $ height (cm): int  182 170 169 183 178 172 164
# $ wieght (kg): num  74.8 70.1 60 75 83.9 76.2 68
# $ score      : int  NA 500 NA 865 221 413 902


回答3:

To remove null values use parameter nullValue

json_data <- fromJSON(json_file, nullValue = NA)
asFrame <- do.call("rbind.fill", lapply(json_data, as.data.frame))

this way there won´t be any unnecessary quotes in your output



回答4:

library(rjson)
Lines <- readLines("yelp_academic_dataset_business.json") 
business <- as.data.frame(t(sapply(Lines, fromJSON)))

You may try this to load JSON data into R



回答5:

dplyr::bind_rows(fromJSON(file_name))