how do you extract data from nested json data in R

2019-09-12 02:12发布

问题:

I need to be able to extract these fields from this file that has many json entries:

sender: Hostname
mem:used_p
cpu: user_p
load: load_5

//

cat tmp.txt

{"senderDateTimeStamp":"2016-04-07T00:00:00.0093","senderHost":"server1","senderAppcode":"test_infrastats_prod","senderUsecase":"system","destinationTopic":"test_serverstats_realtimedata_topic_prod","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460001606277,"payloadData":{"timestamp":"2016-04-07T00:00:00.093","sender":{"name":"server1","hostname":"server1"},"count":"1","shipper":"server1","mem":{"total":"18855256064","free":"7273639936","actual_used":"3755769856","used_p":"0.6242380717975277","actual_free":"15099486208","used":"11581616128","actual_used_p":"0.2091895439262065"},"cpu":{"steal":"0","idle":"5102727720","system":"16658360","softirq":"13824070","irq":"1659250","system_p":"0.012666049012784248","nice":"32210","iowait":"660220","user_p":"0.18809078763071663","user":"1112770410"},"load":{"load1":"1.54","load15":"1.11","load5":"1.2"},"swap":{"total":"18855256064","free":"1044598784","actual_used":"0","used_p":"0.0","actual_free":"0","used":"11581616128"},"type":"system"},"payloadDataText":null,"key":"test_infrastats_prod:system","destinationTopicName":"test_serverstats_realtimedata_topic_prod","hdfsPath":"test_infrastats_prod/system","esindex":"test_infrastats_prod","estype":"system","appCode":"test_infrastats_prod","useCase":"system"}

{"senderDateTimeStamp":"2016-04-07T00:00:00.0093","senderHost":"server1","senderAppcode":"test_infrastats_prod","senderUsecase":"system","destinationTopic":"test_serverstats_realtimedata_topic_prod","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460001606277,"payloadData":{"timestamp":"2016-04-07T00:00:00.093","sender":{"name":"server1","hostname":"server1"},"count":"1","shipper":"server1","mem":{"total":"18855256064","free":"7273639936","actual_used":"3755769856","used_p":"0.6242380717975277","actual_free":"15099486208","used":"11581616128","actual_used_p":"0.2091895439262065"},"cpu":{"steal":"0","idle":"5102727720","system":"16658360","softirq":"13824070","irq":"1659250","system_p":"0.012666049012784248","nice":"32210","iowait":"660220","user_p":"0.18809078763071663","user":"1112770410"},"load":{"load1":"1.54","load15":"1.11","load5":"1.2"},"swap":{"total":"18855256064","free":"1044598784","actual_used":"0","used_p":"0.0","actual_free":"0","used":"11581616128"},"type":"system"},"payloadDataText":null,"key":"test_infrastats_prod:system","destinationTopicName":"test_serverstats_realtimedata_topic_prod","hdfsPath":"test_infrastats_prod/system","esindex":"test_infrastats_prod","estype":"system","appCode":"test_infrastats_prod","useCase":"system"}

{"senderDateTimeStamp":"2016-04-07T00:00:00.0093","senderHost":"server1","senderAppcode":"test_infrastats_prod","senderUsecase":"system","destinationTopic":"test_serverstats_realtimedata_topic_prod","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460001606277,"payloadData":{"timestamp":"2016-04-07T00:00:00.093","sender":{"name":"server1","hostname":"server1"},"count":"1","shipper":"server1","mem":{"total":"18855256064","free":"7273639936","actual_used":"3755769856","used_p":"0.6242380717975277","actual_free":"15099486208","used":"11581616128","actual_used_p":"0.2091895439262065"},"cpu":{"steal":"0","idle":"5102727720","system":"16658360","softirq":"13824070","irq":"1659250","system_p":"0.012666049012784248","nice":"32210","iowait":"660220","user_p":"0.18809078763071663","user":"1112770410"},"load":{"load1":"1.54","load15":"1.11","load5":"1.2"},"swap":{"total":"18855256064","free":"1044598784","actual_used":"0","used_p":"0.0","actual_free":"0","used":"11581616128"},"type":"system"},"payloadDataText":null,"key":"test_infrastats_prod:system","destinationTopicName":"test_serverstats_realtimedata_topic_prod","hdfsPath":"test_infrastats_prod/system","esindex":"test_infrastats_prod","estype":"system","appCode":"test_infrastats_prod","useCase":"system"}

I can extract data within payloadData section like this:

df <- jsonlite::fromJSON(paste0("[",paste0(readLines("c:/tmp.txt"),collapse=","),"]"))$payloadData[c("timestamp","count")]

but there are nested objects within payloadData section, how would I extract the data in nested section in json data, from the mem, cpu, load section?

回答1:

One way is to use tidyjson:

library(tidyjson)
library(magrittr)

json <- '{"senderDateTimeStamp":"2016-04-07T00:00:00.0093","senderHost":"server1","senderAppcode":"test_infrastats_prod","senderUsecase":"system","destinationTopic":"test_serverstats_realtimedata_topic_prod","correlatedRecord":false,"needCorrelationCacheCleanup":false,"needCorrelation":false,"correlationAttributes":null,"correlationRecordCount":0,"correlateTimeWindowInMills":0,"lastCorrelationRecord":false,"realtimeESStorage":true,"receiverDateTimeStamp":1460001606277,"payloadData":{"timestamp":"2016-04-07T00:00:00.093","sender":{"name":"server1","hostname":"server1"},"count":"1","shipper":"server1","mem":{"total":"18855256064","free":"7273639936","actual_used":"3755769856","used_p":"0.6242380717975277","actual_free":"15099486208","used":"11581616128","actual_used_p":"0.2091895439262065"},"cpu":{"steal":"0","idle":"5102727720","system":"16658360","softirq":"13824070","irq":"1659250","system_p":"0.012666049012784248","nice":"32210","iowait":"660220","user_p":"0.18809078763071663","user":"1112770410"},"load":{"load1":"1.54","load15":"1.11","load5":"1.2"},"swap":{"total":"18855256064","free":"1044598784","actual_used":"0","used_p":"0.0","actual_free":"0","used":"11581616128"},"type":"system"},"payloadDataText":null,"key":"test_infrastats_prod:system","destinationTopicName":"test_serverstats_realtimedata_topic_prod","hdfsPath":"test_infrastats_prod/system","esindex":"test_infrastats_prod","estype":"system","appCode":"test_infrastats_prod","useCase":"system"}'

json %>%
  enter_object("payloadData") %>%
  spread_values(send_host = jstring("sender", "hostname"),
                mem_used_p = jstring("mem", "used_p"),
                cpu_user_p = jstring("cpu", "user_p"),
                load_load_5 = jstring("load","load5"))

#   document.id send_host         mem_used_p          cpu_user_p load_load_5
# 1           1   server1 0.6242380717975277 0.18809078763071663         1.2

Alternatively, you could stack each set of keys:

payload <- json %>% enter_object("payloadData")

sender_keys <- payload %>%
  enter_object("sender") %>%
  gather_keys() %>%
  append_values_string()

mem_keys <- payload %>%
  enter_object("mem") %>%
  gather_keys() %>%
  append_values_string()

cpu_keys <- payload %>%
  enter_object("cpu") %>%
  gather_keys() %>%
  append_values_string()

load_keys <- payload %>%
  enter_object("load") %>%
  gather_keys() %>%
  append_values_string()


标签: json r jsonlite