Recursively ensuring tibbles instead of data frame

2020-04-17 04:51发布

I have to deal with JSON documents that contain nested documents and at some level have an array which in turn contains individual documents that conceptionally would map back to "data frame rows" when reading/parsing the JSON in R.

enter image description here

First order problem/question

I'm looking for a way to ensure that

  • either all data frames are always turned into tibbles

  • or that at least the "leaf data frames" become tibbles while the the "parent data frames" are allowed to become lists

for arbitrary nested structures, either directly upon parsing via {jsonlite} or afterwards via {purrr}.

Second order problem/question

How do I traverse lists and apply map recursively with {purrr} "the right way"?

Related


Example

Example data

json <- '[
  {
    "labels": ["label-a", "label-b"],
    "levelOne": {
      "levelTwo": {
        "levelThree": [
          {
            "x": "A",
            "y": 1,
            "z": true
          },
          {
            "x": "B",
            "y": 2,
            "z": false
          }
          ]
      }
    },
    "schema": "0.0.1"
  },
  {
    "labels": ["label-a", "label-b"],
    "levelOne": {
      "levelTwo": {
        "levelThree": [
          {
            "x": "A",
            "y": 10,
            "z": false
          },
          {
            "x": "B",
            "y": 20,
            "z": true
          }
          ]
      }
    },
    "schema": "0.0.1"
  }
]'

Result after parsing and turning into tibble

x <- json %>% jsonlite::fromJSON()

x %>% str()
# 'data.frame': 2 obs. of  3 variables:
#  $ labels  :List of 2
#   ..$ : chr  "label-a" "label-b"
#   ..$ : chr  "label-a" "label-b"
#  $ levelOne:'data.frame': 2 obs. of  1 variable:
#   ..$ levelTwo:'data.frame':  2 obs. of  1 variable:
#   .. ..$ levelThree:List of 2
#   .. .. ..$ :'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  1 2
#   .. .. .. ..$ z: logi  TRUE FALSE
#   .. .. ..$ :'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  10 20
#   .. .. .. ..$ z: logi  FALSE TRUE
#  $ schema  : chr  "0.0.1" "0.0.1"

x_tbl <- x %>% tibble::as_tibble()

x_tbl %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  3 variables:
#  $ labels  :List of 2
#   ..$ : chr  "label-a" "label-b"
#   ..$ : chr  "label-a" "label-b"
#  $ levelOne:'data.frame': 2 obs. of  1 variable:
#   ..$ levelTwo:'data.frame':  2 obs. of  1 variable:
#   .. ..$ levelThree:List of 2
#   .. .. ..$ :'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  1 2
#   .. .. .. ..$ z: logi  TRUE FALSE
#   .. .. ..$ :'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  10 20
#   .. .. .. ..$ z: logi  FALSE TRUE
#  $ schema  : chr  "0.0.1" "0.0.1"

Desired result

x_tbl$levelOne <- x_tbl$levelOne %>% tibble::as_tibble()
x_tbl$levelOne$levelTwo <- x_tbl$levelOne$levelTwo %>% 
  tibble::as_tibble()
x_tbl$levelOne$levelTwo$levelThree <- x_tbl$levelOne$levelTwo$levelThree %>% 
  purrr::map(tibble::as_tibble)

x_tbl %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  3 variables:
#  $ labels  :List of 2
#   ..$ : chr  "label-a" "label-b"
#   ..$ : chr  "label-a" "label-b"
#  $ levelOne:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  1 variable:
#   ..$ levelTwo:Classes ‘tbl_df’, ‘tbl’ and 'data.frame':  2 obs. of  1 variable:
#   .. ..$ levelThree:List of 2
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  1 2
#   .. .. .. ..$ z: logi  TRUE FALSE
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  10 20
#   .. .. .. ..$ z: logi  FALSE TRUE
#  $ schema  : chr  "0.0.1" "0.0.1"

If I try to do that via dplyr::mutate() or purrr::map*_df(), I get the Error: Columnis of unsupported class data.frame error

Current implementation

I have something which looks good at first sight, but duplicates the nested structure as soon as you cast the list to tibble. Even if it did work as desired, it seems to complicated and brittle as it was designed with one specific use case/JSON structure in mind:

tidy_nested_data_frames <- function(
  x
) {
  is_data_frame_that_should_be_list <- function(x) {
    is.data.frame(x) && purrr::map_lgl(x, is.data.frame)
  }
  y <- x %>%
    purrr::map_if(is_data_frame_that_should_be_list, as.list)

  # Check for next data frame columns to handle:
  false <- function(.x) FALSE
  class_info <- y %>%
    purrr::map_if(is.list, ~.x %>% purrr::map(is.data.frame), .else = false)

  trans_to_tibble <- function(x) {
    x %>% purrr::map(tibble::as_tibble)
  }
  purrr::map2(class_info, y, function(.x, .y) {
    go_deeper <- .x %>% as.logical() %>% all()

    if (go_deeper) {
      # Continue if data frame columns have been detected:

      tidy_nested_data_frames(.y)
    } else {
      # Handle data frames that have list columns that themselves carry the data
      # frames we want to turn into tibbles:

      # NOTE:
      # This probably does not generalize well yet as the logic seems to much
      # tied to my current use case!

      if (.y %>% is.data.frame()) {
        .y %>%
          purrr::map_if(is.list, trans_to_tibble)
      } else {
        .y
      }
    }
  })
}

Testing

x <- json %>% 
  jsonlite::fromJSON() %>% 
  tidy_nested_data_frames()

x %>% str()
# List of 3
#  $ labels  :List of 2
#   ..$ : chr [1:2] "label-a" "label-b"
#   ..$ : chr [1:2] "label-a" "label-b"
#  $ levelOne:List of 1
#   ..$ levelTwo:List of 1
#   .. ..$ levelThree:List of 2
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr [1:2] "A" "B"
#   .. .. .. ..$ y: int [1:2] 1 2
#   .. .. .. ..$ z: logi [1:2] TRUE FALSE
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr [1:2] "A" "B"
#   .. .. .. ..$ y: int [1:2] 10 20
#   .. .. .. ..$ z: logi [1:2] FALSE TRUE
#  $ schema  : chr [1:2] "0.0.1" "0.0.1"

x_tbl <- x %>% tibble::as_tibble()

x_tbl %>% str()
# Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  3 variables:
#  $ labels  :List of 2
#   ..$ : chr  "label-a" "label-b"
#   ..$ : chr  "label-a" "label-b"
#  $ levelOne:List of 2
#   ..$ levelTwo:List of 1
#   .. ..$ levelThree:List of 2
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  1 2
#   .. .. .. ..$ z: logi  TRUE FALSE
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  10 20
#   .. .. .. ..$ z: logi  FALSE TRUE
#   ..$ levelTwo:List of 1
#   .. ..$ levelThree:List of 2
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  1 2
#   .. .. .. ..$ z: logi  TRUE FALSE
#   .. .. ..$ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame':    2 obs. of  3 variables:
#   .. .. .. ..$ x: chr  "A" "B"
#   .. .. .. ..$ y: int  10 20
#   .. .. .. ..$ z: logi  FALSE TRUE
#  $ schema  : chr  "0.0.1" "0.0.1"

EDIT 2020-01-14

Trying out the approach of Alland Cameron "as is" I get:

library(tibble)

x %>% 
  recursive_tibble() %>% 
  str()
# List of 3
#  $ labels  :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  2 variables:
#   ..$ V1: chr [1:2] "label-a" "label-b"
#   ..$ V2: chr [1:2] "label-a" "label-b"
#  $ levelOne:List of 1
#   ..$ levelTwo:List of 1
#   .. ..$ levelThree:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  6 variables:
#   .. .. ..$ x1: chr [1:2] "A" "A"
#   .. .. ..$ x2: chr [1:2] "B" "B"
#   .. .. ..$ y1: chr [1:2] "1" "10"
#   .. .. ..$ y2: chr [1:2] "2" "20"
#   .. .. ..$ z1: chr [1:2] "TRUE" "FALSE"
#   .. .. ..$ z2: chr [1:2] "FALSE" "TRUE"
#  $ schema  : chr [1:2] "0.0.1" "0.0.1"

Session info

sessioninfo::session_info()
# ─ Session info ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#  setting  value                       
#  version  R version 3.6.1 (2019-07-05)
#  os       Pop!_OS 19.10               
#  system   x86_64, linux-gnu           
#  ui       RStudio                     
#  language en_US:en                    
#  collate  en_US.UTF-8                 
#  ctype    en_US.UTF-8                 
#  tz       UTC                         
#  date     2020-01-14                  
# 
# ─ Packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#  ! package       * version     date       lib source                         
#    askpass         1.1         2019-01-13 [1] CRAN (R 3.6.1)                 
#    assertthat      0.2.1       2019-03-21 [1] CRAN (R 3.6.1)                 
#    backports       1.1.5       2019-10-02 [1] CRAN (R 3.6.1)                 
#    bmp             0.3         2017-09-11 [1] CRAN (R 3.6.1)                 
#    callr           3.4.0       2019-12-09 [1] CRAN (R 3.6.1)                 
#    cli             2.0.1       2020-01-08 [1] CRAN (R 3.6.1)                 
#    colorspace      1.4-1       2019-03-18 [1] CRAN (R 3.6.1)                 
#    config          0.3         2018-03-27 [1] CRAN (R 3.6.1)                 
#    confx           0.0.0.9012  2020-01-05 [1] github (rappster/confx@9695409)
#    crayon          1.3.4       2017-09-16 [1] CRAN (R 3.6.1)                 
#    curl            4.3         2019-12-02 [1] CRAN (R 3.6.1)                 
#  R depot.dts.dce * 0.1.1.9003  <NA>       [?] <NA>                           
#    desc            1.2.0       2018-05-01 [1] CRAN (R 3.6.1)                 
#    devtools        2.2.1       2019-09-24 [1] CRAN (R 3.6.1)                 
#    digest          0.6.23      2019-11-23 [1] CRAN (R 3.6.1)                 
#    dplyr           0.8.3       2019-07-04 [1] CRAN (R 3.6.1)                 
#    ellipsis        0.3.0       2019-09-20 [1] CRAN (R 3.6.1)                 
#    fansi           0.4.1       2020-01-08 [1] CRAN (R 3.6.1)                 
#    fs              1.3.1       2019-05-06 [1] CRAN (R 3.6.1)                 
#    glue            1.3.1       2019-03-12 [1] CRAN (R 3.6.1)                 
#    here            0.1         2017-05-28 [1] CRAN (R 3.6.1)                 
#    igraph          1.2.4.2     2019-11-27 [1] CRAN (R 3.6.1)                 
#    imager          0.41.2      2019-01-23 [1] CRAN (R 3.6.1)                 
#    jpeg            0.1-8.1     2019-10-24 [1] CRAN (R 3.6.1)                 
#    jsonlite        1.6         2018-12-07 [1] CRAN (R 3.6.1)                 
#    knitr           1.26        2019-11-12 [1] CRAN (R 3.6.1)                 
#    later           1.0.0       2019-10-04 [1] CRAN (R 3.6.1)                 
#    lifecycle       0.1.0       2019-08-01 [1] CRAN (R 3.6.1)                 
#    lubridate       1.7.4       2018-04-11 [1] CRAN (R 3.6.1)                 
#    magick          2.2         2019-08-26 [1] CRAN (R 3.6.1)                 
#    magrittr        1.5         2014-11-22 [1] CRAN (R 3.6.1)                 
#    memoise         1.1.0       2017-04-21 [1] CRAN (R 3.6.1)                 
#    mongolite       2.1.0       2019-05-09 [1] CRAN (R 3.6.1)                 
#    munsell         0.5.0       2018-06-12 [1] CRAN (R 3.6.1)                 
#    openssl         1.4.1       2019-07-18 [1] CRAN (R 3.6.1)                 
#    pillar          1.4.3       2019-12-20 [1] CRAN (R 3.6.1)                 
#    pkgbuild        1.0.6       2019-10-09 [1] CRAN (R 3.6.1)                 
#    pkgconfig       2.0.3       2019-09-22 [1] CRAN (R 3.6.1)                 
#    pkgload         1.0.2       2018-10-29 [1] CRAN (R 3.6.1)                 
#    plyr            1.8.5       2019-12-10 [1] CRAN (R 3.6.1)                 
#    png             0.1-7       2013-12-03 [1] CRAN (R 3.6.1)                 
#    prettyunits     1.0.2       2015-07-13 [1] CRAN (R 3.6.1)                 
#    processx        3.4.1       2019-07-18 [1] CRAN (R 3.6.1)                 
#    promises      * 1.1.0       2019-10-04 [1] CRAN (R 3.6.1)                 
#    ps              1.3.0       2018-12-21 [1] CRAN (R 3.6.1)                 
#    purrr           0.3.3       2019-10-18 [1] CRAN (R 3.6.1)                 
#    R6              2.4.1       2019-11-12 [1] CRAN (R 3.6.1)                 
#    Rcpp            1.0.3       2019-11-08 [1] CRAN (R 3.6.1)                 
#    readbitmap      0.1.5       2018-06-27 [1] CRAN (R 3.6.1)                 
#    remotes         2.1.0       2019-06-24 [1] CRAN (R 3.6.1)                 
#    renv            0.9.2       2019-12-09 [1] CRAN (R 3.6.1)                 
#    rlang           0.4.2       2019-11-23 [1] CRAN (R 3.6.1)                 
#    rprojroot       1.3-2       2018-01-03 [1] CRAN (R 3.6.1)                 
#    rstudioapi      0.10        2019-03-19 [1] CRAN (R 3.6.1)                 
#    scales          1.1.0       2019-11-18 [1] CRAN (R 3.6.1)                 
#    sessioninfo     1.1.1       2018-11-05 [1] CRAN (R 3.6.1)                 
#    stringi         1.4.3       2019-03-12 [1] CRAN (R 3.6.1)                 
#    stringr         1.4.0       2019-02-10 [1] CRAN (R 3.6.1)                 
#    testthat      * 2.3.1       2019-12-01 [1] CRAN (R 3.6.1)                 
#    tibble        * 2.1.3       2019-06-06 [1] CRAN (R 3.6.1)                 
#    tidyr           1.0.0       2019-09-11 [1] CRAN (R 3.6.1)                 
#    tidyselect      0.2.5       2018-10-11 [1] CRAN (R 3.6.1)                 
#    tiff            0.1-5       2013-09-04 [1] CRAN (R 3.6.1)                 
#    usethis         1.5.1       2019-07-04 [1] CRAN (R 3.6.1)                 
#    utf8            1.1.4       2018-05-24 [1] CRAN (R 3.6.1)                 
#    vctrs           0.2.99.9001 2020-01-08 [1] github (r-lib/vctrs@ab84679)   
#    withr           2.1.2       2018-03-15 [1] CRAN (R 3.6.1)                 
#    xfun            0.11        2019-11-12 [1] CRAN (R 3.6.1)                 
#    yaml            2.2.0       2018-07-25 [1] CRAN (R 3.6.1)                 
# 
# [1] /home/janko/R/x86_64-pc-linux-gnu-library/3.6
# [2] /usr/local/lib/R/site-library
# [3] /usr/lib/R/site-library
# [4] /usr/lib/R/library

1条回答
The star\"
2楼-- · 2020-04-17 05:04

I guess you're going to have to use recursion to go through the list. Here's an idea I had, but I could only get it to work with fromJSON from the rjson package rather than the jsonlite package.

The first step is to define a recursive function to check the depth of a list element:

depth <- function(list_entry)
{
  if (is.list(list_entry) & !is.tibble(list_entry)) 
      return(max(sapply(list_entry, depth)) + 1)
  else 
      return(0)
}

The next function recursively tries to make a tibble out of depth-1 elements (if they are vectors) or out of depth-2 elements (if the tibble values are listed individually). If it finds a depth-0 element it will return it unchanged, and if the element is > 2 deep or not suitable to turn into a tibble, it will pass the children nodes recursively for the same treatment.

recursive_tibble <- function(json_list)
{
  lapply(json_list, function(y)
  {
    if(depth(y) == 0)
      return(unlist(y))

    if(depth(y) == 1)
    {
        if (length(y) < 2) 
          return(unlist(y))

        if (length(unique(names(y))) == 1)
          return(as_tibble(do.call("rbind", lapply(y, unlist))))

        if (length(unique(unlist(lapply(y, length)))) == 1)
          return(as_tibble(do.call("cbind", lapply(y, unlist))))

        else return(unlist(y))
    }

    if (depth(y) == 2)
    {
        if (length(y) < 2) 
          return(recursive_tibble(y))

        if (all(do.call(`==`, lapply(y, names))))     
          return(as_tibble(do.call("rbind", lapply(y, unlist))))
    }

    else return(recursive_tibble(y))
  })
}

So now you can do:

recursive_tibble(x)
#> List of 2
#>  $ :List of 5
#>   ..$ _id      : chr "1234"
#>   ..$ createdAt: chr "2020-01-13 09:00:00"
#>   ..$ labels   : chr [1:2] "label-a" "label-b"
#>   ..$ levelOne :List of 1
#>   .. ..$ levelTwo:List of 1
#>   .. .. ..$ levelThree:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  3 variables:
#>   .. .. .. ..$ x: chr [1:2] "A" "B"
#>   .. .. .. ..$ y: chr [1:2] "1" "2"
#>   .. .. .. ..$ z: chr [1:2] "TRUE" "FALSE"
#>   ..$ schema   : chr "0.0.1"
#>  $ :List of 5
#>   ..$ _id      : chr "5678"
#>   ..$ createdAt: chr "2020-01-13 09:01:00"
#>   ..$ labels   : chr [1:2] "label-a" "label-b"
#>   ..$ levelOne :List of 1
#>   .. ..$ levelTwo:List of 1
#>   .. .. ..$ levelThree:Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 2 obs. of  3 variables:
#>   .. .. .. ..$ x: chr [1:2] "A" "B"
#>   .. .. .. ..$ y: chr [1:2] "1" "2"
#>   .. .. .. ..$ z: chr [1:2] "TRUE" "FALSE"
#>   ..$ schema   : chr "0.0.1"


查看更多
登录 后发表回答