Extract data from a nested list with loops

2019-06-13 16:46发布

I scraped a login-protected website that returns XML and have already converted the result into a list. Now I am having difficulty extracting data from the nested list because its structure is very complicated.

Here is a part of my z2 structure:

dput(z2)
structure(list(scheduleList = structure(list(
schedule = structure(list(
score = structure(list(
  class = structure(list(name = list("011c"), people = list("2"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011c", status = "-2"), 
  class = structure(list(name = list("013"), people = list("0"), teacher = structure(list(name = list("B")), .Names = "name", id = "D14")), .Names = c("name", "people", "teacher"), id = "602d", status = "-4"), 
  class = structure(list(name = list("603"), people = list("6"), teacher = structure(list(name = list("C")), .Names = "name", id = "D31")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")), 
.Names = c("class", "class", "class"), id = "1"), 
score = structure(list(
  class = structure(list(name = list("011c"), people = list("4"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011", status = "-2"), 
  class = structure(list(name = list("015c"), people = list("51"), teacher = structure(list(name = list("D")), .Names = "name", id = "D23")), .Names = c("name",  "people", "teacher"), id = "666", status = "-4")), 
.Names = c("class","class"), id = "2"), 
score = structure(list(
  class = structure(list(name = list("017c"), people = list("1"), teacher = structure(list(name = list("E")), .Names = "name", id = "D15")), .Names = c("name", "people", "teacher"), id = "017", status = "-2"), 
  class = structure(list(name = list("019c"), people = list("22"), teacher = structure(list(name = list("F")), .Names = "name", id = "D28")), .Names = c("name", "people", "teacher"), id = "561", status = "-4"), 
  class = structure(list(name = list("562d"), people = list("28"), teacher = structure(list(name = list("G")), .Names = "name", id = "D21")), .Names = c("name", "people", "teacher"), id = "562", status = "-4")), 
.Names = c("class", "class", "class"), id = "3")), 
.Names = c("score", "score", "score"), date = "2017-01-25"), 
schedule = structure(list(
score = structure(list(
  class = structure(list(name = list("011c"), people = list("80"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"), 
  class = structure(list(name = list("013c"), people = list("37"), teacher = structure(list(name = list("I")), .Names = "name", id = "D18")), .Names = c("name", "people", "teacher"), id = "669", status = "-4"), 
  class = structure(list(name = list("751d"), people = list("15"), teacher = structure(list(name = list("J")), .Names = "name", id = "D61")), .Names = c("name", "people", "teacher"), id = "751", status = "-4")), 
.Names = c("class", "class", "class"), id = "1"), 
score = structure(list(
  class = structure(list(name = list("015c"), people = list("29"), teacher = structure(list(name = list("K")), .Names = "name", id = "D13")), .Names = c("name", "people", "teacher"), id = "567", status = "-2"), 
  class = structure(list(name = list("666d"), people = list("14"), teacher = structure(list(name = list("L")), .Names = "name", id = "D16")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")), 
.Names = c("class", "class"), id = "2"), 
score = structure(list(
  class = structure(list(name = list("015c"), people = list("21"), teacher = structure(list(name = list("M")), .Names = "name", id = "D22")), .Names = c("name", "people", "teacher"), id = "015", status = "-4"),  
  class = structure(list(name = list("602d"), people = list("18"), teacher = structure(list(name = list("N")), .Names = "name", id = "D10")), .Names = c("name", "people", "teacher"), id = "602", status = "-4")), 
.Names = c("class", "class"), id = "3")), 
.Names = c("score", "score", "score"), date = "2017-01-26"), 
schedule = structure(list(
score = structure(list(
  class = structure(list(name = list("011c"), people = list("33"), teacher = structure(list(name = list("O")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"), 
  class = structure(list(name = list("013c"), people = list("70"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "601", status = "-2"), 
  class = structure(list(name = list("603d"), people = list("0"), teacher = structure(list(name = list("P")), .Names = "name", id = "D27")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")), 
.Names = c("class", "class", "class"), id = "1"), 
score = structure(list(
  class = structure(list(name = list("011c"), people = list("56"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "602", status = "-4"), 
  class = structure(list(name = list("666d"), people = list("8"), teacher = structure(list(name = list("Q")), .Names = "name", id = "D20")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")), 
.Names = c("class", "class"), id = "2"), 
score = structure(list(
  class = structure(list(name = list("017c"), people = list("5"), teacher = structure(list(name = list("R")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "017", status = "-4"), 
  class = structure(list(name = list("021c"), people = list("6"), teacher = structure(list(name = list("S")), .Names = "name", id = "D19")), .Names = c("name", "people", "teacher"), id = "561", status = "-4")), 
.Names = c("class", "class"), id = "3")), 
.Names = c("score", "score", "score"), date = "2017-01-27")), 
.Names = c("schedule", "schedule", "schedule"), from = "2017-01-25", to = "2017-01-27")), 
.Names = "scheduleList")

This is part of z2:

$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "017C"


$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "5"


$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "R"


attr(,"id")
[1] "D30"

attr(,"id")
[1] "017"
attr(,"status")
[1] "-4"

$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "021C"


$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "6"


$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "S"


attr(,"id")
[1] "D19"

attr(,"id")
[1] "561"
attr(,"status")
[1] "-4"

attr(,"id")
[1] "3"

attr(,"date")
[1] "2017-01-27"

attr(,"from")
[1] "2017-01-25"
attr(,"to")
[1] "2017-01-27"

I need to extract certain pieces of information from the nested list. Since I am new to this, I am using what is probably the most inefficient way to do so:

# Asker's original attempt (does not work as written).
# Intent: walk the nesting schedule (i) -> score (j) -> class (k) and gather
# one record per class from list elements and their attributes.
for (i in 1:length(z2[[1]])){               # length(z2[[1]]) = 7 per the asker (the dput sample shows 3)
  for (j in 1:length(z2[[1]][[i]])){        # length(z2[[1]][[i]]) = 3 per the asker
    # NOTE(review): this line indexes `z`, while every other line uses `z2` --
    # presumably a typo; confirm.
    for (k in 1:length(z[[1]][[i]][[j]])){
      # NOTE(review): `cbind=(` assigns to a name followed by a parenthesised
      # expression; it is not a call `cbind(...)`, and no value is kept
      # across iterations.
      cbind=(
      Date=attr(z2[[1]][[i]],"date"),                # "date" attribute of the schedule
      Score=attr(z2[[1]][[i]][[j]],"id"),            # "id" attribute of the score
      People=z2[[1]][[i]][[j]][[k]][[2]][[1]],       # 2nd element of the class: people
      TName=z2[[1]][[i]][[j]][[k]][[3]][[1]][[1]],   # 3rd element of the class: teacher name
      TID=attr(z2[[1]][[i]][[j]][[k]][[3]],"id"),    # "id" attribute of the teacher
      CName=z2[[1]][[i]][[j]][[k]][[1]][[1]],        # 1st element of the class: class name
      CID=attr(z2[[1]][[i]][[j]][[k]],"id"),         # "id" attribute of the class
      CSta=attr(z2[[1]][[i]][[j]][[k]],"status")    # "status" attribute of the class
      )
    }
  }
}

My loops don't work. I would like the output to be a data frame or an array. The result I expect looks like this:

Date        Score   TID   TName   CName  CID  CSta  People
2017-01-25    1     D14     B     013c   602   -4     0
2017-01-26    2     D16     L     666d   666   -4    14

XML format website example:

<result status="success">
  <code>1</code>
  <note>success</note>
  <scheduleList from="2017-01-25" to="2017-01-26">
    <schedule date="2017-01-25">
      <score id="1">
        <class id="011" status="-4">
          <name>011c</name>
          <people>116</people>
          <teacher id="D47">
            <name>A</name>
          </teacher>
        </class>
        <class id="669" status="-4">
          <name>669d</name>
          <people>10</people>
          <teacher id="D29">
            <name>B</name>
          </teacher>
        </class>
      </score>
      <score id="2">
        <class id="013" status="-4">
          <name>013c</name>
          <people>9</people>
          <teacher id="D9">
            <name>C</name>
          </teacher>
        </class>
      </score>
      <score id="3">
        <class id="016" status="-4">
          <name>016c</name>
          <people>36</people>
          <teacher id="D18">
            <name>D</name>
          </teacher>
        </class>
        <class id="019" status="-4">
          <name>019c</name>
          <people>9</people>
          <teacher id="D30">
            <name>E</name>
          </teacher>
        </class>
      </score>
    </schedule>
    <schedule date="2017-01-26">
      <score id="1">
        <class id="011" status="-2">
          <name>011c</name>
          <people>2</people>
          <teacher id="D29">
            <name>F</name>
          </teacher>
        </class>
        <class id="013" status="-2">
          <name>013c</name>
          <people>0</people>
          <teacher id="D14">
            <name>G</name>
          </teacher>
        </class>
      </score>
      <score id="2">
        <class id="011" status="-2">
          <name>011c</name>
          <people>4</people>
          <teacher id="D29">
            <name>F</name>
          </teacher>
        </class>
      </score>
      <score id="3">
        <class id="017" status="-2">
          <name>017c</name>
          <people>1</people>
          <teacher id="D141">
            <name>H</name>
          </teacher>
        </class>
        <class id="019" status="-4">
          <name>019c</name>
          <people>22</people>
          <teacher id="D291">
            <name>I</name>
          </teacher>
        </class>
        <class id="020" status="-4">
          <name>020c</name>
          <people>8</people>
          <teacher id="D143">
            <name>J</name>
          </teacher>
        </class>
      </score>
    </schedule>
  </scheduleList>
</result>

code:

# Open a session, fill in and submit the site's form, then parse the response
# as XML.
url <- "xxxxxxx"
session <- html_session(url)

# Take the first form found on the page (presumably the login form -- confirm).
form <- html_form(read_html(url))[[1]]

# Supply the date range and the credentials expected by the form fields.
filled_form <- set_values(
  form,
  "fromDate" = "2017-01-25",
  "toDate" = "2017-01-26",
  "userid" = "xxx",
  "Password" = "aaa"
)

s <- submit_form(session, filled_form)

# Parse the body of the submitted form's response into an XML document.
z <- read_xml(s$response)

2条回答
做自己的国王
2楼-- · 2019-06-13 17:35

The purrr and dplyr packages from the tidyverse can help with this task:

library(purrr)
library(dplyr)

# Flatten z2$scheduleList into one row per <class> element.
#
# Nesting: scheduleList -> schedule (attr "date") -> score (attr "id")
#          -> class (attrs "id", "status"; children name, people, teacher).
#
# Each level gets an explicitly named function argument instead of nested
# `~ .x` lambdas, so it is clear which object every attr() call reads from.
# tibble() replaces the deprecated data_frame().
z2$scheduleList %>%
  map_df(function(schedule) {
    map_df(schedule, function(score) {
      # Every map_chr() call visits all classes of this score and returns one
      # value per class, so the columns line up row by row.
      tibble(
        TID    = map_chr(score, list("teacher", attr_getter("id"))),
        TName  = map_chr(score, list("teacher", "name", 1)),
        CName  = map_chr(score, list("name", 1)),
        CID    = map_chr(score, list(attr_getter("id"))),
        Csta   = map_chr(score, list(attr_getter("status"))),
        People = map_chr(score, list("people", 1))
      ) %>%
        # The score id is an attribute of the parent, shared by all its classes.
        mutate(Score = attr(score, "id"))
    }) %>%
      # The date is an attribute of the schedule, shared by all its scores.
      mutate(Date = attr(schedule, "date"))
  }) %>%
  select(Date, Score, everything())

#> # A tibble: 22 x 8
#>          Date Score   TID TName CName   CID  Csta People
#>         <chr> <chr> <chr> <chr> <chr> <chr> <chr>  <chr>
#>  1 2017-01-25     1   D29     A  011c  011c    -2      2
#>  2 2017-01-25     1   D14     B   013  602d    -4      0
#>  3 2017-01-25     1   D31     C   603   603    -4      6
#>  4 2017-01-25     2   D29     A  011c   011    -2      4
#>  5 2017-01-25     2   D23     D  015c   666    -4     51
#>  6 2017-01-25     3   D15     E  017c   017    -2      1
#>  7 2017-01-25     3   D28     F  019c   561    -4     22
#>  8 2017-01-25     3   D21     G  562d   562    -4     28
#>  9 2017-01-26     1   D47     H  011c   011    -4     80
#> 10 2017-01-26     1   D18     I  013c   669    -4     37
#> # ... with 12 more rows

Don't know if it is more efficient but it could be clearer to read and understand.

Great use case to understand purrr though.

查看更多
我只想做你的唯一
3楼-- · 2019-06-13 17:42

You are not assigning the result of cbind. (It is also used incorrectly: don't write cbind=something. cbind is a function, so it must be called as cbind(...); the equal sign there is an error.)
This is a quick and possibly inefficient way of doing it.

# Flatten z2 into a data frame `result` with one row per <class> element.
# Nesting: scheduleList -> schedule (attr "date") -> score (attr "id")
#          -> class (attrs "id", "status"; children name, people, teacher).
schedules <- z2[[1]]

# Rows are collected in a list and bound once at the end; calling rbind() on a
# growing data frame inside the loop copies it on every iteration.
rows <- list()

# seq_along() gives an empty sequence for an empty list, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(schedules)) {
  schedule <- schedules[[i]]
  for (j in seq_along(schedule)) {
    score <- schedule[[j]]
    for (k in seq_along(score)) {
      cls <- score[[k]]
      # Children are looked up by name rather than by position, so the
      # extraction does not depend on the order of the XML child elements.
      teacher <- cls[["teacher"]]
      rows[[length(rows) + 1L]] <- data.frame(
        Date   = attr(schedule, "date"),    # schedule date
        Score  = attr(score, "id"),         # score id
        People = cls[["people"]][[1]],      # number of people
        TName  = teacher[["name"]][[1]],    # teacher name
        TID    = attr(teacher, "id"),       # teacher id
        CName  = cls[["name"]][[1]],        # class name
        CID    = attr(cls, "id"),           # class id
        CSta   = attr(cls, "status"),       # class status
        stringsAsFactors = FALSE
      )
    }
  }
}

# do.call(rbind, list()) returns NULL, so fall back to an empty data frame
# when there was nothing to extract.
result <- if (length(rows) > 0) do.call(rbind, rows) else data.frame()

head(result)
        Date Score People TName TID CName  CID CSta
1 2017-01-25     1      2     A D29  011c 011c   -2
2 2017-01-25     1      0     B D14   013 602d   -4
3 2017-01-25     1      6     C D31   603  603   -4
4 2017-01-25     2      4     A D29  011c  011   -2
5 2017-01-25     2     51     D D23  015c  666   -4
6 2017-01-25     3      1     E D15  017c  017   -2
查看更多
登录 后发表回答