I scraped a website that requires a login and returns XML, and I have already converted the result into a list. Now I am having difficulty extracting data from the nested list because its structure is very complicated.
Here is a part of my z2 structure:
dput(z2)
structure(list(scheduleList = structure(list(
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("2"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011c", status = "-2"),
class = structure(list(name = list("013"), people = list("0"), teacher = structure(list(name = list("B")), .Names = "name", id = "D14")), .Names = c("name", "people", "teacher"), id = "602d", status = "-4"),
class = structure(list(name = list("603"), people = list("6"), teacher = structure(list(name = list("C")), .Names = "name", id = "D31")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("4"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "011", status = "-2"),
class = structure(list(name = list("015c"), people = list("51"), teacher = structure(list(name = list("D")), .Names = "name", id = "D23")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class","class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("1"), teacher = structure(list(name = list("E")), .Names = "name", id = "D15")), .Names = c("name", "people", "teacher"), id = "017", status = "-2"),
class = structure(list(name = list("019c"), people = list("22"), teacher = structure(list(name = list("F")), .Names = "name", id = "D28")), .Names = c("name", "people", "teacher"), id = "561", status = "-4"),
class = structure(list(name = list("562d"), people = list("28"), teacher = structure(list(name = list("G")), .Names = "name", id = "D21")), .Names = c("name", "people", "teacher"), id = "562", status = "-4")),
.Names = c("class", "class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-25"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("80"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("37"), teacher = structure(list(name = list("I")), .Names = "name", id = "D18")), .Names = c("name", "people", "teacher"), id = "669", status = "-4"),
class = structure(list(name = list("751d"), people = list("15"), teacher = structure(list(name = list("J")), .Names = "name", id = "D61")), .Names = c("name", "people", "teacher"), id = "751", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("29"), teacher = structure(list(name = list("K")), .Names = "name", id = "D13")), .Names = c("name", "people", "teacher"), id = "567", status = "-2"),
class = structure(list(name = list("666d"), people = list("14"), teacher = structure(list(name = list("L")), .Names = "name", id = "D16")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("015c"), people = list("21"), teacher = structure(list(name = list("M")), .Names = "name", id = "D22")), .Names = c("name", "people", "teacher"), id = "015", status = "-4"),
class = structure(list(name = list("602d"), people = list("18"), teacher = structure(list(name = list("N")), .Names = "name", id = "D10")), .Names = c("name", "people", "teacher"), id = "602", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-26"),
schedule = structure(list(
score = structure(list(
class = structure(list(name = list("011c"), people = list("33"), teacher = structure(list(name = list("O")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "011", status = "-4"),
class = structure(list(name = list("013c"), people = list("70"), teacher = structure(list(name = list("A")), .Names = "name", id = "D29")), .Names = c("name", "people", "teacher"), id = "601", status = "-2"),
class = structure(list(name = list("603d"), people = list("0"), teacher = structure(list(name = list("P")), .Names = "name", id = "D27")), .Names = c("name", "people", "teacher"), id = "603", status = "-4")),
.Names = c("class", "class", "class"), id = "1"),
score = structure(list(
class = structure(list(name = list("011c"), people = list("56"), teacher = structure(list(name = list("H")), .Names = "name", id = "D47")), .Names = c("name", "people", "teacher"), id = "602", status = "-4"),
class = structure(list(name = list("666d"), people = list("8"), teacher = structure(list(name = list("Q")), .Names = "name", id = "D20")), .Names = c("name", "people", "teacher"), id = "666", status = "-4")),
.Names = c("class", "class"), id = "2"),
score = structure(list(
class = structure(list(name = list("017c"), people = list("5"), teacher = structure(list(name = list("R")), .Names = "name", id = "D30")), .Names = c("name", "people", "teacher"), id = "017", status = "-4"),
class = structure(list(name = list("021c"), people = list("6"), teacher = structure(list(name = list("S")), .Names = "name", id = "D19")), .Names = c("name", "people", "teacher"), id = "561", status = "-4")),
.Names = c("class", "class"), id = "3")),
.Names = c("score", "score", "score"), date = "2017-01-27")),
.Names = c("schedule", "schedule", "schedule"), from = "2017-01-25", to = "2017-01-27")),
.Names = "scheduleList")
This is part of z2:
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "017C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "5"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "R"
attr(,"id")
[1] "D30"
attr(,"id")
[1] "017"
attr(,"status")
[1] "-4"
$scheduleList$schedule$score$class
$scheduleList$schedule$score$class$name
$scheduleList$schedule$score$class$name[[1]]
[1] "021C"
$scheduleList$schedule$score$class$people
$scheduleList$schedule$score$class$people[[1]]
[1] "6"
$scheduleList$schedule$score$class$teacher
$scheduleList$schedule$score$class$teacher$name
$scheduleList$schedule$score$class$teacher$name[[1]]
[1] "S"
attr(,"id")
[1] "D19"
attr(,"id")
[1] "561"
attr(,"status")
[1] "-4"
attr(,"id")
[1] "3"
attr(,"date")
[1] "2017-01-27"
attr(,"from")
[1] "2017-01-25"
attr(,"to")
[1] "2017-01-27"
I need to extract some of the information from the nested list. Since I am new to this, I tried the most inefficient way of doing so:
# Flatten the nested list (schedule -> score -> class) into a data frame with
# one row per class, stored in `result`.
#
# Fixes relative to the original loop:
#   * the innermost level read `z` instead of `z2`;
#   * `cbind=( ... )` is not valid R syntax, and no row was ever stored;
#   * `1:length(x)` misbehaves on empty lists, so lapply() is used instead.
#
# All values are kept as the character strings found in the list; convert
# afterwards (e.g. as.integer(result$People)) if numbers are needed.

# Read an attribute by exact name; NA keeps the row shape when it is absent.
get_attr <- function(x, which) {
  value <- attr(x, which, exact = TRUE)
  if (is.null(value)) NA_character_ else value
}

# unname() drops the repeated "schedule"/"score"/"class" names so that
# rbind() does not try to build row names from them.
schedule_rows <- lapply(unname(z2[[1]]), function(schedule) {
  score_rows <- lapply(unname(schedule), function(score) {
    class_rows <- lapply(unname(score), function(cls) {
      teacher <- cls[["teacher"]]
      data.frame(
        Date = get_attr(schedule, "date"),    # schedule date
        Score = get_attr(score, "id"),        # score id
        TID = get_attr(teacher, "id"),        # teacher id
        TName = teacher[["name"]][[1]],       # teacher name
        CName = cls[["name"]][[1]],           # class name
        CID = get_attr(cls, "id"),            # class id
        CSta = get_attr(cls, "status"),       # class status
        People = cls[["people"]][[1]],        # people count (as text)
        stringsAsFactors = FALSE
      )
    })
    do.call(rbind, class_rows)
  })
  do.call(rbind, score_rows)
})

result <- do.call(rbind, schedule_rows)

# `result` is NULL when the list holds no classes at all.
if (!is.null(result)) {
  rownames(result) <- NULL
}
My loops do not work. I want the output as a data frame or an array. The result I expect looks like this:
Date Score TID TName CName CID CSta People
2017-01-25 1 D14 B 013c 602 -4 0
2017-01-26 2 D16 L 666d 666 -4 14
XML format website example:
<result status="success">
<code>1</code>
<note>success</note>
<scheduleList from="2017-01-25" to="2017-01-26">
<schedule date="2017-01-25">
<score id="1">
<class id="011" status="-4">
<name>011c</name>
<people>116</people>
<teacher id="D47">
<name>A</name>
</teacher>
</class>
<class id="669" status="-4">
<name>669d</name>
<people>10</people>
<teacher id="D29">
<name>B</name>
</teacher>
</class>
</score>
<score id="2">
<class id="013" status="-4">
<name>013c</name>
<people>9</people>
<teacher id="D9">
<name>C</name>
</teacher>
</class>
</score>
<score id="3">
<class id="016" status="-4">
<name>016c</name>
<people>36</people>
<teacher id="D18">
<name>D</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>9</people>
<teacher id="D30">
<name>E</name>
</teacher>
</class>
</score>
</schedule>
<schedule date="2017-01-26">
<score id="1">
<class id="011" status="-2">
<name>011c</name>
<people>2</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
<class id="013" status="-2">
<name>013c</name>
<people>0</people>
<teacher id="D14">
<name>G</name>
</teacher>
</class>
</score>
<score id="2">
<class id="011" status="-2">
<name>011c</name>
<people>4</people>
<teacher id="D29">
<name>F</name>
</teacher>
</class>
</score>
<score id="3">
<class id="017" status="-2">
<name>017c</name>
<people>1</people>
<teacher id="D141">
<name>H</name>
</teacher>
</class>
<class id="019" status="-4">
<name>019c</name>
<people>22</people>
<teacher id="D291">
<name>I</name>
</teacher>
</class>
<class id="020" status="-4">
<name>020c</name>
<people>8</people>
<teacher id="D143">
<name>J</name>
</teacher>
</class>
</score>
</schedule>
</scheduleList>
</result>
code:
# Submit the site's form with a date range and credentials, then parse the
# response as XML.
url <- "xxxxxxx"

# Open a session on the page and take the first form found on it.
session <- html_session(url)
form <- html_form(read_html(url))[[1]]

# Fill in the form fields.
filled_form <- set_values(
  form,
  fromDate = "2017-01-25",
  toDate = "2017-01-26",
  userid = "xxx",
  Password = "aaa"
)

# Submit within the session and read the response as an XML document.
s <- submit_form(session, filled_form)
z <- read_xml(s$response)
Using the `purrr` and `dplyr` packages from the tidyverse could help with this task. I don't know whether it is more efficient, but it could be clearer to read and understand. It is a great use case for understanding `purrr`, though.
You are not assigning the result of `cbind`. (And it is used incorrectly: don't write `cbind=something`; the equals sign is an error.)
This is a quick and possibly inefficient way of doing it.