R Scrape a list of Google + urls using purrr packa

2019-10-30 04:52发布

我工作的一个网页抓取的项目,其目的是从一组儿童医院的提取谷歌+评论。 我的方法如下:

1)定义谷歌+的URL列表导航到审核刮。 网址已经有定义医院其他变量沿着数据帧。

2)刮的评论,数星星,和后期的时间与给定的URL的所有评论。

3)保存在一个数据帧这些元件,并命名在对应于URL中的数据帧的另一个变量后的数据帧。

4)移动到下一个网址...等等,直到所有的网址为刮掉。

目前,该代码可以从一个单一的URL刮。 我试图创建一个使用功能mappurrr包。 然而,它似乎并不奏效,我做错了什么。

这里是我的尝试,对每一个步骤的目的评论

#Load the necessary libraries
devtools::install_github("ropensci/RSelenium")
library(purrr)
library(dplyr)
library(stringr)
library(rvest)
library(xml2)
library(RSelenium)
#To avoid any SSL error messages
library(httr)
set_config( config( ssl_verifypeer = 0L ) )

定义URL数据帧

#Now to define the dataframe with the urls
urls_df =data.frame(Name=c("CHKD","AIDHC")
                    ,ID=c("AAWZ12","AAWZ13")
                    ,GooglePlus_URL=c("https://www.google.co.uk/search?ei=fJUKW9DcJuqSgAbPsZ3gDQ&q=Childrens+Hospital+of+the+Kings+Daughter+&oq=Childrens+Hospital+of+the+Kings+Daughter+&gs_l=psy-ab.3..0i13k1j0i22i10i30k1j0i22i30k1l7.8445.8445.0.9118.1.1.0.0.0.0.144.144.0j1.1.0....0...1c.1.64.psy-ab..0.1.143....0.qDMr7IDA-uA#lrd=0x89ba9869b87f1a69:0x384861b1e3a4efd3,1,,,",
                                      "https://www.google.co.uk/search?q=Alfred+I+DuPont+Hospital+for+Children&oq=Alfred+I+DuPont+Hospital+for+Children&aqs=chrome..69i57.341j0j8&sourceid=chrome&ie=UTF-8#lrd=0x89c6fce9425c92bd:0x80e502f2175fb19c,1,,,"
                                      ))

创建功能

extract_google_review=function(googleplus_urls) {

  #Opens a Chrome session
  rmDr=rsDriver(browser = "chrome",check = F)
  myclient= rmDr$client

  #Creates a sub-dataframe for the filtered hospital, which I will later use to name the dataframe
  urls_df_sub=urls_df %>% filter(GooglePlus_URL %in% googleplus_urls)

  #Navigate to the url
  myclient$navigate(googleplus_urls)

  #click on the snippet to switch focus----------
  webEle <- myclient$findElement(using = "css",value = ".review-snippet")
  webEle$clickElement()
  # Save page source
  pagesource= myclient$getPageSource()[[1]]

  #simulate scroll down for several times-------------
  count=read_html(pagesource) %>%
    html_nodes(".p13zmc") %>%
    html_text()

  #Stores the number of reviews for the url, so we know how many times to scroll down
  scroll_down_times=count %>%
    str_sub(1,nchar(count)-5) %>%
    as.numeric()

  for(i in 1 :scroll_down_times){
    webEle$sendKeysToActiveElement(sendKeys = list(key="page_down"))
    #the content needs time to load,wait 1.2 second every 5 scroll downs
    if(i%%5==0){
      Sys.sleep(1.2)
    }
  }

  #loop and simulate clicking on all "click on more" elements-------------
  webEles <- myclient$findElements(using = "css",value = ".review-more-link")
  for(webEle in webEles){
    tryCatch(webEle$clickElement(),error=function(e){print(e)})
  }

  pagesource= myclient$getPageSource()[[1]]
  #this should get the full review, including translation and original text
    reviews=read_html(pagesource) %>%
    html_nodes(".review-full-text") %>%
    html_text()

  #number of stars
  stars <- read_html(pagesource) %>%
    html_node(".review-dialog-list") %>%
    html_nodes("g-review-stars > span") %>%
    html_attr("aria-label")

  #time posted
  post_time <- read_html(pagesource) %>%
    html_node(".review-dialog-list") %>%
    html_nodes(".dehysf") %>%
    html_text()

  #Consolidating everything into a dataframe
  reviews=head(reviews,min(length(reviews),length(stars),length(post_time)))
  stars=head(stars,min(length(reviews),length(stars),length(post_time))) 
  post_time=head(post_time,min(length(reviews),length(stars),length(post_time)))
  reviews_df=data.frame(review=reviews,rating=stars,time=post_time)

  #Assign the dataframe a name based on the value in column 'Name' of the dataframe urls_df, defined above
  df_name <- tolower(urls_df_sub$Name)

  if(exists(df_name)) {
    assign(df_name, unique(rbind(get(df_name), reviews_df)))
  } else {
    assign(df_name, reviews_df)
  }


} #End function

饲养的网址进入功能

#Now that the function is defined, it is time to create a vector of urls and feed this vector into the function
googleplus_urls=urls_df$GooglePlus_URL
googleplus_urls %>% map(extract_google_review)

似乎有在功能,这是防止它刮削和存储数据到单独dataframes像预期的错误。

我想要的输出

2个dataframes,每3列

这是如何可以改善任何指针将不胜感激。

文章来源: R Scrape a list of Google + urls using purrr package