I am trying to scrape the name, city, state, email, etc. of professionals from this website, http://www.napo.net/search/newsearch.asp, using rvest, but I can't seem to get the CSS selectors using SelectorGadget, and the e-mails are protected with JavaScript.
I have checked the forums and haven't seen any issue like this.
This solution uses the seleniumPipes and RSelenium packages. You should also download PhantomJS, unzip it, and put the .exe file in your R working directory.
This method uses a headless browser (PhantomJS) which simulates user behavior. It can read JavaScript-generated values.
library(rvest)
library(RSelenium) # start a server with utility function
library(seleniumPipes)
# Start a Selenium server through RSelenium's utility function.
rD <- rsDriver(browser = "chrome", chromever = "latest", port = 4444L)
# Open a seleniumPipes browser session; the port is passed explicitly so the
# session targets the server started on the line above.
remDr <- remoteDr(browserName = "chrome", port = 4444L)
main_page_url <- "http://www.napo.net/search/newsearch.asp"
# Go to the search page.
remDr %>% go(main_page_url)
# The result list is rendered inside an iframe, so switch into it first.
remDr %>% switchToFrame(Id = "SearchResultsFrame")
# Collect the relative link of every result line that actually has an href.
relative_path <- remDr %>%
  getPageSource() %>%
  html_nodes(".lineitem a[href]") %>%
  html_attr("href")
# Turn the relative links into absolute profile URLs.
full_paths <- paste0("http://www.napo.net", relative_path)
# Retrieve the email address from the first three results.
# head() is used instead of full_paths[1:3]: when fewer than three links were
# found, [1:3] pads the vector with NA and the browser is sent to "NA".
first_paths <- head(full_paths, 3)
# Build the result in one pass instead of growing a list with c() in a loop.
email_address <- lapply(first_paths, function(profile_url) {
  remDr %>% go(profile_url)
  found <- remDr %>%
    getPageSource() %>%
    html_nodes('a[href^="mailto"]') %>%
    html_text()
  # Pause between page loads so the site is not hammered.
  Sys.sleep(3)
  found
})
# Every element is named "email", as in the printed output below.
names(email_address) <- rep("email", length(email_address))
# Display the first result.
email_address[1]
# Output observed by the author:
# $email
# [1] "marla@123organize.com"
Everything above is for page one. If you want to move on to page two:
# Reload the search page and re-enter the results iframe.
remDr %>% go(main_page_url)
remDr %>% switchToFrame(Id = "SearchResultsFrame")
# Click the pager link inside the iframe to turn to page 2.
# NOTE(review): the page source is read right after the click; if the pager
# reloads asynchronously a short wait may be needed here - confirm.
remDr %>%
  findElement(using = "css selector", value = ".DotNetPager a:nth-child(2)") %>%
  elementClick()
# Get the relative and full paths again, now for page two.
relative_path <- remDr %>%
  getPageSource() %>%
  html_nodes(".lineitem a[href]") %>%
  html_attr("href")
full_paths <- paste0("http://www.napo.net", relative_path)
# Same retrieval as for page one, limited to the first three results.
# head() avoids the NA padding that full_paths[1:3] produces when fewer than
# three links were found.
page_two_emails <- lapply(head(full_paths, 3), function(profile_url) {
  remDr %>% go(profile_url)
  found <- remDr %>%
    getPageSource() %>%
    html_nodes('a[href^="mailto"]') %>%
    html_text()
  # Pause between page loads so the site is not hammered.
  Sys.sleep(3)
  found
})
names(page_two_emails) <- rep("email", length(page_two_emails))
# Append the page-two results to the ones already collected.
email_address <- c(email_address, page_two_emails)
# Display the sixth result (the last page-two entry when both pages yielded
# three results).
email_address[6]
# Output observed by the author:
# $email
# [1] "lynette@itssimplyplaced.com"
# Display everything collected so far.
email_address
# The page-two steps can also be wrapped in a loop to scrape every page.
# -----
# Tear down: end the browser session first, then stop the Selenium server.
deleteSession(remDr)
selenium_server <- rD[["server"]]
selenium_server$stop()
I do it in two steps.
1. Get the link to the embedded search result page:
# Attach rvest (HTML parsing) and magrittr (pipe and extract()).
# library() replaces require(): a missing package now stops the script here
# with a clear error instead of returning FALSE and failing further down.
library(rvest)
library(magrittr)
yourlink <- "http://www.napo.net/search/newsearch.asp"
# Read the search page and take the src attribute of its first iframe, which
# holds the embedded search results.
linktoresult <- yourlink %>%
  read_html() %>%
  html_nodes("iframe") %>%
  extract(1) %>%
  html_attr("src")
# Value observed by the author:
# /searchserver/people.aspx?id=FE0436D0-08ED-4763-8588-09112794521D&cdbid=&canconnect=0&canmessage=0&map=True&toggle=False&hhSearchTerms=
2. Scrape from the actual search result page:
# Build the absolute URL of the embedded search result page.
pagelink <- paste0("http://www.napo.net", linktoresult)
# e.g. "http://www.napo.net/searchserver/people.aspx?id=FE0436D0-08ED-4763-8588-09112794521D&cdbid=&canconnect=0&canmessage=0&map=True&toggle=False&hhSearchTerms="
# Parse the result page, then narrow down step by step: result rows first,
# the anchors inside them next, and finally each anchor's href.
result_page <- read_html(pagelink)
result_rows <- html_nodes(result_page, "#SearchResultsGrid>.lineitem")
row_anchors <- html_nodes(result_rows, "a")
yourresult <- html_attr(row_anchors, "href")
# Values observed by the author (anchors without an href come back as NA):
# /members/?id=42241027
# NA
# /members/?id=46636113
# /members/?id=37474237
# /members/?id=39530420
# ...