I am using PhantomJS to scrape some websites and then extract information from them with R. I am following this tutorial. Everything works fine for a single page, but I couldn't find any simple tutorial on how to automate this for multiple pages.
My experiments so far:
// Attempt to open one page per country with PhantomJS and save each page's
// HTML to "<country>.html".
var countries = [ "Albania" ,"Afghanistan"];
var len = countries.length;
// Suffix appended to the country name to build the output file name.
var name1 = ".html";
// Base URL; presumably the country name is meant to follow "country=".
var add1 = "http://www.kluwerarbitration.com/CommonUI/BITs.aspx?country=";
var country ="";
var name ="";
var add="";
// NOTE(review): arrays are 0-indexed, so starting at 1 skips countries[0],
// and the last iteration (i === len) reads countries[len], which is undefined.
for (i=1; i <= len; i++){
country = countries[i]
name = country.concat(name1)
// NOTE(review): this appends name1 (".html") to the base URL, not `country`.
add = add1.concat(name1)
var webPage = require('webpage');
var page = webPage.create();
var fs = require('fs');
var path = name
// The callback runs only after the page has loaded, i.e. after the loop has
// moved on. `page` and `path` are `var`s, so they are not scoped per iteration.
page.open(add, function (status) {
var content = page.content;
fs.write(path,content,'w')
// phantom.exit() is called from every callback, so the first callback to run
// ends the whole script.
phantom.exit();
});
}
I don't seem to get any error when running the code, but the script creates an HTML file only for the second country, and that file contains all the information on the page except for the small table I am interested in.
I tried to gather some information from similar questions. However, also because I couldn't find a simple reproducible example, I don't understand what I am doing wrong.
The main problem seems to be that you're exiting too early. You're creating multiple page
instances in a loop. Since PhantomJS is asynchronous, the call to page.open()
immediately returns and the next for loop iteration is executed.
A for-loop is pretty fast, but web requests are slow. This means that your loop is fully executed before even the first page is loaded. This also means that the first page that is loaded will also exit PhantomJS, because you're calling phantom.exit()
in each of those page.open()
callbacks. I suspect the second URL is faster for some reason and is therefore always written.
// Completion tracking: phantom.exit() may only run after EVERY page.open()
// callback has fired, because the page loads finish long after the loop
// below has ended.
var countFinished = 0,
    maxFinished = len;

/**
 * Records that one page has been handled and exits PhantomJS once all
 * `maxFinished` pages have reported in.
 */
function checkFinish() {
    countFinished++;
    // countFinished has already been incremented, so compare it directly;
    // "countFinished + 1" would exit while one page is still loading.
    if (countFinished === maxFinished) {
        phantom.exit();
    }
}

// Load the modules once instead of on every loop iteration.
var webPage = require('webpage');
var fs = require('fs');

/**
 * Opens the page of one country and writes its HTML to "<country>.html".
 * Doing this inside a function call gives every callback its own `page`
 * and `path`; `var`s declared directly in a for loop would be shared by
 * all callbacks, which would then all see the values of the last iteration.
 *
 * @param {string} country - value for the `country` query parameter; also
 *     used as the base name of the output file.
 */
function savePage(country) {
    var page = webPage.create();
    var path = country.concat(name1);
    // Encode the name so countries containing spaces or other reserved
    // characters still produce a valid query string.
    var add = add1.concat(encodeURIComponent(country));
    page.open(add, function (status) {
        if (status === 'success') {
            fs.write(path, page.content, 'w');
        } else {
            console.log('Failed to load ' + add);
        }
        // Count failed loads too, otherwise the script would never exit.
        checkFinish();
    });
}

// With nothing to load no callback would ever call checkFinish().
if (len === 0) {
    phantom.exit();
}

// Arrays are 0-indexed: the valid indices are 0 .. len - 1.
for (var i = 0; i < len; i++) {
    savePage(countries[i]);
}
Another problem is that you're creating a lot of page
instances without cleaning up. You should close them when you're done with them:
// Save every country's page, closing each page instance as soon as its HTML
// has been written so that finished pages do not accumulate.
var webPage = require('webpage');
var fs = require('fs');

// Number of pages whose callback has not run yet. PhantomJS may only exit
// once this reaches zero, because the loads finish after the loop has ended.
var pending = len;

// With nothing to load no callback would ever trigger the exit.
if (pending === 0) {
    phantom.exit();
}

// Arrays are 0-indexed: the valid indices are 0 .. len - 1.
for (var i = 0; i < len; i++) {
    // The IIFE gives every iteration its own `page`, `path` and `add`, so
    // each asynchronous callback works on the page of its own iteration.
    (function (country) {
        var page = webPage.create();
        var path = country.concat(name1);
        // Encode the name so countries containing spaces or other reserved
        // characters still produce a valid query string.
        var add = add1.concat(encodeURIComponent(country));
        page.open(add, function (status) {
            if (status === 'success') {
                fs.write(path, page.content, 'w');
            } else {
                console.log('Failed to load ' + add);
            }
            page.close();
            // Count failed loads too, otherwise the script would never exit.
            pending--;
            if (pending === 0) {
                phantom.exit();
            }
        });
    })(countries[i]);
}
Since JavaScript has function-level scope, you would need to use an IIFE to retain a reference to the correct page
instance in the page.open()
callback. See this question for more information about that: Q: JavaScript closure inside loops – simple practical example
If you don't want to clean up afterwards, then you should use the same page
instance over all of those URLs. I already have an answer about doing that here: A: Looping over urls to do the same thing
Given my very limited knowledge of JavaScript, I thought of a workaround for the problem. I am still interested in solving the problem properly, but I foresee that it will take quite some time.
For the moment I got what I wanted by doing some experimental stuff in R. Instead of running the loop within JavaScript, I used R to write one separate JS script per page, so that the "PhantomJS is asynchronous" problem is bypassed.
The trick consists in exporting the chunk of JS code using write.table with the parameter quote=F, and using .js as the file extension, so that it is correctly recognized as a JS file. I guess this workaround has limited applicability to other similar tasks, but it might nonetheless help someone. Comments are much appreciated.
# Generate one stand-alone PhantomJS script per country ("<country>.js").
# Each generated script opens that country's page, writes the page HTML to
# "<country>.html" and then exits PhantomJS.
countries <- c("Afghanistan", "Albania", "Algeria")
for (country in unique(countries)) {
  # The two script lines that depend on the country.
  path_line <- paste0("var path = '", country, ".html'")
  open_line <- paste0("page.open('http://www.kluwerarbitration.com/CommonUI/BITs.aspx?country=", country, "', function (status) {")

  # One script line per row of a single-column data frame.
  script <- data.frame(
    lines = c(
      "var webPage = require('webpage');",
      "var page = webPage.create();",
      "var fs = require('fs');",
      "",
      path_line,
      "",
      open_line,
      " var content = page.content;",
      " fs.write(path,content,'w')",
      " phantom.exit();",
      "});"
    ),
    stringsAsFactors = FALSE
  )

  # Without quoting, row names or a header, the file contains exactly the
  # script lines above, one per line.
  write.table(script, paste0(country, ".js"), sep = " ", quote = FALSE,
              row.names = FALSE, col.names = FALSE)
}
library(rvest)
library(stringr)
library(plyr)
library(dplyr)
library(ggvis)
library(knitr)
options(digits = 4)
# Run every generated PhantomJS script ("<country>.js"), one after the other.
# system() waits for each PhantomJS process to finish before the next starts.
for (country in unique(countries)) {
  # shQuote() keeps script names that contain spaces (e.g. "Costa Rica.js")
  # together as a single shell argument.
  command <- paste("./phantomjs", shQuote(paste0(country, ".js")))
  exit_status <- system(command)
  # Report failures instead of silently moving on to the next country.
  if (exit_status != 0) {
    warning("phantomjs failed for ", country, " (exit status ", exit_status, ")")
  }
}