I want to do web scraping of this site. I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.
So I tried to use Node.js and Phantom.js with Phantom.
This is my code:
var phantom = require('phantom');
// Holder for the functions this module exposes.
const methods = {};
// Toggled by the page's load-started / load-finished handlers.
let loadInProgress = false;
// Address of the page that the download method opens.
const url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';
/**
 * Opens `url` in a PhantomJS page, logs its status and content, tries to
 * apply the France / 2015 / Medium filters and logs the resulting page HTML.
 *
 * @param {object} req - Not used by this function.
 * @param {object} res - Not used by this function.
 * @returns {Promise<void>} Settles after the PhantomJS instance has exited.
 */
methods.download = async function(req, res) {
  const instance = await phantom.create();
  // try/finally: shut the PhantomJS instance down even when a step below
  // rejects, so a failed scrape does not leave the process running.
  try {
    const page = await instance.createPage();
    await page.on('onResourceRequested', function(requestData) {
      console.info('Requesting', requestData.url);
    });
    await page.on('onConsoleMessage', function(msg) {
      console.info(msg);
    });
    await page.on('onLoadStarted', function() {
      loadInProgress = true;
      console.log('Load started...');
    });
    await page.on('onLoadFinished', function() {
      loadInProgress = false;
      console.log('Load end');
    });
    const status = await page.open(url);
    console.log('STATUS:', status);
    const content = await page.property('content');
    console.log('CONTENT:', content);
    // submit
    // NOTE(review): the statements inside this callback are kept as written
    // because they are what the surrounding discussion is about: assigning
    // `value` does not tick a checkbox, and the "Apply Filters" element is an
    // <a>, which has no submit() method.
    await page.evaluate(function() {
      document.getElementById('crID%3a250').value = 'crID%3a250'; // France
      document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
      document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
      document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
    });
    // Serialize the whole document as it stands after the attempt above.
    const result = await page.evaluate(function() {
      return document.querySelectorAll('html')[0].outerHTML;
    });
    console.log('RESULT:', result);
  } finally {
    await instance.exit();
  }
};
// Export the methods object so that other modules can call methods.download.
module.exports = methods;
(How can I select more countries and more years?)
I tried to select France as the Country or Area, 2015 as the Year and Medium as the Variant.
So crID%3a250 is the id of the element:
<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />
timeID%3a79 is the id of the element:
<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />
varID%3a2 is the id of the element:
<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />
And then ctl00_main_filters_anchorApplyBottom is the id of the button element:
<div class="All">
<img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" /> <a href="javascript:;" id="ctl00_main_filters_anchorApplyBottom" title="Apply filters" onclick="ApplyFilters(SendFilterRequest);">Apply Filters</a>
</div>
But what I got is the web page itself (in HTML), not the data that interests me. So it's as if I had not selected any parameters. Why?
EDIT 1
After the advice of @Vaviloff I tried to change the code but without success. My server-side language is Node.js.
Using Phantom, I modified the code like this:
/**
 * Opens `url` in a PhantomJS page, ticks the country, year and variant
 * checkboxes, clicks "Apply Filters", waits 1.5 seconds, then logs the first
 * results table and hands its HTML to elaborateResult.
 *
 * @param {object} req - Not used by this function.
 * @param {object} res - Not used by this function.
 * @returns {Promise<void>} Settles after the PhantomJS instance has exited.
 */
methods.download = async function(req, res) {
  const instance = await phantom.create();
  // try/finally: shut the PhantomJS instance down even when a step below
  // rejects (for example when no results table is found).
  try {
    const page = await instance.createPage();
    await page.on('onResourceRequested', function(requestData) {
      console.log('Requesting', requestData.url);
    });
    await page.on('onConsoleMessage', function(msg) {
      console.log(msg);
    });
    const status = await page.open(url);
    console.log('\n\nSTATUS:', status);
    // submit
    // The callback runs inside the PhantomJS page, so it is kept to ES5
    // syntax (assumes the page engine does not support newer syntax).
    await page.evaluate(function() {
      var countries = {
        'Albania': 'crID%3a8',
        'Austria': 'crID%3a40',
        'Belgium': 'crID%3a56',
        'Bulgaria': 'crID%3a100',
        'Croatia': 'crID%3a191',
        'Cyprus': 'crID%3a196',
        'Denmark': 'crID%3a208',
        'Estonia': 'crID%3a233',
        'Finland': 'crID%3a246',
        'France': 'crID%3a250',
        'Germany': 'crID%3a276',
        'Greece': 'crID%3a300',
        'Iceland': 'crID%3a352',
        'Ireland': 'crID%3a372',
        'Italy': 'crID%3a380',
        'Latvia': 'crID%3a428',
        'Netherlands': 'crID%3a528',
        'Norway': 'crID%3a578',
        'Poland': 'crID%3a616',
        'Portugal': 'crID%3a620',
        'Romania': 'crID%3a642',
        'Slovakia': 'crID%3a703',
        'Slovenia': 'crID%3a705',
        'Spain': 'crID%3a724',
        'Sweden': 'crID%3a752',
        'Switzerland': 'crID%3a756',
        'United Kingdom': 'crID%3a826'
      };
      // 2018 - 1980
      var years = ['timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78', 'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72', 'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66', 'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60', 'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54', 'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48', 'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42', 'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37'];
      // Ticks the checkbox with the given element id.
      function check(id) {
        document.getElementById(id).setAttribute('checked', true);
      }
      // select countries (own keys only)
      Object.keys(countries).forEach(function(name) {
        check(countries[name]);
      });
      // select years: forEach visits array elements only, whereas for...in
      // would also visit any enumerable property added to Array.prototype.
      years.forEach(function(id) {
        check(id);
      });
      // select variants
      check('varID%3a2'); // medium
      // click button
      document.getElementById('ctl00_main_filters_anchorApplyBottom').click();
    });
    console.log('\nWaiting 1.5 seconds...');
    await timeout(1500);
    // get only the table contents
    const result = await page.evaluate(function() {
      return document.querySelectorAll('.DataContainer table')[0].outerHTML;
    });
    console.log('\n\nRESULT:', result);
    elaborateResult(result);
  } finally {
    await instance.exit();
  }
};
/**
 * Loads the scraped table HTML into a detached <html> element and logs the
 * <td> elements found in it.
 *
 * Relies on a global DOM `document`. Where none exists (plain Node.js), the
 * first statement fails with "ReferenceError: document is not defined".
 *
 * @param {string} res - HTML markup of the results table.
 * @returns {void}
 */
function elaborateResult(res) {
  var el = document.createElement('html'); // ** ERROR HERE ** when no global `document` exists
  // Read the `res` parameter; the body previously referenced an undeclared
  // `result` variable.
  el.innerHTML = res;
  console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
  //var obj = utilFunc.createJsonObjectPop(year, country, population);
  //console.log(obj);
}
There are two problems:
1. `result` contains only the values that are on the first page of the results, but with the selections made you get 22 pages of results, and I don't understand how I can get all the values that interest me and combine them in the variable `result`.
2. Assuming the problem in point (1) is solved, I should now process the results obtained and create an object like this:
var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]
This is what the elaborateResult(res) function should do (of course, the function is not complete, I have to finish it, but I get an error at the first line), but I get the error:
ReferenceError: document is not defined
So I changed my strategy and tried not to use Phantom but a normal request:
// Request settings: the handler URL to fetch, plus a transform that passes
// the response body to cheerio.load and returns what it produces.
const options = {
  uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
  transform(body) {
    const loaded = cheerio.load(body);
    return loaded;
  },
};
/**
 * Performs the request described by `options` and logs the transformed
 * response, or logs the error's stack trace if the request fails.
 *
 * @param {object} req - Not used by this function.
 * @param {object} res - Not used by this function.
 * @returns {Promise<void>} Settles once the request has been handled.
 */
methods.download = async function(req, res) {
  try {
    // Awaiting ties the returned promise to the request instead of leaving
    // the request promise floating.
    const $ = await request(options);
    console.log('\n\nTHEN: ', $);
  } catch (err) {
    // `stack` is a string property, not a method; calling it would throw
    // inside the error handler.
    console.log('Error', err.stack);
  }
};
If I run this code I get:
THEN: function (selector, context, r, opts) {
if (!(this instanceof initialize)) {
return new initialize(selector, context, r, opts);
}
opts = _.defaults(opts || {}, options);
return Cheerio.call(this, selector, context, r || root, opts);
}
In this case I have other problems.
- I don't know how to build the URL. In the example above I chose Albania (crID%3a8) and Austria (crID%3a40) and 2015 as the year (timeID%3a79). Yet if I go to the link just built, I get as a result the data on Albania from 2100 to 2095.
- I don't know how to select the years, how to select variants, or how to change pages.
I feel a bit stupid but I can't get what I want... I'm stuck. Help would be very welcome!
There are several issues with your script that prevent a successful scrape.
- To check a checkbox, you don't set its value again (it's already set in the HTML!); you set its `checked` attribute to true.
- The button that submits the form is a hyperlink `<a>`, which doesn't have a `submit` method; it should be clicked (it even has an `onclick` function in the code).
- **The search request** is sent through AJAX and takes time to complete, so your script should wait for at least a second before trying to fetch the data. I'll show how to wait in the full working code below.
Next, you may get only the table data; there is no need to sift through all the HTML:
Here's a slightly trimmed-down version of your script with the issues corrected:
The last but not the least observation is that you could simply try to replay an ajax request made by the form and find out that the URL of search request works quite well on its own, when just opened in another tab:
You don't even need a headless browser to get it, just cURL/requests and some processing. This happens with sites a lot, so it's useful to check the network tab in your browser devtools before scraping.
Update
And if there are so many results that they are scattered over several pages, there is one more parameter to be used in the request: `Page`: