Scrape information with form submit using Phantom

2019-05-31 17:33发布

问题:

I want to do web scraping of this site. I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.

So I tried to use Node.js and Phantom.js with Phantom.

This is my code:

var phantom = require('phantom');

// Container for the handler functions; it is what this module exports
// (`module.exports = methods` at the end of the listing).
var methods = {};
// Set to true by the onLoadStarted callback and back to false by onLoadFinished.
// NOTE(review): it is only written, never read, anywhere in this listing.
var loadInProgress = false;
// Page opened by the scraper (query string: q=population, d=PopDiv, f=variableID:12).
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';

/**
 * Opens `url` in a PhantomJS page, logs the page content, assigns values to
 * three filter inputs, calls submit() on the "apply filters" element and then
 * logs the outer HTML of the document.
 *
 * @param req - not used by this function
 * @param res - not used by this function
 * @returns {Promise<undefined>} settles after the PhantomJS instance has exited
 */
methods.download = async function(req, res) {
    const instance = await phantom.create();
    const page = await instance.createPage();

    // Diagnostic hooks: log every requested resource and every console message
    // emitted by the page.
    await page.on('onResourceRequested', function(requestData) {
        console.info('Requesting', requestData.url);
    });
    await page.on('onConsoleMessage', function(msg) {
        console.info(msg);
    });
    // Track page loads in the module-level `loadInProgress` flag.
    await page.on('onLoadStarted', function() {
        loadInProgress = true;
        console.log('Load started...');
    });
    await page.on('onLoadFinished', function() {
        loadInProgress = false;
        console.log('Load end');
    });

    const status = await page.open(url);
    console.log('STATUS:', status);

    // Dump the page source as loaded, before any filter is touched.
    const content = await page.property('content');
    console.log('CONTENT:', content);

    // submit
    // NOTE(review): the three assignments below write each checkbox's `value`,
    // which already holds that exact string in the page markup; they do not
    // change the `checked` state, so no filter ends up selected.
    // NOTE(review): 'ctl00_main_filters_anchorApplyBottom' is the id of an <a>
    // element in the page markup; anchors have no submit() method, so the last
    // call cannot trigger the filter request.
    await page.evaluate(function() {
        document.getElementById('crID%3a250').value = 'crID%3a250'; // France
        document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
        document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
        document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
    });

    // Read back the whole document. This runs right after the evaluate() above,
    // without waiting for any request the page may have started.
    var result = await page.evaluate(function() {
        return document.querySelectorAll('html')[0].outerHTML; 
    });
    console.log('RESULT:', result);

    await instance.exit();
};

module.exports = methods;

(How can they select more countries and more years?)

I tried to select France as the Country or Area, 2015 as the Year and Medium as the Variant.

So crID%3a250 is id of element:

<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />

timeID%3a79 is id of element:

<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />

varID%3a2 is id of element:

<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />

And then ctl00_main_filters_anchorApplyBottom is id of button element:

<div class="All">
    <img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" />&nbsp;<a href="javascript:;" id="ctl00_main_filters_anchorApplyBottom" title="Apply filters" onclick="ApplyFilters(SendFilterRequest);">Apply Filters</a>
</div>

But what I got is the web page itself (in HTML), not the data that interest me. So it's as if I had not selected any parameters. Why?


EDIT 1

After the advice of @Vaviloff I tried to change the code but without success. My server-side language is Node.js.

Using Phantom I modified the code like this:

/**
 * Opens `url` in a PhantomJS page, ticks the country / year / variant
 * checkboxes listed below, clicks the "apply filters" link, waits 1.5 seconds
 * and passes the HTML of the first result table to elaborateResult().
 *
 * @param req - not used by this function
 * @param res - not used by this function
 * @returns {Promise<undefined>} settles after the PhantomJS instance has exited
 */
methods.download = async function(req, res) {
    const instance = await phantom.create();
    const page = await instance.createPage();

    // Diagnostic hooks: log every requested resource and every console message
    // emitted by the page.
    await page.on('onResourceRequested', function(requestData) {
        console.log('Requesting', requestData.url);
    });
    await page.on('onConsoleMessage', function(msg) {
        console.log(msg);
    });

    const status = await page.open(url);
    console.log('\n\nSTATUS:', status);

    // submit
    // Everything inside this callback runs in the page, not in Node.js, so the
    // lookup tables are declared inside it.
    await page.evaluate(function() {
        // Country name -> id of the matching checkbox in the filter panel.
        var countries = {
            'Albania': 'crID%3a8',
            'Austria': 'crID%3a40',
            'Belgium': 'crID%3a56',
            'Bulgaria': 'crID%3a100',
            'Croatia': 'crID%3a191',
            'Cyprus': 'crID%3a196',
            'Denmark': 'crID%3a208',
            'Estonia': 'crID%3a233',
            'Finland': 'crID%3a246',
            'France': 'crID%3a250',
            'Germany': 'crID%3a276',
            'Greece': 'crID%3a300',
            'Iceland': 'crID%3a352',
            'Ireland': 'crID%3a372',
            'Italy': 'crID%3a380',
            'Latvia': 'crID%3a428',
            'Netherlands': 'crID%3a528',
            'Norway': 'crID%3a578',
            'Poland': 'crID%3a616',
            'Portugal': 'crID%3a620',
            'Romania': 'crID%3a642',
            'Slovakia': 'crID%3a703',
            'Slovenia': 'crID%3a705',
            'Spain': 'crID%3a724',
            'Sweden': 'crID%3a752',
            'Switzerland': 'crID%3a756',
            'United Kingdom': 'crID%3a826'
        };
        // 2018 - 1980
        var years = ['timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78', 'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72', 'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66', 'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60', 'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54', 'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48', 'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42', 'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37']; 

        // select countries
        for(var c in countries) {
            document.getElementById(countries[c]).setAttribute('checked', true);
        }
        // select years
        // NOTE(review): for...in over an array yields its index keys as
        // strings; years[y] still resolves to the id, but a plain index loop
        // is the safer form for arrays.
        for(var y in years) {
            document.getElementById(years[y]).setAttribute('checked', true);
        }
        // select variants
        document.getElementById('varID%3a2').setAttribute('checked', true); // medium
        // click button
        document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); 
    });

    // NOTE(review): timeout() is not declared anywhere in this listing;
    // presumably it is the promise-based sleep helper shown in the answer —
    // confirm it is in scope.
    console.log('\nWaiting 1.5 seconds...');    
   await timeout(1500);

   // get only the table contents
    // Only the first matching table of the currently displayed page is read;
    // further result pages are not visited. If nothing matches, `[0]` is
    // undefined and the property access fails inside the page.
    var result = await page.evaluate(function() {
        return document.querySelectorAll('.DataContainer table')[0].outerHTML; 
    });
    console.log('\n\nRESULT:', result);

    // Runs on the Node.js side, outside the page context.
    elaborateResult(result);

    await instance.exit();
};

/**
 * Unfinished helper meant to turn the scraped table HTML into data objects.
 * As written it builds a detached element from the HTML and logs its <td>
 * elements.
 *
 * @param res - presumably the table HTML string passed by the caller (TODO confirm)
 */
function elaborateResult(res) {
    // NOTE(review): `document` is a browser global. This function is called
    // from the Node.js side (not inside page.evaluate), where no `document`
    // exists, hence the ReferenceError on the next line.
    var el = document.createElement('html'); // ** ERROR HERE **
    // NOTE(review): `result` is not declared in this function; the parameter
    // is named `res`.
    el.innerHTML = result;
    console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
    //var obj = utilFunc.createJsonObjectPop(year, country, population);
    //console.log(obj);
}

There are two errors:

  1. result contains only the values that are on the first page of the results, but with the selections made you get 22 pages of results and I don't understand how I can get all the values that interest me and link them in the variable result.
  2. Assuming the problem in point (1) is solved, I then need to process the results obtained and build an array of objects like this:

var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]

This is what the elaborateResult(res) function should do (of course, the function is not complete, I have to finish it but I get an error at the first line), but I get the error:

ReferenceError: document is not defined

So I changed my strategy and I tried not to use Phantom but a normal request:

// Request settings: the data-handler URL to fetch and a transform that turns
// the response body into a cheerio document.
// NOTE(review): in the query string the raw `&` right after `crID%3a8` ends the
// `Applied` parameter, so `crID%3a40;timeID%3a79` is parsed as a separate
// parameter name rather than as part of the filter — confirm which separator
// the handler expects between filter ids.
var options = {
    uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
    // Receives the raw response body and returns the value of cheerio.load(body).
    transform: function(body) {
        return cheerio.load(body);
    }
};

/**
 * Performs the request described by `options` and logs whatever it resolves
 * with.
 *
 * Failures are logged instead of rethrown, so the returned promise always
 * resolves.
 *
 * @param req - not used by this function
 * @param res - not used by this function
 * @returns {Promise<undefined>} resolves once the request has finished (or
 *   failed) and the outcome has been logged
 */
methods.download = async function(req, res) {
    try {
        // Awaiting the request ties this function's promise to its completion,
        // so callers can tell when the download is done.
        const $ = await request(options);
        console.log('\n\nTHEN: ', $);
    } catch (err) {
        // `stack` is a string property of Error objects, not a method; calling
        // it would throw from inside the error handler. Fall back to the raw
        // value for rejections that are not Error instances.
        console.log('Error', err && err.stack ? err.stack : err);
    }
};

If I run this code I get:

THEN:  function (selector, context, r, opts) {
    if (!(this instanceof initialize)) {
      return new initialize(selector, context, r, opts);
    }
    opts = _.defaults(opts || {}, options);
    return Cheerio.call(this, selector, context, r || root, opts);
  }

In this case I have other problems.

  1. I don't know how to build the URL. In the example above I chose Albania (crID%3a8) and Austria (crID%3a40) and 2015 as the year (timeID%3a79). Yet if I go to the link just built, I get as a result the data on Albania from 2100 to 2095.
  2. I don't know how to select the years or how to select variants or how to change pages.

I feel a bit stupid but I can't get what I want... I'm stuck. Help would be very welcome!

回答1:

There are several issues with your script that prevent successful scrape.

To check a checkbox, you don't set its value again (it's already set in HTML!), you set its checked attribute to true:

document.getElementById('crID%3a250').setAttribute("checked", true); // France

The button that submits the form is a hyperlink <a>, which doesn't have a submit method; it should be clicked instead (it even has an onclick handler in the markup):

 document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // submit the form

**The search request** is sent through ajax and takes time to complete, so your script should wait for at least a second before trying to fetch the data. I'll show how to wait in the full working code below.

Next, you can get only the table data; there is no need to sift through all the HTML:

// Runs inside the page and returns the outer HTML of the first <table> found
// under an element with class "DataContainer".
// NOTE(review): if nothing matches, `[0]` is undefined and the property access
// fails inside the page.
var result = await page.evaluate(function() {
    return document.querySelectorAll('.DataContainer table')[0].outerHTML; 
});

Here's a slightly trimmed-down version of your script with the issues corrected:

var phantom = require('phantom');

// Page the script opens before applying the filters.
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';

/**
 * Sleep helper for async code.
 *
 * @param {number} ms - how long to wait, in milliseconds
 * @returns {Promise<undefined>} promise that resolves once the delay has elapsed
 */
function timeout(ms) {
    return new Promise(function(done) {
        setTimeout(done, ms);
    });
}

/**
 * Scrapes the filtered population table from UN Data with PhantomJS.
 *
 * Steps: open `url`, tick the France / 2015 / Medium checkboxes, click
 * "Apply Filters", wait for the ajax response and print the HTML of the
 * result table.
 *
 * The PhantomJS instance is shut down in a `finally` block so that a failing
 * step cannot leave the child process running, and any failure is reported
 * through the trailing `.catch()` instead of becoming an unhandled rejection.
 */
(async function() {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();

        // Diagnostic hooks: log every requested resource and every console
        // message emitted by the page.
        await page.on('onResourceRequested', function(requestData) {
            console.info('Requesting', requestData.url);
        });
        await page.on('onConsoleMessage', function(msg) {
            console.info(msg);
        });

        const status = await page.open(url);
        console.log('STATUS:', status);
        if (status !== 'success') {
            throw new Error('Could not open ' + url + ' (status: ' + status + ')');
        }

        // submit
        // The callback is executed inside the page by PhantomJS, so it is kept
        // to plain ES5 and cannot see variables from this Node.js scope.
        await page.evaluate(function() {
            document.getElementById('crID%3a250').setAttribute("checked", true); // France
            document.getElementById('timeID%3a79').setAttribute("checked", true); // 2015
            document.getElementById('varID%3a2').setAttribute("checked", true); // Medium
            document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // click submit button
        });

        // The filter request is sent through ajax; give it time to complete.
        console.log('Waiting 1.5 seconds..');
        await timeout(1500);

        // Get only the table contents; null signals that no table was rendered.
        const result = await page.evaluate(function() {
            var table = document.querySelector('.DataContainer table');
            return table ? table.outerHTML : null;
        });
        if (result === null) {
            throw new Error('No result table found under .DataContainer');
        }
        console.log('RESULT:', result);
    } finally {
        await instance.exit();
    }
})().catch(function(err) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
});

Last but not least, you could simply try to replay the ajax request made by the form; you will find that the URL of the search request works quite well on its own when opened in another tab:

You don't even need a headless browser to get it, just cURL/requests and some processing. This happens with a lot of sites, so it's useful to check the network tab in your browser devtools before scraping.

Update

And if there are so many results that they are scattered over several pages, there is one more parameter to be used in request: Page:

data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461