setTimeout in Phantom.js

2019-01-19 12:20发布

问题:

The code below is meant to make PhantomJS load the page, click a button, and wait 5 seconds before returning the page's HTML code.

Problem: Using setTimeout() to create the 5-second delay causes page.evaluate() to pass null to the callback function instead of the HTML.

// URL of the page to load.
// NOTE(review): page.open below is passed `listingUrl`, which this snippet
// never defines -- presumably `myUrl` was intended; confirm.
myUrl = 'http://www.google.com'

// Load the `phantom` npm package through Meteor. Each bridge method is
// replaced by its Meteor.wrapAsync-wrapped version right before it is called.
var phantom = Meteor.npmRequire('phantom')
phantom.create = Meteor.wrapAsync(phantom.create)
phantom.create( function(ph) {

    ph.createPage = Meteor.wrapAsync(ph.createPage)
    ph.createPage(function(page) {

        page.open = Meteor.wrapAsync(page.open)
        page.open(listingUrl, function(status) {
            console.log('Page loaded')

            page.evaluate = Meteor.wrapAsync(page.evaluate)
            // First argument: function evaluated inside the page.
            // Second argument: callback that receives the first function's result.
            page.evaluate(function() {

                // Find the button
                var element = document.querySelector( '.search-btn' );

                // create a mouse click event
                var event = document.createEvent( 'MouseEvents' );
                event.initMouseEvent( 'click', true, true, window, 1, 0, 0 );

                // send click to element
                element.dispatchEvent( event );

                // Give page time to process Click event
                // NOTE: the `return` below only returns from the timer callback.
                // The enclosing function passed to page.evaluate finishes without
                // returning anything, so the HTML never reaches the result callback.
                setTimeout(function() {
                    // Return HTML code
                    return document.documentElement.outerHTML
                }, 5000)

            }, function(html) {

                // html is `null`
                doSomething()

            })
        })
    })
})

Replacing setTimeout() with Meteor.setTimeout() causes another error:

phantom stdout: ReferenceError: Can't find variable: Meteor

回答1:

page.evaluate() runs its function in the sandboxed page context of PhantomJS, which has no access to variables defined outside of it. If you need the timeout, you need two calls to page.evaluate(), because you cannot return anything from an asynchronous function (explanation):

// Step 1: dispatch the click inside the page (element/event setup elided as `...`).
page.evaluate(function() {
    ...
    element.dispatchEvent( event );
}, function() {
    // The delay is applied here, in the result callback, rather than inside
    // the function that is evaluated in the page.
    setTimeout(function() {
        // Step 2: after the 5 second wait, evaluate again and return the HTML
        // synchronously so it is passed on to the callback.
        page.evaluate(function() {    
            return document.documentElement.outerHTML
        }, function(html) {
            // `html` is the value returned by the evaluated function above
            doSomething()
        })
    }, 5000)
})

Instead of using the second page.evaluate() call, you may shorten the code by directly accessing the content as defined here:

// Once the 5 second pause has elapsed, read the page's "content" property
// through the bridge; the callback receives the page content.
function fetchContentAfterDelay() {
    page.get("content", function onContent(pageHtml) {
        doSomething()
    })
}

setTimeout(fetchContentAfterDelay, 5000)


回答2:

This is not a great solution but works if all you want to do is handle page changes on button clicks and form submits. Just declare the function variables outside page.open() and then assign them page evaluation functions later inside. onLoadFinished will be called after the page has reloaded with changes from the button click and then you can evaluate it again.

// PhantomJS script: open `url`, inject jQuery into the page, then run a page
// evaluation function now and again after every subsequent page load (for
// example one triggered by a button click or a form submit).
// NOTE(review): `url` is not defined in this snippet; the surrounding script
// must define it before page.open runs.

// Set to true while a load is in flight; maintained by the callbacks below.
var loadInProgress = false;
// jQuery build that is injected into the page after the initial load.
var jurl = 'http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js';
var page = require('webpage').create();

// declare variables outside page.open and assign them later inside
var evalPageFunc;

/**
 * Runs the current page evaluation function, logs its result and exits
 * PhantomJS when the result reports that there is nothing more to click.
 * Must only be called once evalPageFunc has been assigned.
 */
function processEvalResult() {
    var mydata = evalPageFunc();
    console.log(mydata);
    // The evaluation may yield no result (null/undefined); treat that as
    // "nothing more to do" instead of throwing on the property access.
    if (!mydata || !mydata.havemore) {
        phantom.exit();
        // or next url
    }
}

// assign callbacks which will be called by phantom
page.onLoadStarted = function() {
    loadInProgress = true;
    console.log('load started');
};
page.onLoadFinished = function() {
    loadInProgress = false;
    console.log('load finished');
    // evalPageFunc is still undefined during the initial load; it is only
    // assigned after jQuery has been injected.
    if (evalPageFunc) {
        // since the page has loaded we can safely evaluate it
        // NOTE(review): jQuery is injected only once, after the initial load;
        // presumably it has to be injected again after a navigation if the
        // evaluation function relies on `$` -- verify.
        processEvalResult();
    }
};

page.open(url, function(status) {
    // Do not inject scripts into, or evaluate, a page that failed to load.
    if (status !== 'success') {
        console.log('failed to load ' + url + ' (status: ' + status + ')');
        phantom.exit(1);
        return;
    }

    page.includeJs(jurl, function() {

        // define your page evaluating functions
        evalPageFunc = function() {
            return page.evaluate(function() {
                var datafromhtml = {}, havemoretoclick = true;
                // get your data and perform clicks if you want to
                // datafromhtml.somedata = $('stealme').text();
                // $("clickme").click();
                return {
                    havemore: havemoretoclick,
                    data: datafromhtml
                };
            });
        };

        // Handle the first evaluation like every later one, so the script
        // also terminates when the initial page already has nothing to click.
        processEvalResult();
    });
});

It's not pretty, but it works.