phantomjs not waiting for “full” page load

2018-12-31 14:23发布

I'm using PhantomJS v1.4.1 to load some web pages. I don't have access to their server-side, I just getting links pointing to them. I'm using obsolete version of Phantom because I need to support Adobe Flash on that web pages.

The problem is many web-sites are loading their minor content async and that's why Phantom's onLoadFinished callback (analogue for onLoad in HTML) fired too early when not everything still has loaded. Can anyone suggest how can I wait for full load of a webpage to make, for example, a screenshot with all dynamic content like ads?

13条回答
浅入江南
2楼-- · 2018-12-31 14:43

In my program, I use some logic to judge if it was onload: watching it's network request, if there was no new request on past 200ms, I treat it onload.

Use this, after onLoadFinish().

function onLoadComplete(page, callback){
    var waiting = [];  // request id
    var interval = 200;  //ms time waiting new request
    var timer = setTimeout( timeout, interval);
    var max_retry = 3;  //
    var counter_retry = 0;

    function timeout(){
        if(waiting.length && counter_retry < max_retry){
            timer = setTimeout( timeout, interval);
            counter_retry++;
            return;
        }else{
            try{
                callback(null, page);
            }catch(e){}
        }
    }

    //for debug, log time cost
    var tlogger = {};

    bindEvent(page, 'request', function(req){
        waiting.push(req.id);
    });

    bindEvent(page, 'receive', function (res) {
        var cT = res.contentType;
        if(!cT){
            console.log('[contentType] ', cT, ' [url] ', res.url);
        }
        if(!cT) return remove(res.id);
        if(cT.indexOf('application') * cT.indexOf('text') != 0) return remove(res.id);

        if (res.stage === 'start') {
            console.log('!!received start: ', res.id);
            //console.log( JSON.stringify(res) );
            tlogger[res.id] = new Date();
        }else if (res.stage === 'end') {
            console.log('!!received end: ', res.id, (new Date() - tlogger[res.id]) );
            //console.log( JSON.stringify(res) );
            remove(res.id);

            clearTimeout(timer);
            timer = setTimeout(timeout, interval);
        }

    });

    bindEvent(page, 'error', function(err){
        remove(err.id);
        if(waiting.length === 0){
            counter_retry = 0;
        }
    });

    function remove(id){
        var i = waiting.indexOf( id );
        if(i < 0){
            return;
        }else{
            waiting.splice(i,1);
        }
    }

    function bindEvent(page, evt, cb){
        switch(evt){
            case 'request':
                page.onResourceRequested = cb;
                break;
            case 'receive':
                page.onResourceReceived = cb;
                break;
            case 'error':
                page.onResourceError = cb;
                break;
            case 'timeout':
                page.onResourceTimeout = cb;
                break;
        }
    }
}
查看更多
其实,你不懂
3楼-- · 2018-12-31 14:49

this is my solution its worked for me .

page.onConsoleMessage = function(msg, lineNum, sourceId) {

    if(msg=='hey lets take screenshot')
    {
        window.setInterval(function(){      
            try
            {               
                 var sta= page.evaluateJavaScript("function(){ return jQuery.active;}");                     
                 if(sta == 0)
                 {      
                    window.setTimeout(function(){
                        page.render('test.png');
                        clearInterval();
                        phantom.exit();
                    },1000);
                 }
            }
            catch(error)
            {
                console.log(error);
                phantom.exit(1);
            }
       },1000);
    }       
};


page.open(address, function (status) {      
    if (status !== "success") {
        console.log('Unable to load url');
        phantom.exit();
    } else { 
       page.setContent(page.content.replace('</body>','<script>window.onload = function(){console.log(\'hey lets take screenshot\');}</script></body>'), address);
    }
});
查看更多
姐姐魅力值爆表
4楼-- · 2018-12-31 14:52

Here is a solution that waits for all resource requests to complete. Once complete it will log the page content to the console and generate a screenshot of the rendered page.

Although this solution can serve as a good starting point, I have observed it fail so it's definitely not a complete solution!

I didn't have much luck using document.readyState.

I was influenced by the waitfor.js example found on the phantomjs examples page.

var system = require('system');
var webPage = require('webpage');

var page = webPage.create();
var url = system.args[1];

page.viewportSize = {
  width: 1280,
  height: 720
};

var requestsArray = [];

page.onResourceRequested = function(requestData, networkRequest) {
  requestsArray.push(requestData.id);
};

page.onResourceReceived = function(response) {
  var index = requestsArray.indexOf(response.id);
  requestsArray.splice(index, 1);
};

page.open(url, function(status) {

  var interval = setInterval(function () {

    if (requestsArray.length === 0) {

      clearInterval(interval);
      var content = page.content;
      console.log(content);
      page.render('yourLoadedPage.png');
      phantom.exit();
    }
  }, 500);
});
查看更多
明月照影归
5楼-- · 2018-12-31 14:52

I found this approach useful in some cases:

page.onConsoleMessage(function(msg) {
  // do something e.g. page.render
});

Than if you own the page put some script inside:

<script>
  window.onload = function(){
    console.log('page loaded');
  }
</script>
查看更多
皆成旧梦
6楼-- · 2018-12-31 14:53

This is an old question, but since I was looking for full page load but for Spookyjs (that uses casperjs and phantomjs) and didn't find my solution, I made my own script for that, with the same approach as the user deemstone . What this approach does is, for a given quantity of time, if the page did not receive or started any request it will end the execution.

On casper.js file (if you installed it globally, the path would be something like /usr/local/lib/node_modules/casperjs/modules/casper.js) add the following lines:

At the top of the file with all the global vars:

var waitResponseInterval = 500
var reqResInterval = null
var reqResFinished = false
var resetTimeout = function() {}

Then inside function "createPage(casper)" just after "var page = require('webpage').create();" add the following code:

 resetTimeout = function() {
     if(reqResInterval)
         clearTimeout(reqResInterval)

     reqResInterval = setTimeout(function(){
         reqResFinished = true
         page.onLoadFinished("success")
     },waitResponseInterval)
 }
 resetTimeout()

Then inside "page.onResourceReceived = function onResourceReceived(resource) {" on the first line add:

 resetTimeout()

Do the same for "page.onResourceRequested = function onResourceRequested(requestData, request) {"

Finally, on "page.onLoadFinished = function onLoadFinished(status) {" on the first line add:

 if(!reqResFinished)
 {
      return
 }
 reqResFinished = false

And that's it, hope this one helps someone in trouble like I was. This solution is for casperjs but works directly for Spooky.

Good luck !

查看更多
零度萤火
7楼-- · 2018-12-31 14:54

You could try a combination of the waitfor and rasterize examples:

/**
 * See https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js
 * 
 * Wait until the test condition is true or a timeout occurs. Useful for waiting
 * on a server response or for a ui change (fadeIn, etc.) to occur.
 *
 * @param testFx javascript condition that evaluates to a boolean,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function.
 * @param onReady what to do when testFx condition is fulfilled,
 * it can be passed in as a string (e.g.: "1 == 1" or "$('#bar').is(':visible')" or
 * as a callback function.
 * @param timeOutMillis the max amount of time to wait. If not specified, 3 sec is used.
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000, //< Default Max Timout is 3s
        start = new Date().getTime(),
        condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()), //< defensive code
        interval = setInterval(function() {
            if ( (new Date().getTime() - start < maxtimeOutMillis) && !condition ) {
                // If not time-out yet and condition not yet fulfilled
                condition = (typeof(testFx) === "string" ? eval(testFx) : testFx()); //< defensive code
            } else {
                if(!condition) {
                    // If condition still not fulfilled (timeout but condition is 'false')
                    console.log("'waitFor()' timeout");
                    phantom.exit(1);
                } else {
                    // Condition fulfilled (timeout and/or condition is 'true')
                    console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
                    typeof(onReady) === "string" ? eval(onReady) : onReady(); //< Do what it's supposed to do once the condition is fulfilled
                    clearInterval(interval); //< Stop this interval
                }
            }
        }, 250); //< repeat check every 250ms
};

var page = require('webpage').create(), system = require('system'), address, output, size;

if (system.args.length < 3 || system.args.length > 5) {
    console.log('Usage: rasterize.js URL filename [paperwidth*paperheight|paperformat] [zoom]');
    console.log('  paper (pdf output) examples: "5in*7.5in", "10cm*20cm", "A4", "Letter"');
    phantom.exit(1);
} else {
    address = system.args[1];
    output = system.args[2];
    if (system.args.length > 3 && system.args[2].substr(-4) === ".pdf") {
        size = system.args[3].split('*');
        page.paperSize = size.length === 2 ? {
            width : size[0],
            height : size[1],
            margin : '0px'
        } : {
            format : system.args[3],
            orientation : 'portrait',
            margin : {
                left : "5mm",
                top : "8mm",
                right : "5mm",
                bottom : "9mm"
            }
        };
    }
    if (system.args.length > 4) {
        page.zoomFactor = system.args[4];
    }
    var resources = [];
    page.onResourceRequested = function(request) {
        resources[request.id] = request.stage;
    };
    page.onResourceReceived = function(response) {
        resources[response.id] = response.stage;
    };
    page.open(address, function(status) {
        if (status !== 'success') {
            console.log('Unable to load the address!');
            phantom.exit();
        } else {
            waitFor(function() {
                // Check in the page if a specific element is now visible
                for ( var i = 1; i < resources.length; ++i) {
                    if (resources[i] != 'end') {
                        return false;
                    }
                }
                return true;
            }, function() {
               page.render(output);
               phantom.exit();
            }, 10000);
        }
    });
}
查看更多
登录 后发表回答