How to get dynamic HTML and JavaScript values from a page using PhantomJS

2019-06-09 12:26发布

问题:

How can I get the latest page data (HTML & JavaScript variables) from PhantomJS?

e.g. page.refresh() or something?

I have an interval that checks a variable (on the page) every 200ms. However, this variable and the page content never appear to change over time (even though I know they have).

So I need an efficient way to check the value of a JS variable every 200ms or so,

then once I've discovered that variable has changed value, I want to request the latest page HTML.

How can I do this?

/**
 * Plain, JSON-serialisable error record used inside a DTO.
 *
 * NOTE: this deliberately shadows the built-in Error within this script,
 * so `new Error(...)` below yields an object whose only own field is
 * `description`.
 *
 * @param {string} description Human-readable explanation of the failure.
 */
var Error = function (description) {
    var record = this;
    record.description = description;
    return record;
};

/**
 * Result envelope that is serialised to stdout for the calling process.
 *
 * @param {boolean} status  Whether the operation succeeded.
 * @param {*}       content Payload (the page HTML on success).
 * @param {*}       error   Error record describing a failure, if any.
 */
var DTO = function (status, content, error) {
    var envelope = this;
    envelope.status = status;
    envelope.content = content;
    envelope.error = error;
    return envelope;
};

/**
 * Writes the given DTO to stdout as a single JSON line and then asks
 * PhantomJS to exit.
 *
 * @param {Object} dto Value to serialise with JSON.stringify.
 */
function outputAndExit(dto) {
    var serialised = JSON.stringify(dto);
    console.log(serialised);
    phantom.exit();
}

// For any uncaught exception, just log it out for .NET to capture.
// The handler packs the message, source URL and line number into a single
// description string, wraps it in a failed DTO and exits.
// NOTE(review): PhantomJS also documents phantom.onError as the hook for
// errors raised in the controller script; confirm window.onerror fires here.
window.onerror = function (errorMsg, url, lineNumber) {
    // Separate the fields so they do not run together in the output.
    var description = 'window.onerror caught an error: ' +
        'errorMsg: ' + errorMsg +
        ', url: ' + url +
        ', lineNumber: ' + lineNumber;
    outputAndExit(new DTO(false, null, new Error(description)));
};

/**
 * Factory: builds a GetDynamicPageResult, runs its initialize() step and
 * hands the ready instance back to the caller.
 *
 * @returns {GetDynamicPageResult} The initialised instance.
 */
var GetDynamicPageResult__ = function () {
    var instance = new GetDynamicPageResult();
    instance.initialize();
    return instance;
};

/**
 * Opens a URL in a PhantomJS web page, polls the page on a timer until it
 * sets `window.isContentReadyForCrawler` to a truthy value, and then emits
 * the page HTML wrapped in a DTO. If the flag is not seen within
 * `maxWaitMs` milliseconds, a failed DTO is emitted instead.
 *
 * Call initialize() once after construction to start the whole flow.
 */
var GetDynamicPageResult = function () {
    var self = this;

    /**
     * Resets state, loads PhantomJS modules, reads the CLI arguments and
     * opens the page. Stops early when the arguments are unusable.
     */
    this.initialize = function () {

        this.error = null;
        this.isContentReadyForCrawler = false;

        this.ticker = null;
        this.tickerInterval = 150;   // polling period in ms
        this.tickerElapsed = 0;      // ms spent polling so far
        this.maxWaitMs = 7000;       // give up once tickerElapsed exceeds this

        this.url = '';

        this.loadDependencies();
        // outputAndExit() has already been called when this returns false;
        // do not go on to open a page with an undefined URL.
        if (!this.processArgs()) {
            return;
        }

        this.openPage();

    };

    /**
     * Loads the PhantomJS `system`, `webpage` and `fs` modules onto the
     * instance and creates the page object.
     */
    this.loadDependencies = function () {
        this.system = require('system');
        this.page = require('webpage').create();
        // NOTE(review): this injects into the page before any URL has been
        // opened; presumably the script is gone after navigation - confirm
        // whether it is still needed.
        this.page.injectJs('jquery-1.10.2.min');
        this.fs = require('fs');
    };

    /**
     * Reads the target URL from the command line.
     *
     * @returns {boolean} true when a URL argument was present; false when it
     *     was missing (a failed DTO has been emitted in that case).
     */
    this.processArgs = function () {
        // system.args[0] is the name of this script, so a URL is only
        // present when there are at least two entries.
        if (this.system.args.length < 2) {
            outputAndExit(new DTO(false, null, new Error('No arguments given')));
            return false;
        }
        this.url = this.system.args[1];
        return true;
    };

    /**
     * Copies the page's `window.isContentReadyForCrawler` flag onto this
     * instance as a boolean.
     */
    this.updateIsContentReadyForCrawler = function () {
        // The callback runs inside the web page, not in this script: it
        // cannot see `self` from this closure (there, `self` is the page's
        // own window). The value must be returned and assigned out here.
        var ready = self.page.evaluate(function () {
            return window.isContentReadyForCrawler;
        });
        self.isContentReadyForCrawler = !!ready;
    };

    /**
     * Opens `this.url`; on success starts the polling ticker, otherwise
     * emits a failed DTO.
     */
    this.openPage = function () {
        self.page.open(this.url, function (status) { //NB: status = 'success' || 'fail'
            if (status !== 'success') {
                outputAndExit(new DTO(false, null, new Error('page.open received a non-success status')));
                // Do not start polling a page that failed to load.
                return;
            }
            self.initTicker();
        });
    };

    /** Starts the polling timer that drives handleTick(). */
    this.initTicker = function () {
        this.ticker = setInterval(self.handleTick, self.tickerInterval);
    };

    /**
     * One polling step: refreshes the ready flag, then either finishes with
     * the current page content, finishes with a timeout error, or waits for
     * the next tick.
     */
    this.handleTick = function () {
        self.tickerElapsed += self.tickerInterval;
        self.updateIsContentReadyForCrawler();

        if (self.isContentReadyForCrawler) {
            clearInterval(self.ticker);
            self.finish(true, self.page.content, null);
            return;
        }

        if (self.tickerElapsed > self.maxWaitMs) {
            clearInterval(self.ticker);
            self.finish(false, null, new Error('Too much time elapsed'));
        }
    };

    /**
     * Emits the final DTO and exits.
     *
     * @param {boolean} status  Whether the content was obtained.
     * @param {?string} content Page HTML; falsy values become ''.
     * @param {?Object} error   Error record; falsy values become {}.
     */
    this.finish = function (status, content, error) {
        content = content || '';
        error = error || {};
        outputAndExit(new DTO(status, content, error));
    };
};

/**********************************************************************************/
/***************************** Helpers *****************************/
/**********************************************************************************/

/**
 * Factory: builds a Utility, runs its initialize() step and returns it.
 *
 * @returns {Utility} The initialised helper object.
 */
var Utility__ = function () {
    var helper = new Utility();
    helper.initialize();
    return helper;
};

/**
 * Small collection of emptiness checks.
 */
var Utility = function () {
    var self = this;

    /** No setup is required; kept so the factory can call it uniformly. */
    this.initialize = function () {
    };

    /**
     * @param {*} obj Any value.
     * @returns {boolean} true only for `undefined` and `null`.
     */
    this.isEmpty = function (obj) {
        return obj === undefined || obj === null;
    };

    /**
     * @param {*} str Value to test; non-strings are converted with String().
     * @returns {boolean} true for `undefined`, `null`, '' and strings that
     *     contain only whitespace.
     */
    this.isStringEmpty = function (str) {
        // Must go through self.isEmpty: a local variable named `isEmpty`
        // would shadow the method and make the call throw.
        if (self.isEmpty(str)) {
            return true;
        }
        // Native trim avoids depending on jQuery's `$` being loaded here.
        return String(str).trim() === '';
    };
};

// Script entry point: build and start the page poller. The factory returns
// the initialised instance itself, so it is called directly.
var getDynamicPageResult = GetDynamicPageResult__();

回答1:

I think you are almost there: you need to be using page.evaluate(), but you currently only use it to get window.isContentReadyForCrawler. You need to use page.evaluate() to grab the latest HTML too.

I'm going to shamelessly paste in code from another answer (https://stackoverflow.com/a/12044474/841830):

// Ask the page itself for its current markup: the callback is evaluated in
// the page and its return value is handed back to this script.
var html = page.evaluate(function () {
    var rootElement = document.getElementsByTagName("html")[0];
    // Prefer the full <html> element; fall back to the body's inner markup.
    if (rootElement) {
        return rootElement.outerHTML;
    }
    return document.body.innerHTML;
});