Need to open an array of URLs in PhantomJS

2019-08-09 05:33发布

问题:

I have created a script in PhantomJS. It fetches some elements from a specific page, and that part works fine.

Here is the code:

// PhantomJS-side state shared with the step runner at the bottom of the script.
var page = new WebPage();
var testindex = 0;           // index of the next entry in `steps` to execute
var loadInProgress = false;  // flipped by the load handlers below
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Forward anything the page writes to its console to PhantomJS's console.
page.onConsoleMessage = function(message) {
    console.log(message);
};

// Record when a load begins so the step runner can hold off on the next step.
page.onLoadStarted = function() {
    loadInProgress = true;
    console.log("load started");
};

// Record when a load ends so the step runner may continue.
page.onLoadFinished = function() {
    loadInProgress = false;
    console.log("load finished");
};

// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Enable/Disable Javascript
// page.settings.javascriptEnabled = false;

// Ordered list of scraping steps. The setInterval runner at the bottom of the
// script calls them one at a time, advancing only while no load is in progress.
var steps = [

    // Step 1: start loading the product listing page.
    function() { //Load Page
        page.open("http://www.example.com/mobiles/");
        // NOTE(review): this runs immediately after open() is called, without
        // waiting for the load to finish. injectJs is documented for local
        // script files; a remote URL normally needs includeJs — confirm.
        page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
    },

    // Step 2: walk the product boxes inside the page and stream each CSV
    // fragment back to PhantomJS through window.callPhantom.
    function() { //Fetch Products
        // Receives every value passed to window.callPhantom and writes it to
        // product-list.csv using mode 'w+'.
        page.onCallback = function(result) {
            var fs = require('fs');
            fs.write('product-list.csv', result, 'w+');
        };

        // Everything inside this function executes in the web page's context.
        page.evaluate(function() {
            var arr_mainList = new Array();
            var arr_innerList = new Array();

            try {
                // One iteration per <ul> found under the first "item_grid" element;
                // `i` is not declared here, so it becomes a global in the page.
                for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                    arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);

                    // Column 1: product name (text of the first link in the box's <h3>).
                    window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
                    //window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href + ", ");

                    // NOTE(review): this is the part the accompanying question reports
                    // as failing — the script stops when it reaches window.open.
                    // getElementsByClassName is a Document/Element method rather than a
                    // Window method, so the next line presumably throws regardless.
                    var myWindow = window.open(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href);
                    console.log(myWindow.getElementsByClassName("item_desc")[0].textContent);
                    myWindow.close();

                    // Logs the product link only when it is longer than 43 characters.
                    if (arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href.length > 43) {
                        var innerURL = arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                        console.log(innerURL);
                    }

                    // Next column: the "data-original" attribute of the box's first <img>.
                    window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                    arr_innerList.push(arr_mainList[i]); 

                    // Emit each <li> with commas stripped: " | " between items and
                    // ", " after the last one so the list stays in a single CSV column.
                    for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {                 
                        if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
                        }
                        else {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
                        }
                    };
                    //window.callPhantom(", ");
                    // Last column: text of the "cat_price" element with commas stripped,
                    // followed by the row terminator.
                    window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
                    window.callPhantom("\n");
                };

                // NOTE(review): this assignment happens inside the page context, so it
                // presumably does not reach the PhantomJS-side loadInProgress — confirm.
                loadInProgress = true;
                console.log("Successful.");
            }
            catch(ex) {
                // Any DOM lookup that fails above ends the whole loop and lands here.
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step runner: every 5 seconds, run the next entry of `steps` unless a page
// load is still in progress. Once no step is left, stop polling and exit
// PhantomJS shortly afterwards.
var interval = setInterval(function() {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    if (typeof steps[testindex] !== "function") {
        // Stop the timer so the exit is scheduled exactly once.
        clearInterval(interval);

        // Short delay before exiting, as in the original script.
        setTimeout(function() {
            // Uncomment to dump the final page markup for debugging:
            //fs.write('product-list.html', page.content, 'w');
            console.log("test complete!");
            phantom.exit();
        }, 100);
    }
}, 5000);

Now if I run the program, I get all the information in the CSV file. However, when execution reaches window.open, PhantomJS stops. I know I can't open a new page inside page.evaluate, but I need to fetch the product description and add it to the CSV file in place of the product link. I have been searching for hours now; any help would be appreciated. Note: my one constraint is that I have to use PhantomJS.

回答1:

I have modified your script a little bit, so now you can do whatever you want. Just keep in mind not to give it too many items to scrape, or you will run into memory issues. So if the website uses pagination, use a new function for it. In this code I have assumed that you need the description of every device, but you can also access other elements.

Note: As you may know, the cross-domain (same-origin) policy does not allow us to access the contents of iframes using JavaScript/jQuery; allowing that would be a huge security flaw. To get around it for this script, you have to add the

--web-security=no

flag when executing script in cmd/terminal.

// PhantomJS-side state shared with the step runner at the bottom of the script.
var page = new WebPage();
var innerPage = new WebPage();
var testindex = 0;           // index of the next entry in `steps` to execute
var loadInProgress = false;  // flipped by the load handlers below
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Forward anything the page writes to its console to PhantomJS's console.
page.onConsoleMessage = function(message) {
    console.log(message);
};

// Record when a load begins so the step runner can hold off on the next step.
page.onLoadStarted = function() {
    loadInProgress = true;
    console.log("load started");
};

// Record when a load ends so the step runner may continue.
page.onLoadFinished = function() {
    loadInProgress = false;
    console.log("load finished");
};

// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Enable/Disable Javascript
//page.settings.javascriptEnabled = false;

//IMPORTANT FLAGS
//--web-security=yes/no

// Ordered list of scraping steps. The setInterval runner at the bottom of the
// script calls them one at a time, advancing only while no load is in progress.
var steps = [
    // Step 1: open the listing page, load jQuery into it, then append one
    // <iframe> per product so each product's detail page is loaded inside the
    // listing page, where step 2 can read it.
    function() { //Load Page
        page.open("http://www.example.com/mobiles-apple/", function(status) {
            if (status !== "success") {
                console.log("Failed to load listing page: " + status);
                return;
            }

            // includeJs fetches a remote script and invokes the callback once it
            // has loaded; injectJs only reads scripts from the local file system.
            page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function() {
                page.evaluate(function() {
                    try {
                        $("#main1").append('<div id="inner-data_iframes"></div>');

                        var container = document.getElementById("inner-data_iframes");
                        var boxes = document.getElementsByClassName("lap_thu_box");
                        var count = document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length;

                        for (var i = 0; i < count; i++) {
                            // Build the frame through the DOM so the product URL is
                            // never spliced into an HTML string.
                            var frame = document.createElement("iframe");
                            frame.id = "myIframe" + i;
                            frame.src = boxes[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                            container.appendChild(frame);

                            // Keep the viewport at the bottom of the growing page.
                            window.document.body.scrollTop = document.body.scrollHeight;
                        }
                        console.log("Mission Successful.");
                    }
                    catch(ex) {
                        console.log("Failed to add iFrame: " + ex);
                    }
                });
            });
        });
    },

    // Step 2: walk the product boxes and stream one CSV row per product back to
    // PhantomJS through window.callPhantom. Reading the iframe contents requires
    // running PhantomJS with --web-security=no.
    function() { //Fetch Products
        var fs = require('fs');

        // Receives every fragment passed to window.callPhantom and writes it to
        // product-list.csv. NOTE(review): the '+' in the mode string is
        // presumably what makes successive fragments accumulate — confirm
        // against the PhantomJS fs.write documentation.
        page.onCallback = function(result) {
            fs.write('product-list.csv', result, 'w+');
        };

        page.evaluate(function() {
            // Commas are the column separator, so remove them from cell text.
            function stripCommas(text) {
                return text.replace(/,/g, "");
            }

            try {
                var boxes = document.getElementsByClassName("lap_thu_box");
                var count = document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length;

                for (var i = 0; i < count; i++) {
                    var box = boxes[i];

                    // Column 1: product name.
                    var link = box.getElementsByTagName("h3")[0].getElementsByTagName("a")[0];
                    window.callPhantom(link.textContent + ", ");

                    // Column 2: description taken from the product's iframe. html()
                    // yields null when the frame has not loaded or has no .item_desc;
                    // emit an empty cell instead of aborting the export.
                    var desc = $("#myIframe" + i).contents().find(".item_desc").html();
                    window.callPhantom((desc ? stripCommas(desc) : "") + ", ");

                    // Column 3: image URL from the "data-original" attribute.
                    window.callPhantom(box.getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                    // Column 4: list items joined with " | ". The count comes from the
                    // box's first <ul>, the items from the box itself, as before.
                    var specCount = box.getElementsByTagName("ul")[0].getElementsByTagName("li").length;
                    var listItems = box.getElementsByTagName("li");
                    for (var j = 0; j < specCount; j++) {
                        var separator = (j + 1 < specCount) ? " | " : ", ";
                        window.callPhantom(stripCommas(listItems[j].textContent) + separator);
                    }

                    // Column 5: price, then the row terminator.
                    window.callPhantom(stripCommas(box.getElementsByClassName("cat_price")[0].textContent));
                    window.callPhantom("\n");
                }

                console.log("Successful.");
            }
            catch(ex) {
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step runner: every 5 seconds, run the next entry of `steps` unless a page
// load is still in progress. Once no step is left, stop polling and exit
// PhantomJS shortly afterwards.
var interval = setInterval(function() {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    if (typeof steps[testindex] !== "function") {
        // Stop the timer so the exit is scheduled exactly once.
        clearInterval(interval);

        // Short delay before exiting, as in the original script.
        setTimeout(function() {
            // Uncomment to dump the final page markup for debugging:
            //fs.write('product-list.html', page.content, 'w');
            console.log("test complete!");
            phantom.exit();
        }, 100);
    }
}, 5000);