在 PhantomJS 中打开一组 URL(Need to open an array of URLs in PhantomJS)

2019-10-24 08:06发布

我写了一个 PhantomJS 脚本。它的作用是从指定页面中抓取一些元素,这一部分工作正常。

下面是代码:

// Top-level state shared by the page event handlers and the step runner.
var page = new WebPage();
var testindex = 0;          // index into `steps` of the next step to run
var loadInProgress = false; // toggled by the load handlers below
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Forward console output produced inside the page to the PhantomJS console.
page.onConsoleMessage = function (message) {
    console.log(message);
};

// Track page loads so the step runner can wait for them.
page.onLoadStarted = function () {
    loadInProgress = true;
    console.log("load started");
};
page.onLoadFinished = function () {
    loadInProgress = false;
    console.log("load finished");
};

// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Enable/Disable Javascript
// page.settings.javascriptEnabled = false;

// Ordered list of steps executed one at a time by the interval-based runner
// further down in this script. Each entry is a zero-argument function.
var steps = [

    // Step 1: start loading the product listing page and try to add jQuery.
    // NOTE(review): injectJs is called immediately after open(), without
    // waiting for the load to finish, and it is given a remote URL --
    // confirm against the PhantomJS docs whether injectJs accepts URLs.
    function() { //Load Page
        page.open("http://www.example.com/mobiles/");
        page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
    },

    // Step 2: walk the product boxes inside the page and stream each CSV
    // fragment back to PhantomJS through window.callPhantom.
    function() { //Fetch Products
        // Every fragment sent with window.callPhantom arrives here and is
        // written to product-list.csv using mode 'w+'.
        page.onCallback = function(result) {
            var fs = require('fs');
            fs.write('product-list.csv', result, 'w+');
        };

        // Everything inside this function runs in the context of the loaded page.
        page.evaluate(function() {
            var arr_mainList = new Array();
            var arr_innerList = new Array();

            try {
                // The iteration count comes from the <ul> elements in the first
                // "item_grid" element, while the items themselves are read from
                // the "lap_thu_box" elements at the same index.
                // `i` (and `j` below) are assigned without a declaration here.
                for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
                    arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);

                    // Column 1: product title (text of the first link in the <h3>).
                    window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
                    //window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href + ", ");

                    // Attempt to open the product's detail page to read its description.
                    // This is the point where the script stops: a new page cannot be
                    // opened from inside page.evaluate.
                    // NOTE(review): getElementsByClassName is called on the window
                    // object rather than on its document -- verify.
                    var myWindow = window.open(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href);
                    console.log(myWindow.getElementsByClassName("item_desc")[0].textContent);
                    myWindow.close();

                    // Logs the product link when its href is longer than 43 characters;
                    // the reason for the threshold 43 is not visible in this script.
                    if (arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href.length > 43) {
                        var innerURL = arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                        console.log(innerURL);
                    }

                    // Next column: the "data-original" attribute of the first <img>.
                    window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");

                    arr_innerList.push(arr_mainList[i]); 

                    // Next column: the <li> texts with commas removed, joined by " | ";
                    // the last one is followed by ", " to close the column.
                    for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {                 
                        if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
                        }
                        else {
                            window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
                        }
                    };
                    //window.callPhantom(", ");
                    // Last column: the "cat_price" text with commas removed, then end of row.
                    window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
                    window.callPhantom("\n");
                };

                // NOTE(review): this assignment runs inside page.evaluate, so it
                // presumably sets a variable in the page rather than the outer
                // `loadInProgress` flag -- confirm.
                loadInProgress = true;
                console.log("Successful.");
            }
            catch(ex) {
                // Any exception thrown above ends the whole loop and is only logged.
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step runner: every 5 seconds, run the next step unless a page load is in
// progress. Once no step is left, print a completion message and exit
// PhantomJS shortly afterwards.
interval = setInterval(function () {
    var stepIsReady = typeof steps[testindex] === "function" && !loadInProgress;

    if (stepIsReady) {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    // A step is still pending: wait for the next tick.
    if (typeof steps[testindex] === "function") {
        return;
    }

    setTimeout(function () {
        //fs.write('product-list.html', page.content, 'w');
        console.log("test complete!");
        phantom.exit();
    }, 100);
}, 5000);

现在运行这个脚本,我能在 CSV 文件中得到所有信息;但是一旦执行到 window.open,PhantomJS 就会停止。我知道不能在 page.evaluate 内部打开新页面,但我需要获取产品描述,并把它(代替产品链接)写入 CSV 文件。我已经找了好几个小时,任何帮助都将不胜感激。注意:我受到限制,必须使用 PhantomJS。

Answer 1:

我对你的脚本做了一点修改,现在你可以实现想要的功能了。请记住不要一次抓取太多条目,否则会出现内存问题;因此,如果目标网站有分页,请另写一个函数来利用分页。在这段代码中,我假设你需要每个设备的描述,不过你也可以访问其他元素。

注意:你可能知道,跨域策略不允许我们用 JavaScript / jQuery 访问内嵌框架(iframe)的内容,这会是一个很大的障碍。因此,在 CMD / 终端中执行脚本时,你必须添加

--web-security=no

这个标志。

// Top-level state shared by the page event handlers and the step runner.
var page = new WebPage();
var innerPage = new WebPage();
var testindex = 0;          // index into `steps` of the next step to run
var loadInProgress = false; // toggled by the load handlers below
var fs = require('fs');
var i = 0;
var j = 0;
var k = 0;

// Forward console output produced inside the page to the PhantomJS console.
page.onConsoleMessage = function (message) {
    console.log(message);
};

// Track page loads so the step runner can wait for them.
page.onLoadStarted = function () {
    loadInProgress = true;
    console.log("load started");
};
page.onLoadFinished = function () {
    loadInProgress = false;
    console.log("load finished");
};

// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';

// Enable/Disable Javascript
//page.settings.javascriptEnabled = false;

//IMPORTANT FLAGS
//--web-security=yes/no

// Ordered list of steps executed one at a time by the interval-based runner
// further down in this script. Each entry is a zero-argument function.
//
// Step 1 loads the listing page and appends one <iframe> per product, pointing
// at that product's detail page. Step 2 reads the listing together with the
// loaded iframes and writes one CSV row per product to product-list.csv.
// Reading the iframes' content requires running PhantomJS with
// --web-security=no when the detail pages are on another origin.
var steps = [
    function () { // Load Page
        page.open("http://www.example.com/mobiles-apple/", function (status) {
            if (status !== "success") {
                console.log("Failed to load page: " + status);
                return;
            }

            // includeJs (not injectJs) is the call that fetches a script from a
            // URL; injectJs looks the name up as a local file. The page is only
            // touched once the script has actually been loaded.
            page.includeJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js", function () {
                page.evaluate(function () {
                    try {
                        $("#main1").append('<div id="inner-data_iframes"></div>');

                        var boxes = document.getElementsByClassName("lap_thu_box");
                        var count = document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length;

                        for (var i = 0; i < count; i++) {
                            var iFrameAdd = boxes[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
                            $("#inner-data_iframes").append('<iframe id="myIframe' + i + '" src="' + iFrameAdd + '"></iframe>');
                            window.document.body.scrollTop = document.body.scrollHeight;
                        }
                        console.log("Mission Successful.");
                    }
                    catch (ex) {
                        console.log("Failed to add iFrame: " + ex);
                    }
                });
            });
        });
    },

    function () { // Fetch Products
        // Each row sent with window.callPhantom arrives here and is written to
        // the CSV file using mode 'w+'.
        page.onCallback = function (result) {
            var fs = require('fs');
            fs.write('product-list.csv', result, 'w+');
        };

        page.evaluate(function () {
            try {
                var boxes = document.getElementsByClassName("lap_thu_box");
                var count = document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length;

                for (var i = 0; i < count; i++) {
                    var box = boxes[i];

                    var title = box.getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent;

                    // .html() gives no string when the iframe is missing, not yet
                    // loaded, or has no .item_desc element; fall back to an empty
                    // description instead of aborting every remaining row.
                    var desc = $("#myIframe" + i).contents().find(".item_desc").html() || "";
                    desc = desc.replace(/,/g, "");

                    var image = box.getElementsByTagName("img")[0].getAttribute("data-original");

                    // The number of entries comes from the first <ul>; the texts are
                    // taken from the box's <li> elements in document order.
                    var specCount = box.getElementsByTagName("ul")[0].getElementsByTagName("li").length;
                    var listItems = box.getElementsByTagName("li");
                    var specs = [];
                    for (var j = 0; j < specCount; j++) {
                        specs.push(listItems[j].textContent.replace(/,/g, ""));
                    }

                    var price = box.getElementsByClassName("cat_price")[0].textContent.replace(/,/g, "");

                    // One callback per row. Commas were stripped from the free-text
                    // fields above so they cannot split a column; an empty spec list
                    // still produces an (empty) column.
                    window.callPhantom(
                        title + ", " +
                        desc + ", " +
                        image + ", " +
                        specs.join(" | ") + ", " +
                        price + "\n"
                    );
                }

                console.log("Successful.");
            }
            catch (ex) {
                console.log("Failed: " + ex);
            }
        });
    }
];

// Step runner: every 5 seconds, run the next step unless a page load is in
// progress. Once no step is left, stop the timer, print a completion message
// and exit PhantomJS shortly afterwards.
var interval = setInterval(function () {
    if (!loadInProgress && typeof steps[testindex] === "function") {
        console.log("step " + (testindex + 1));
        steps[testindex]();
        testindex++;
    }

    if (typeof steps[testindex] !== "function") {
        // Stop ticking so the exit timeout below is armed exactly once.
        clearInterval(interval);
        setTimeout(function () {
            //fs.write('product-list.html', page.content, 'w');
            console.log("test complete!");
            phantom.exit();
        }, 100);
    }
}, 5000);


文章来源: Need to open an array of URL's in PhantomJS