我已经创造了phantomJs的脚本。 它所做的是,它将从工作正常的特定页面的一些元素。
下面是代码:
var page = new WebPage(), testindex = 0, loadInProgress = false, fs = require('fs'), i = 0, j = 0, k = 0;
page.onConsoleMessage = function(msg) { console.log(msg); };
page.onLoadStarted = function() { loadInProgress = true; console.log("load started"); };
page.onLoadFinished = function() { loadInProgress = false; console.log("load finished"); };
// Sets the User Agent
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36';
// Enable/Disable Javascript
// page.settings.javascriptEnabled = false;
var steps = [
function() { //Load Page
page.open("http://www.example.com/mobiles/");
page.injectJs("http://ajax.googleapis.com/ajax/libs/jquery/1.6.1/jquery.min.js");
},
function() { //Fetch Products
page.onCallback = function(result) {
var fs = require('fs');
fs.write('product-list.csv', result, 'w+');
};
page.evaluate(function() {
var arr_mainList = new Array();
var arr_innerList = new Array();
try {
for (i = 0; i < (document.getElementsByClassName("item_grid")[0].getElementsByTagName("ul").length); i++) {
arr_mainList.push(document.getElementsByClassName("lap_thu_box")[i]);
window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].textContent + ", ");
//window.callPhantom(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href + ", ");
var myWindow = window.open(arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href);
console.log(myWindow.getElementsByClassName("item_desc")[0].textContent);
myWindow.close();
if (arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href.length > 43) {
var innerURL = arr_mainList[i].getElementsByTagName("h3")[0].getElementsByTagName("a")[0].href;
console.log(innerURL);
}
window.callPhantom(arr_mainList[i].getElementsByTagName("img")[0].getAttribute("data-original") + ", ");
arr_innerList.push(arr_mainList[i]);
for (j = 0; j < (document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length); j++) {
if ((j+1) < document.getElementsByClassName("lap_thu_box")[i].getElementsByTagName("ul")[0].getElementsByTagName("li").length) {
window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + " | ");
}
else {
window.callPhantom(arr_innerList[i].getElementsByTagName("li")[j].textContent.replace(/,/g, "") + ", ");
}
};
//window.callPhantom(", ");
window.callPhantom(arr_innerList[i].getElementsByClassName("cat_price")[0].textContent.replace(/,/g, ""));
window.callPhantom("\n");
};
loadInProgress = true;
console.log("Successful.");
}
catch(ex) {
console.log("Failed: " + ex);
}
});
}
];
interval = setInterval(function() {
if (!loadInProgress && typeof steps[testindex] == "function") {
console.log("step " + (testindex + 1));
steps[testindex]();
testindex++;
}
if (typeof steps[testindex] != "function") {
setTimeout(function() {
//fs.write('product-list.html', page.content, 'w');
console.log("test complete!");
phantom.exit();
}, 100);
}
}, 5000);
现在,如果我运行这个程序,我得到的csv文件的所有信息。 除了当它进入的window.open,phantomJs停止。 我知道我无法打开里面page.evaluate了新的一页。 但我需要获取产品说明和到位的产品链接添加到CSV文件。 我一直在寻找小时,任何帮助将是很好。 注:我的限制,我不得不使用phantomJs。