可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I am trying to scrape a webpage which has a form with many dropdowns and values in the form are interdependent. At many point I need the code to wait till the refresh of the page complete. Eg after selecting an option from the list, the code should wait till the next list is populated based on this selection. It would be really helpful if someone could give pointers because strangely my code is working only after I gave so much unnecessary logging statements which in-turn created some delay. Any suggestions to improve the code would be very helpful.
var casper = require('casper').create({
verbose: true,
logLevel: 'debug',
userAgent: 'Mozilla/5.0 poi poi poi (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22',
pageSettings: {}
});
casper.start('http://www.abc.com', function () {
console.log("casper started");
this.fill('form[action="http://www.abc.com/forum/member.php"]', {
quick_username: "qwe",
quick_password: "qwe"
}, true);
this.capture('screen.png');
});
casper.thenOpen("http://www.abc.com/search/index.php").then(function () {
this.click('input[type="checkbox"][name="firstparam"]');
this.click('a#poi');
casper.evaluate(function () {
document.getElementsByName("status")[0].value = 1;
document.getElementsByName("state")[0].value = 1078;
changeState(); //This function is associated with the dropdown ie state
and the page reloads at this point. Only after complete refresh the code shoud execute! How can this be achieved?
return true;
});
this.echo('Inside the first thenOpen' + this.evaluate(function () {
return document.search.action;
}));
});
casper.then(function () {
this.capture("poi.png");
console.log('just before injecting jquery');
casper.page.injectJs('./jquery.js');
this.click('input[type="checkbox"][name="or"]');
this.evaluate(function () {
$('.boxline .filelist input:checkbox[value=18127]').attr("checked", true);
});
this.echo('Just before pressing the add college button' + this.evaluate(function () {
return document.search.action;
}));
this.capture('collegeticked.png');
if (this.exists('input[type="button"][name="niv"]')) {
this.echo('button is there');
} else {
this.echo('button is not there');
}
this.echo("Going to print return value");
this.click('input[type="button"][name="poi"]'); // This click again causes a page refresh. Code should wait at this point for completion.
this.echo('Immediately after pressing the add college btn getPresentState()' + this.evaluate(function () {
return getPresentState();
}));
this.echo('Immediately after pressing add colleg button' + this.evaluate(function () {
return document.search.action;
}));
this.capture('iu.png');
});
casper.then(function () {
console.log('just before form submit');
this.click('form[name="search"] input[type="submit"]'); //Again page refresh. Wait.
this.echo('Immediately after search btn getPresentState()' + this.evaluate(function () {
return getPresentState();
}));
this.echo('Immediately after search button-action' + this.evaluate(function () {
return document.search.action;
}));
this.capture("mnf.png");
});
casper.then(function () {
casper.page.injectJs('./jquery.js');
this.capture("resultspage.png");
this.echo('Page title is: ' + this.evaluate(function () {
return document.title;
}), 'INFO');
var a = casper.evaluate(function () {
return $('tbody tr td.tdbottom:contains("tye") ').siblings().filter($('td>a').parent());
});
console.log("ARBABU before" + a.length);
});
casper.run();
回答1:
I've been using the waitForSelector 'workaround' mentioned by Arun here:
https://stackoverflow.com/a/22217657/1842033
It's the best solution I've found; the 'drawback' as it were is that you need to be aware of what element you're expecting to load. I say drawback, personally I don't think I've encountered a situation where I've not had some kind of feedback saying that whatever I'm waiting for has happened
this.waitForSelector("{myElement}",
function pass () {
test.pass("Found {myElement}");
},
function fail () {
test.fail("Did not load element {myElement}");
},
20000 // timeout limit in milliseconds
);
Although I'd guess you could use waitForResource() or something like that if you didn't have visual feedback.
回答2:
What I've taken to doing to get around this issue, when there isn't anything specific to target and wait for in the reloaded page, is to use the following:
var classname = 'reload-' + (new Date().getTime()),
callback = function(){},
timeout = function(){};
/// It happens when they change something...
casper.evaluate(function(classname){
document.body.className += ' ' + classname;
}, classname);
casper.thenClick('#submit'); /// <-- will trigger a reload of the page
casper.waitWhileSelector('body.' + classname, callback, timeout);
This way I don't have to rely on a specific expected element in the next page, I've basically done the inverse. I've created a specific selector to watch out for, and execution moves on once that selector fails to match.
For my intents and purposes it was enough to know the page had begun reloading, I didn't need to wait until the next page had fully reloaded. This is so that I could then trigger certain waitForSelector
calls on elements that may have existed both before and after the reload. Waiting until the temporary class has been removed lets me know that anything that existed before has since been destroyed, so no fear of selecting elements prior to the reload.
回答3:
Seems there are no real solutions.
http://docs.casperjs.org/en/latest/modules/casper.html#waitforselector is an available workaround which may not work always.
回答4:
I have the same experience doing the same thing as you. script these way in user perspective never gone well. it crash in middle of nowhere and very unreliable. I was doing search from salesforce that also require login.
You need to keep your step as minimum as possible. script in a cron job way. don't do form fill/button click unless you are doing UI testing. I would advice you to break the process into two parts
// this part do search and find out the exact url of your screen capture.
// save it in a db/csv file
1 - start by POST to http://www.abc.com/forum/member.php with username password in body.
2 - POST/GET to http://www.abc.com/search/index.php with your search criteria, you look at what the website require. if they do POST, then POST.
// second part read your input
1 - login same as first part.
2 - casper forEach your input save your capture. (save the capture result in db/csv)
my script now is pure phantomjs, casper script just keep crashing for no reason. even phantomjs is unreliable. I save the result/status on each successful search/download, whenever there is error I exit the script if not the rest of result is unpredictable(good result in chrome turn out bad in phantomjs).
回答5:
I found this question when searching for solution to a problem where click() or fill() action reloads exactly the same data in a child iframe. Here is my improvement to Pebbl answer:
casper.clickAndUnload = function (click_selector, unload_selector, callback, timeout) {
var classname = 'reload-' + (new Date().getTime());
this.evaluate(function (unload_selector, classname) {
$(unload_selector).addClass(classname);
}, unload_selector, classname);
this.thenClick(click_selector);
this.waitWhileSelector(unload_selector + '.' + classname, callback, timeout);
};
casper.fillAndUnload = function (form_selector, data, unload_selector, callback, timeout) {
var classname = 'reload-' + (new Date().getTime());
this.evaluate(function (unload_selector, classname) {
$(unload_selector).addClass(classname);
}, unload_selector, classname);
this.fill(form_selector, data, true);
this.waitWhileSelector(unload_selector + '.' + classname, callback, timeout);
};
This solution assumes that page uses jQuery. It should not be hard to modify it for pages that don't. unload_selector
is an element that is expected to be reloaded after click or form submission.
回答6:
Since Casperjs is written for developers, it's expected one knows what state the page loaded should be in, and what elements should be available to define a page-loaded state.
One option is to check for the presence of, for example, a javascript resource that is loaded at the end of the page.
When running any type of test, results must be reproducable each time and therefore idempotency is essential. For this to happen, the tester must be able to control the environment enough to make this happen.
回答7:
Just evaluate document.readyState
to be complete
or interactive
. Then it's loaded.
This is an implementation with a while
, but maybe can be done with interval...
this.then(function () {
while(this.evaluate(function () { return document.readyState != 'complete' && document.readyState != 'interactive'; })) {}
});