I am currently working with PhantomJS and CasperJS to scrape links from a website. The site uses JavaScript to load results dynamically. The snippet below, however, is not getting me all the results the page contains. What I need is to scroll to the bottom of the page, check whether the spinner shows up (meaning there is more content still to come), wait until the new content has loaded, and keep scrolling until no more new content appears. Then the links with class name .title should be stored in an array. Link to the webpage for scraping.
var casper = require('casper').create();
var urls = [];

// Scrolls the page down by 4000 pixels and, if the loading spinner is
// visible, waits for it to disappear before triggering another scroll.
function tryAndScroll(casper) {
    casper.waitFor(function() {
        // Move the viewport down; returning true makes waitFor proceed
        // immediately to the "then" callback.
        this.page.scrollPosition = { top: this.page.scrollPosition["top"] + 4000, left: 0 };
        return true;
    }, function() {
        var info = this.getElementInfo('.badge-post-grid-load-more');
        if (info.visible) {
            // The spinner is showing, so more results are on the way.
            // Wait (up to 5 seconds) until it disappears, then recurse
            // via the custom 'results.loaded' event.
            this.waitWhileVisible('.badge-post-grid-load-more', function() {
                this.emit('results.loaded');
            }, function() {
                this.echo('next results not loaded');
            }, 5000);
        }
    }, function() {
        this.echo("Scrolling failed. Sorry.").exit();
    }, 500);
}

// Each time a batch of results has finished loading, scroll again.
casper.on('results.loaded', function() {
    tryAndScroll(this);
});

casper.start('http://example.com/', function() {
    // Wait for the first batch of results before starting to scroll.
    this.waitUntilVisible('.title', function() {
        tryAndScroll(this);
    });
});

casper.then(function() {
    // Collect the href attribute of every element with class "title".
    casper.each(this.getElementsInfo('.title'), function(casper, element, j) {
        urls.push(element.attributes.href);
    });
});

casper.run(function() {
    this.echo(urls.length + ' links found:');
    this.echo(urls.join('\n')).exit();
});