Scraping an infinite scroll page stops without scrolling

2019-06-23 22:14发布

问题:

I am currently working with PhantomJS and CasperJS to scrape links from a website. The site uses JavaScript to load results dynamically. However, the snippet below does not get me all the results the page contains. What I need is to scroll down to the bottom of the page, check whether the spinner shows up (meaning there is more content still to come), wait until the new content has loaded, and then keep scrolling until no more new content is shown. Finally, I want to store the links with the class name .title in an array. Link to the webpage for scraping.

// Create the CasperJS instance that drives the page.
var casper = require('casper').create();
// Accumulates the href of each `.title` element; filled in by a later step.
var urls = [];
/**
 * Adds 4000 to the page's vertical scroll position and, when the load-more
 * element is reported visible, waits (up to 5000 ms) for it to become
 * invisible before emitting 'results.loaded'.
 *
 * @param {Object} casper - The CasperJS instance driving the page.
 */
function tryAndScroll(casper) {
  var spinnerSelector = '.badge-post-grid-load-more';

  // Passed to waitFor as its test: performs the scroll and always succeeds.
  function scrollDown() {
    var currentTop = this.page.scrollPosition.top;
    this.page.scrollPosition = { top: currentTop + 4000, left: 0 };
    return true;
  }

  // Signals listeners that another batch of results is in place.
  function announceLoaded() {
    this.emit('results.loaded');
  }

  function reportNotLoaded() {
    this.echo('next results not loaded');
  }

  // Runs once the scroll "test" has passed.
  function afterScroll() {
    var spinner = this.getElementInfo(spinnerSelector);
    if (spinner.visible != true) {
      return;
    }
    this.waitWhileVisible(spinnerSelector, announceLoaded, reportNotLoaded, 5000);
  }

  function abortRun() {
    this.echo('Scrolling failed. Sorry.').exit();
  }

  casper.waitFor(scrollDown, afterScroll, abortRun, 500);
}

// Each time a batch of results is announced, scroll again.
casper.on('results.loaded', function onResultsLoaded() {
  tryAndScroll(this);
});

// Open the page and start scrolling once a `.title` element is visible.
casper.start('http://example.com/', function onPageOpened() {
  this.waitUntilVisible('.title', function onFirstTitleVisible() {
    tryAndScroll(this);
  });
});

// Record the href attribute of every `.title` element found at this step.
casper.then(function collectLinks() {
  var titleInfos = this.getElementsInfo('.title');
  casper.each(titleInfos, function storeHref(self, titleInfo) {
    urls.push(titleInfo.attributes.href);
  });
});

// Print the number of collected links followed by the links, then quit.
casper.run(function report() {
  this.echo(urls.length + ' links found:');
  this.echo(urls.join('\n')).exit();
});

回答1:

I've looked at the page. Your misconception is probably that you think the .badge-post-grid-load-more element vanishes as soon as the next elements are loaded. This is not the case. It doesn't change at all. You have to find another way to test whether new elements were put into the DOM.

You could for example retrieve the current number of elements and use waitFor to detect when the number changes.

/**
 * Counts the elements currently matching ".listview .badge-grid-item".
 *
 * @param {Object} casper - The CasperJS instance driving the page.
 * @returns {number} Number of matching elements; 0 when there are none.
 */
function getNumberOfItems(casper) {
    var selector = ".listview .badge-grid-item";
    // getElementsInfo() raises a CasperError when the selector matches nothing,
    // so report an empty list as 0 instead of aborting the script.
    if (!casper.exists(selector)) {
        return 0;
    }
    return casper.getElementsInfo(selector).length;
}

/**
 * Scrolls down one step (adds 4000 to the vertical scroll position) and keeps
 * doing so for as long as new items keep appearing.
 *
 * Progress is detected by comparing the item count taken right after the
 * scroll with later counts, rather than by watching the load-more element.
 *
 * @param {Object} casper - The CasperJS instance driving the page.
 */
function tryAndScroll(casper) {
  var loadMoreSelector = '.badge-post-grid-load-more';
  casper.page.scrollPosition = { top: casper.page.scrollPosition.top + 4000, left: 0 };

  // getElementInfo() raises a CasperError for a selector that matches nothing,
  // so a missing load-more element is treated the same as a hidden one.
  var hasMore = casper.exists(loadMoreSelector) &&
    casper.getElementInfo(loadMoreSelector).visible;
  if (!hasMore) {
    casper.echo("no more items");
    return;
  }

  var itemsBefore = getNumberOfItems(casper);
  casper.waitFor(function check() {
    // New items are in the DOM as soon as the count differs.
    return itemsBefore !== getNumberOfItems(casper);
  }, function then() {
    tryAndScroll(casper);
  }, function onTimeout() {
    casper.echo("Timout reached");
  }, 20000);
}

I've also streamlined tryAndScroll a little. Some of the functions were completely unnecessary: the test function passed to the first casper.waitFor always returned true, so it never actually waited, and because of that its onTimeout callback could never be invoked.