Best method of moving to a new page with request-promise

2019-08-25 18:45发布

问题:

I am tinkering around with request-promise to crawl a friend's webpage. I am using the "Crawl a webpage better" example from their GitHub README. What I have so far is this:

var rp = require('request-promise');
var cheerio = require('cheerio'); // Basically jQuery for node.js

// Request settings: fetch the page and pass the response body through cheerio,
// so the promise below hands a jQuery-like `$` to its `then` callback.
var options = {
  uri: 'https://friendspage.org',
  transform: (body) => cheerio.load(body),
};

rp(options)
  .then(($) => {
    // Process html like you would with jQuery...
    var nextLink = $("a[data-url$='nxtPageId']");
    var nxtPage = nextLink.attr('data');

    // How do I use nxtPage here to go to that site

  })
  .catch((err) => {
    // Crawling failed or Cheerio choked...
  });

What is the proper way to go to the link I have in nxtPage? I still want to be able to use cheerio/jQuery on the new page. Do I need to repeat the whole `var options = ...` block inside the current `then` callback?

回答1:

You can just create your own utility function that creates your options and then calls rp() like this:

const rp = require('request-promise');
const cheerio = require('cheerio'); // Basically jQuery for node.js

// shared function
/**
 * Request a page and run its body through cheerio.
 *
 * @param {string} url - Address passed to `rp` as the `uri` option.
 * @returns {*} Whatever `rp` returns for these options; the callers in this
 *   file treat it as a promise whose value is the cheerio `$` for the page.
 */
function getPage(url) {
    // The transform swaps the raw response body for a cheerio-loaded document.
    const parseBody = (body) => cheerio.load(body);
    return rp({ uri: url, transform: parseBody });
}

// Fetch the first page, follow its "next page" link, then work on the second page.
// A failure at any step ends up in the single catch at the end of the chain.
getPage('https://friendspage.org')
    .then(($) => {
        // Process html like you would with jQuery...
        const nextLink = $("a[data-url$='nxtPageId']");
        const nxtPage = nextLink.attr('data');
        return getPage(nxtPage);
    })
    .then(($) => {
        // more processing here
    })
    .catch((err) => {
        console.log(err);
        // error handling here
    });

This is just factoring code that you want to use in multiple places into a shared function. It has nothing in particular to do with rp() or cheerio; it is just regular code factoring in JavaScript (or any language).



回答2:

Wrap the request inside a function and keep calling it recursively, with a condition that makes the recursion stop at some point.

/**
 * Crawl `uri`, read its "next page" link, and repeat on that link until the
 * chain of pages ends. Invoked immediately with the default start page.
 *
 * @param {string} [uri] - Page to request; defaults to the start page.
 * @param {Set<string>} [visited] - URIs already requested in this crawl; used
 *   to stop when a page links back to one that was already fetched.
 * @returns {*} The promise chain produced by `rp`, so the crawl can be
 *   observed or awaited instead of being left floating.
 */
(function repeatUntilAConditionIsMetInThen(uri = 'https://friendspage.org', visited = new Set()) {
  visited.add(uri);
  const options = {
    uri,
    // Hand the response body to cheerio so `then` receives a jQuery-like `$`.
    transform: function(body) {
      return cheerio.load(body);
    }
  };
  return rp(options)
    .then(function($) {
      const nxtPage = $("a[data-url$='nxtPageId']").attr('data');
      // Termination condition: stop when the page has no next link, or when
      // the link points at a page already crawled (otherwise a cycle of links
      // would recurse forever).
      if (!nxtPage || visited.has(nxtPage)) {
        return undefined;
      }
      return repeatUntilAConditionIsMetInThen(nxtPage, visited);
    })
    .catch(function(err) {
      // Report the failure instead of silently swallowing it.
      console.error(`Crawling ${uri} failed:`, err);
    });
})();