Google Apps Script scrape parser with 2 classes with the same name

2019-02-19 06:36发布

问题:

I am looking to build a scraper in Google Apps Script. There are two elements with the same class name, and I can't find a way to get the second one. My script only outputs the first one.

/**
 * Downloads the zChocolat "Espagne" delivery page and asks the Parser
 * library for the text located between the `<p class="article"` marker
 * and the following `">` marker. The result is logged and returned.
 *
 * NOTE(review): per the question text, build() yields only the first
 * match, which is the behavior the asker is trying to get past.
 *
 * @return {*} The value produced by Parser's build() call.
 */
function myFunction() {
    var pageUrl = "https://www.zchocolat.com/shop/fr/livraison-cadeau-chocolat/espagne";
    var startMarker = '<p class="article"';
    var endMarker = '">';

    // Fetch the raw page body as text.
    var response = UrlFetchApp.fetch(pageUrl);
    var pageHtml = response.getContentText();

    // Configure the parser step by step, then run it.
    var parser = Parser.data(pageHtml).setLog();
    parser = parser.from(startMarker).to(endMarker);
    var scraped = parser.build();

    Logger.log(scraped);
    return scraped;
}

/**
 * Appends one row to the sheet named 'Feuille 1': the current timestamp
 * followed by whatever myFunction() returns.
 *
 * NOTE(review): the spreadsheet ID passed to openById() is an empty string
 * here; presumably it must be filled in before this can run.
 */
function SAVE_DATA() {
   var spreadsheet = SpreadsheetApp.openById('');
   var targetSheet = spreadsheet.getSheetByName('Feuille 1');

   // Build the row after the sheet lookup, so the scrape runs only once the sheet is resolved.
   var row = [ new Date(), myFunction() ];
   targetSheet.appendRow(row);
}

回答1:

How about this sample?

Modification points :

  1. Looking at the Parser library you are using, it seems that calling iterate() instead of build() retrieves all of the matches as an array.
  2. The data you want is the second element of that array (index 1).

When these are reflected to your script, the modified script is as follows.

Modified script :

/**
 * Downloads the zChocolat "Espagne" delivery page and collects every piece
 * of text enclosed by `<p class="article">` ... `</p>` via the Parser
 * library's iterate() call. The second match (index 1) is logged.
 *
 * @return {*} The full result of iterate() (all matches), not only the
 *     logged second entry.
 */
function myFunction() {
  var pageUrl = "https://www.zchocolat.com/shop/fr/livraison-cadeau-chocolat/espagne";
  var openingTag = '<p class="article">';
  var closingTag = '</p>';

  var response = UrlFetchApp.fetch(pageUrl);
  var pageHtml = response.getContentText();

  // iterate() is used instead of build() so that every match is returned.
  var parser = Parser.data(pageHtml).from(openingTag).to(closingTag);
  var scraped = parser.iterate();

  // Index 1 is the second occurrence, the one the question asks for.
  Logger.log(scraped[1]);
  return scraped;
}

Result :

97% de nos colis ont &eacute;t&eacute; livr&eacute;s dans les temps en 2016.
                                        zChocolat a d&eacute;j&agrave; livr&eacute; avec succ&egrave;s 21,923 cadeaux chocolat en Espagne.


回答2:

You should parse the html with the XmlService so that you can more easily extract the nodes you want. There are some good examples at this site (https://sites.google.com/site/scriptsexamples/learn-by-example/parsing-html)

You would end up with something like:

/**
 * Fetches the zChocolat "Espagne" delivery page, parses it with XmlService
 * and logs every element whose `class` attribute contains `article`.
 *
 * NOTE(review): XmlService.parse() expects well-formed XML; real-world HTML
 * may be rejected by it. Confirm that the target page parses cleanly.
 */
function myFunction() {
    var url = "https://www.zchocolat.com/shop/fr/livraison-cadeau-chocolat/espagne";

    var content = UrlFetchApp.fetch(url).getContentText();

    // Parse the fetched text itself; the root element is only available
    // after parsing, so it cannot be used as the parser's input.
    var doc = XmlService.parse(content);
    var root = doc.getRootElement();

    // The paragraphs of interest are marked up as <p class="article">.
    var articles = getElementsByClassName(root, 'article');
    Logger.log(articles);
}

/**
 * Collects every XML element below `element`, plus `element` itself, whose
 * `class` attribute either equals `classToFind` or lists it as one of its
 * whitespace-separated class names.
 *
 * Nodes are expected to expose asElement() (returning null for non-element
 * content), and elements getAttribute(name) / getDescendants(), as the
 * XmlService objects used by the caller do.
 *
 * @param {Object} element Root element of the subtree to search.
 * @param {string} classToFind Class name to look for.
 * @return {Array} Matching elements in descendant order; `element` itself,
 *     when it matches, comes last.
 */
function getElementsByClassName(element, classToFind) {
  var matches = [];

  // Work on a copy so the array handed back by getDescendants() is left
  // untouched; the root is appended so that it is examined as well.
  var nodes = element.getDescendants().concat([element]);

  // Indexed loop with declared counter: no leaked globals, and no
  // enumeration of non-index properties as `for...in` would do.
  for (var i = 0; i < nodes.length; i++) {
    var elt = nodes[i].asElement();
    if (elt == null) {
      continue; // text, comment, ... - not an element
    }

    var classAttr = elt.getAttribute('class');
    if (classAttr == null) {
      continue; // element has no class attribute
    }

    var classValue = classAttr.getValue();
    // Exact match first (covers a classToFind that itself contains spaces),
    // then token match on any run of whitespace (spaces, tabs, newlines).
    if (classValue === classToFind ||
        classValue.split(/\s+/).indexOf(classToFind) !== -1) {
      matches.push(elt);
    }
  }

  return matches;
}