I am looking to build a scraper in Google Apps Script. There are two elements with the same class name, and I can't find a way to get the second one. My script only outputs the first one.
/**
 * Downloads the zChocolat "Espagne" delivery page and asks the Parser
 * library for the text located between the '<p class="article"' marker
 * and the following '">' marker.
 *
 * The extracted value is written to the Apps Script log and returned.
 *
 * @return {*} whatever Parser's build() yields for the configured markers
 */
function myFunction() {
var pageUrl = "https://www.zchocolat.com/shop/fr/livraison-cadeau-chocolat/espagne";
var startMarker = '<p class="article"';
var endMarker = '">';
// Fetch the raw page body as text.
var response = UrlFetchApp.fetch(pageUrl);
var pageSource = response.getContentText();
// Configure the parser step by step, with logging switched on.
var extractor = Parser.data(pageSource);
extractor = extractor.setLog();
extractor = extractor.from(startMarker);
extractor = extractor.to(endMarker);
var result = extractor.build();
Logger.log(result);
return result;
}
/**
 * Appends one row to the sheet named 'Feuille 1' containing the current
 * timestamp and the value returned by myFunction().
 *
 * The spreadsheet id passed to openById is an empty string in this snippet.
 */
function SAVE_DATA() {
var spreadsheet = SpreadsheetApp.openById('');
var targetSheet = spreadsheet.getSheetByName('Feuille 1');
// Row layout: [timestamp, scraped value].
var row = [ new Date(), myFunction() ];
targetSheet.appendRow(row);
}
How about this sample?
Modification points :
- Looking at the Parser library you are using here, it seems that by using iterate(), the data can be retrieved as an array.
- The data you want is the second element of that array.
When these points are applied to your script, the modified script is as follows.
Modified script :
/**
 * Downloads the zChocolat "Espagne" delivery page and collects every piece
 * of text enclosed by '<p class="article">' and '</p>' via Parser's
 * iterate().
 *
 * Only the second match (index 1) is written to the log; the complete
 * result of iterate() is what gets returned.
 *
 * @return {*} the value produced by iterate() (indexed like an array here)
 */
function myFunction() {
var pageUrl = "https://www.zchocolat.com/shop/fr/livraison-cadeau-chocolat/espagne";
var openTag = '<p class="article">';
var closeTag = '</p>';
var pageSource = UrlFetchApp.fetch(pageUrl).getContentText();
var extractor = Parser.data(pageSource);
extractor = extractor.from(openTag);
extractor = extractor.to(closeTag);
var matches = extractor.iterate();
// Index 1 is the second occurrence, which is the one being looked for.
Logger.log(matches[1]);
return matches;
}
Result :
97% de nos colis ont été livrés dans les temps en 2016.
zChocolat a déjà livré avec succès 21,923 cadeaux chocolat en Espagne.
You should parse the html with the XmlService so that you can more easily extract the nodes you want. There are some good examples at this site (https://sites.google.com/site/scriptsexamples/learn-by-example/parsing-html)
You would end up with something like:
/**
 * Downloads the zChocolat "Espagne" delivery page, parses it with
 * XmlService and logs every element whose class list contains 'article'.
 *
 * Relies on getElementsByClassName(element, classToFind) to walk the tree.
 */
function myFunction() {
var url = "https://www.zchocolat.com/shop/fr/livraison-cadeau-chocolat/espagne";
var content = UrlFetchApp.fetch(url).getContentText();
// Parse the fetched markup itself. The previous version passed a variable
// that had not been assigned yet, so the parser received undefined.
// NOTE(review): XmlService.parse expects well-formed XML; real-world HTML
// may need cleaning before it parses - confirm against the target page.
var doc = XmlService.parse(content);
var root = doc.getRootElement();
// The paragraphs on the page carry class="article" (singular).
var articles = getElementsByClassName(root, 'article');
Logger.log(articles);
}
/**
 * Collects every element in the tree under `element` (including `element`
 * itself) whose `class` attribute contains `classToFind`.
 *
 * A node matches when its whole class attribute equals `classToFind`, or
 * when `classToFind` is one of its whitespace-separated class names.
 *
 * @param {Object} element - node providing getDescendants(), asElement()
 *     and getAttribute(name)
 * @param {string} classToFind - class name to look for
 * @return {Array} matching elements: descendants in the order returned by
 *     getDescendants(), followed by `element` if it matches
 */
function getElementsByClassName(element, classToFind) {
var data = [];
var descendants = element.getDescendants();
// The starting element is checked too, after all of its descendants.
descendants.push(element);
// Indexed loop with a declared counter: avoids leaking a global loop
// variable and avoids for...in over an array.
for (var i = 0; i < descendants.length; i++) {
var elt = descendants[i].asElement();
// asElement() yields null for nodes that are not elements.
if (elt == null) continue;
var classAttr = elt.getAttribute('class');
if (classAttr == null) continue;
var classValue = classAttr.getValue();
// Class names may be separated by any run of whitespace.
var classNames = classValue.split(/\s+/);
if (classValue === classToFind || classNames.indexOf(classToFind) !== -1) {
data.push(elt);
}
}
return data;
}