Google Apps scraping script to run regularly till all the site pages are processed

Published 2019-05-25 02:56

I've written a scraping script that crawls a site (the URL is entered by the user) page by page: it fetches each inner page, collects further inner URLs from it, and extracts the pure text (HTML stripped) of every page. The script works well, but Google Apps Script has a 6-minute execution limit, so for large sites it fails: the run is stopped after 6 minutes and nothing is written to the Google Doc.

function onOpen() { 
    DocumentApp.getUi() // Or SpreadsheetApp or FormApp.
      .createMenu('New scrape web docs')
      .addItem('Enter Url', 'showPrompt')
      .addToUi(); 
}

function showPrompt() { 
  var ui = DocumentApp.getUi();   
  var result = ui.prompt(
      'Scrape whole website into text!',
      'Please enter website url (with http(s)://):',
      ui.ButtonSet.OK_CANCEL); 

// Process the user's response.
  var button = result.getSelectedButton();
  var url = result.getResponseText();  
  var links=[];  
  var base_url = url; 

  if (button == ui.Button.OK) 
  {     
      // gather initial links 
      var inner_links_arr = scrapeAndPaste(url, 1); // first run and clear the document
      links = links.concat(inner_links_arr); // append an array to all the links
      var new_links=[]; // array for new links  
      var processed_urls =[url]; // processed links
      var link, current;

      while (links.length) 
      {  
         link = links.shift(); // get the most left link (inner url)
         processed_urls.push(link);
         current = base_url + link;  
         new_links = scrapeAndPaste(current, 0); // second and consecutive runs we do not clear up the document
         //ui.alert('Processed... ' + current                  + '\nReturned links: ' + new_links.join('\n') );
         // add new links into links array (stack) if appropriate
         for (var i in new_links){
           var item = new_links[i];
           if (links.indexOf(item) === -1 && processed_urls.indexOf(item) === -1)
               links.push(item);
         }    
     }
  } 
}

function scrapeAndPaste(url, clear) { 
  var text; 
  try {
    var html = UrlFetchApp.fetch(url).getContentText();
    // some html pre-processing 
    if (html.indexOf('</head>') !== -1 ){ 
       html = html.split('</head>')[1];
    }
    if (html.indexOf('</body>') !== -1 ){ // thus we split the body only
       html = html.split('</body>')[0] + '</body>';
    }       
   // fetch inner links
    var inner_links_arr= [];
    var linkRegExp = /href="(.*?)"/gi; // regex expression object 
    var match = linkRegExp.exec(html);
    while (match != null) {
      // matched text: match[0]
      if (match[1].indexOf('#') !== 0 
       && match[1].indexOf('http') !== 0 
       //&& match[1].indexOf('https://') !== 0  
       && match[1].indexOf('mailto:') !== 0 
       && match[1].indexOf('.pdf') === -1 ) {
         inner_links_arr.push(match[1]);
      }    
      // match start: match.index
      // capturing group n: match[n]
      match = linkRegExp.exec(html);
    }

    text = getTextFromHtml(html);
    outputText(url, text, clear); // output text into the current document with given url
    return inner_links_arr; //we return all inner links of this doc as array  

  } catch (e) { 
    MailApp.sendEmail(Session.getActiveUser().getEmail(), "Scrape error report at " 
      + Utilities.formatDate(new Date(), "GMT", "yyyy-MM-dd  HH:mm:ss"), 
      "\r\nMessage: " + e.message
      + "\r\nFile: " +  e.fileName+ '.gs' 
      + "\r\nWeb page under scrape: " + url
      + "\r\nLine: " +  e.lineNumber); 
    outputText(url, 'Scrape error for this page cause of malformed html!', clear);   
  } 
}

function getTextFromHtml(html) {
  return getTextFromNode(Xml.parse(html, true).getElement());
}
function getTextFromNode(x) {
  switch(x.toString()) {
    case 'XmlText': return x.toXmlString();
    case 'XmlElement': return x.getNodes().map(getTextFromNode).join(' ');
    default: return '';
  }
}

function outputText(url, text, clear){
  var body = DocumentApp.getActiveDocument().getBody();
  if (clear){ 
    body.clear(); 
  }
  else {
    body.appendHorizontalRule();       
  }
  var section = body.appendParagraph(' * ' + url);
  section.setHeading(DocumentApp.ParagraphHeading.HEADING2);
  body.appendParagraph(text); 
} 

My idea is to use an additional spreadsheet to save the scraped links and to restart the script automatically on a regular basis (using ScriptApp.newTrigger; see the sketch after the list of questions below). But some hindrances have turned up:

  1. When invoked through a trigger, the script is given only 30 seconds of run time.
  2. If it runs from a trigger, there is no way for the user to interact with the script. Should I use a spreadsheet cell again to input the initial base URL?
  3. How do I flush the scraped content into the Google Doc before the script is stopped by the run-time limit (30 seconds or 6 minutes)?
  4. How do I stop the trigger from invoking the script once all the site links have been processed?
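
For reference, roughly what I mean by re-starting via ScriptApp.newTrigger; this is a sketch only, where continueScrape is a placeholder name for the function that would pick up where the previous run left off:

function scheduleNextRun() {
  // delete any stale triggers for the worker function so they don't pile up
  ScriptApp.getProjectTriggers().forEach(function (t) {
    if (t.getHandlerFunction() === 'continueScrape') {
      ScriptApp.deleteTrigger(t);
    }
  });
  // run continueScrape() once, about a minute from now
  ScriptApp.newTrigger('continueScrape')
    .timeBased()
    .after(60 * 1000)
    .create();
}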

You might answer each question separately for convenience.

Is there a better solution for crawling a site's pages, scraping them and saving the output as one text file?

1 Answer
Viruses.
Answered · 2019-05-25 03:37
  1. AFAIK, you need to leave at least 6 minutes between triggers; each triggered run then gets another 6 minutes.

  2. You can ask for all the URLs at once and save them in properties, then read the properties from the triggered run.

  3. You can check the elapsed time regularly: knowing the script will only run for 6 minutes, once it reaches about 5 minutes, paste everything into the document and then set a trigger for the next run.

  4. Save the object with the links that still need to be processed in properties; when the script is invoked by the trigger, it retrieves only the URLs that still need to be processed. A sketch of points 2–4 follows below.
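
Putting points 2–4 together, something like this is what I mean. It is only a sketch: startCrawl/continueCrawl and the property keys are made-up names, and scrapeAndPaste is your function from the question.

var MAX_RUNTIME_MS = 5 * 60 * 1000; // stop around the 5-minute mark (point 3)

function startCrawl(baseUrl) {
  // save the initial state in script properties (point 2)
  var props = PropertiesService.getScriptProperties();
  props.setProperty('base_url', baseUrl);
  props.setProperty('pending', JSON.stringify([''])); // '' = the root page itself
  props.setProperty('processed', JSON.stringify([]));
  continueCrawl();
}

function continueCrawl() {
  var start = Date.now();
  var props = PropertiesService.getScriptProperties();
  var baseUrl = props.getProperty('base_url');
  var pending = JSON.parse(props.getProperty('pending'));
  var processed = JSON.parse(props.getProperty('processed'));

  while (pending.length && (Date.now() - start) < MAX_RUNTIME_MS) {
    var link = pending.shift();
    processed.push(link);
    // scrapeAndPaste is the function from the question; clear the doc only on the very first page
    var newLinks = scrapeAndPaste(baseUrl + link, processed.length === 1 ? 1 : 0) || [];
    newLinks.forEach(function (item) {
      if (pending.indexOf(item) === -1 && processed.indexOf(item) === -1) {
        pending.push(item);
      }
    });
  }

  // save the remaining queue before the run ends; note that properties have size
  // limits, so a very large link list may need to be split across several keys
  props.setProperty('pending', JSON.stringify(pending));
  props.setProperty('processed', JSON.stringify(processed));

  // delete this run's trigger and, only if work remains, schedule the next run (points 1 and 4)
  ScriptApp.getProjectTriggers().forEach(function (t) {
    if (t.getHandlerFunction() === 'continueCrawl') ScriptApp.deleteTrigger(t);
  });
  if (pending.length) {
    ScriptApp.newTrigger('continueCrawl').timeBased().after(60 * 1000).create();
  }
}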

You probably won't be able to save a whole website in properties, since there is a 100 KB limit, but you can split every page into a different property; I don't know whether it can hit a limit that way.
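
If the page text itself also has to survive between runs, one way to do the per-page splitting (again just a sketch; the 'page:' key prefix and function names are made up) is to stash each page under its own property and append everything to the document in one pass later:

function stashPageText(url, text) {
  // one property per page; each value is still subject to the per-property size cap
  PropertiesService.getScriptProperties().setProperty('page:' + url, text);
}

function flushStashedPages() {
  var props = PropertiesService.getScriptProperties();
  var all = props.getProperties();
  var body = DocumentApp.getActiveDocument().getBody();
  for (var key in all) {
    if (key.indexOf('page:') === 0) {
      body.appendParagraph(' * ' + key.slice(5))
          .setHeading(DocumentApp.ParagraphHeading.HEADING2);
      body.appendParagraph(all[key]);
      props.deleteProperty(key); // free the quota once the text is in the document
    }
  }
}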

Another alternative is to make the retrieval calls run asynchronously, with HtmlService or setTimeout. I haven't used setTimeout in GAS scripting, but it works great in client-side HTML JavaScript.
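
A rough sketch of the HtmlService idea: the dialog's client-side script loops over the queue, and every google.script.run call runs as its own short server execution. Here scrapeOnePage is a hypothetical server-side function (not part of the code above) that fetches one page, appends its text to the document and returns the new inner links it found; setTimeout runs in the browser, not on the server.

function showCrawlerDialog() {
  var clientJs =
    'var queue = [""], seen = {};' +  // paths still to fetch; "" is the root page
    'function next() {' +
    '  if (!queue.length) { google.script.host.close(); return; }' +
    '  var path = queue.shift();' +
    '  seen[path] = true;' +
    '  google.script.run.withSuccessHandler(function (links) {' +
    '    (links || []).forEach(function (l) {' +
    '      if (!seen[l] && queue.indexOf(l) === -1) queue.push(l);' +
    '    });' +
    '    setTimeout(next, 100);' +
    '  }).scrapeOnePage(path);' +
    '}' +
    'next();';
  var html = HtmlService.createHtmlOutput('<p>Crawling...</p><script>' + clientJs + '</script>');
  DocumentApp.getUi().showModalDialog(html, 'Scraping site');
}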
