Index Match Large Array Google Script Taking Very

2019-01-12 12:22发布

问题:

I have the function below where I am trying to scrape 4 websites, and then combine the results into a spreadsheet. Is there a faster way to match over a large array that isn't the INDEX/MATCH formulas. My desired output would be (obv this is an example)

MLBID | FG_ID | PA | K | K% | wOBA

12345 | 12345 | 12 | 5 | 41.7% | .300

While the code I have below works, it takes wayyyy too long reaches the 6-minute limit of Google Script. The matching that I am trying to do is with ~4000 rows. I have commented my code as much as possible.

function minors_batting_stats() {

  //this is the spreadsheet where I have a list of all of the IDs -- MLB and FG
  var ids = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Player List");

  //this is the output sheet
  var mb18vR_sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("2018 minors bat vs R");

  //various URLs I am trying to scrape
  var mb18vR_PA_url = 'https://www.mlb.com/prospects/stats/search?level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=results&sort_order=desc&group_by=name&min_pa=&min_pitches=#results'
  var mb18vR_SO_url = 'https://www.mlb.com/prospects/stats/search?pa_result=strikeout&level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=results&sort_order=desc&group_by=name&min_pa=&min_pitches=#results'
  var mb18vR_wOBA_url = 'https://www.mlb.com/prospects/stats/search?level=11&level=12&level=13&level=14&level=15&level=16&pitcher_throws=R&batter_stands=&game_date_gt=&game_date_lt=&season=2017&home_away=&draft_year=&prospect=&player_type=batter&sort_by=woba&sort_order=desc&group_by=name&min_pa=&min_pitches=#results'

  //creating an array for each scrape
  var res = [];
  var res1 = [];
  var res2 = [];
  var res3 = [];

  //getting the MLB and FG ids from the spreadsheet
  var mlbids = ids.getRange(1, 11, ids.getLastRow()).getValues();
  var fgids = ids.getRange(1,9, ids.getLastRow()).getValues();

  //scraping SO against RHP
  var content_SO = UrlFetchApp.fetch(mb18vR_SO_url).getContentText();
  var e_SO = Parser.data(content_SO).from('tbody').to('</tbody>').build();
  var rows_SO = Parser.data(e_SO).from('<tr class="player_row"').to('</tr>').iterate();

  for (var i=0; i<rows_SO.length; i++) { //rows.length
    res1[i] = [];
    res1[i][0] = Parser.data(rows_SO[i]).from('/player/').to('/').build();

    var SOs = Parser.data(rows_SO[i]).from('<td align="left">').to('</td>').iterate();
    res1[i][1] = SOs[1];
  }

  //scraping wOBA against RHP
  var content_wOBA = UrlFetchApp.fetch(mb18vR_wOBA_url).getContentText();
  var e_wOBA = Parser.data(content_wOBA).from('tbody').to('</tbody>').build();
  var rows_wOBA = Parser.data(e_wOBA).from('<tr class="player_row"').to('</tr>').iterate();

  for (var i=0; i<rows_wOBA.length; i++) { //rows.length
    res2[i] = [];
    res2[i][0] = Parser.data(rows_wOBA[i]).from('/player/').to('/').build();

    var wOBAs = Parser.data(rows_wOBA[i]).from('<td align="left">').to('</td>').iterate();
    res2[i][1] = wOBAs[2];
  }

  //scraping PA against RHP
  var content = UrlFetchApp.fetch(mb18vR_PA_url).getContentText();
  var e = Parser.data(content).from('tbody').to('</tbody>').build();
  var rows = Parser.data(e).from('<tr class="player_row"').to('</tr>').iterate();

  for (var i=0; i<rows.length; i++) { //rows.length
    res[i] = [];
    res[i][0] = Parser.data(rows[i]).from('/player/').to('/').build();
    res[i][1] = [];

    //matching the MLB_ID with FG_ID
    var mlbID = res[i][0];
    for(var j = 0; j<mlbids.length;j++){
      if(mlbids[j] == mlbID){
        res[i][1] = fgids[j];
      }
    }

    var PAs = Parser.data(rows[i]).from('<td align="left">').to('</td>').iterate();
    res[i][2] = PAs[1];

    //matching the MLB_ID from PA (res) with SO (res1)
    res[i][3] = 0;
    for (var w=0; w<res1.length; w++) {
      if (res[i][0] == res1[w][0]) {
        res[i][3] = res1[w][1];
      }
    }

    //Calculating K%
    res[i][4] = res[i][3] / res[i][2]

    //matching the MLB_ID from PA (res) with wOBA (res1)
    res[i][5] = 0;
    for (var v=0; v<res2.length; v++) {
      if (res[i][0] == res2[v][0]) {
        res[i][5] = res2[v][1];
      }
    }
  }

  //pasting values
  mb18vR_sheet.getRange(2, 1, res.length, res[0].length).setValues(res);
}

回答1:

The issue you have is that you are forcing your script to loop through large datasets many many times for each row of compared data. A better approach is to build a lookup object, which maps between a desired unique identifier and the row of the data array you want to access:

/* Make an object from an Array[][] that has a unique identifier in one of the columns.
 * @param Array[][] data       The 2D array of data to index, e.g. [ [r1c1, r1c2, ...], [r2c1, r2c2, ...], ... ]
 * @param Integer   idColumn   The column in the data array that is a unique row identifier
                               e.g. the column index that contains the product's serial number, in a data
                               array that has only a single row per unique product.
   @return Object {}           An object that maps between an id and a row index, such that
                               `object[id]` = the row index for the specific row in data that has id = id
 */
function makeKey(data, idColumn) {
  if(!data || !data.length || !data[0].length)
    throw new ValueError("Input data argument is not Array[][]");
  // Assume the first column is the column with the unique identifier if not given by the caller.
  if(idColumn === undefined)
    idColumn = 0;

  var key = {};
  for(var r = 0, rows = data.length; r < rows; ++r) {
    var id = data[r][idColumn];
    if (key[id])
      throw new ValueError("ID is not unique for id='" + id + "'");

    key[id] = r;
  }
  return key;
}

Usage:

var database = someSheet.getDataRange().getValues();
var lookup = makeKey(database, 3); // here we say that the 4th column has the unique values.

var newData = /* read a 2D array from somewhere */;
for(var r = 0, rows < newData.length; r < rows; ++r) {
  var id = newData[r][3];
  var existingIndex = lookup[id];
  if (existingIndex) {
    var oldDataRow = database[existingIndex];
  } else {
    // No existing data.
  }
}

By making a lookup object for your data arrays, you no longer have to re-search them and make comparisons, because you did the search once and stored the relationship, rather than discarding it every time. Note that the key that was made is based on a specific (and unique) property of the data. Without that relationship, this particular indexing approach won't work - but a different one will.