I am running a server using Node.js and need to request data from another server that I am running (localhost:3001). I need to make many requests (~200) to the data server and collect the data (response sizes vary from ~20 KB to ~20 MB). Each request is independent, and I would like to save the responses as one giant array of the form:
[{"urlAAA": responseAAA}, {"urlCCC": responseCCC}, {"urlBBB": responseBBB}, etc ]
Notice that the order of the items is unimportant; they should ideally fill the array in the order that the data becomes available.
var express = require('express');
var router = express.Router();
var async = require("async");
var papa = require("papaparse");
var sync_request = require('sync-request');
var request = require("request");

var pinnacle_data = {};
var lookup_list = [];

for (var i = 0; i < 20; i++) {
    lookup_list.push(i);
}

function write_delayed_files(object, key, value) {
    object[key] = value;
    return;
}

var show_file = function (file_number) {
    var file_index = Math.round(Math.random() * 495) + 1;
    var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index.toString();
    var response_json = sync_request('GET', pinnacle_file_index);
    var pinnacle_json = JSON.parse(response_json.getBody('utf8'));
    var object_key = "file_" + file_number.toString();
    pinnacle_data[object_key] = pinnacle_json;
    console.log("We've handled file: " + file_number);
    return;
};

async.each(lookup_list, show_file, function (err) {});

console.log(pinnacle_data);

/* GET contact us page. */
router.get('/', function (req, res, next) {
    res.render('predictionsWtaLinks', {title: 'Async Trial'});
});

module.exports = router;
Now when this program is run it displays:
We've handled file: 0
We've handled file: 1
We've handled file: 2
We've handled file: 3
We've handled file: 4
We've handled file: 5
etc
Now, as the files are of such variable size, I was expecting this to perform the requests "in parallel", but it seems to perform them sequentially, which is what I was trying to avoid by using async.each(). Currently it takes about 1-2 s to connect to the data server, so performing this over many files is taking too long.
I realise I am using synchronous requests, and so would ideally like to replace:
var response_json = sync_request('GET', pinnacle_file_index);
with something similar to:
request(pinnacle_file_index, function (error, response, body) {
    if (!error && response.statusCode == 200) {
        pinnacle_data[object_key] = JSON.parse(body);
    }
});
Any help would be much appreciated.
Additionally I have looked at trying:

- Converting the list of URLs into a list of anonymous functions and using async.parallel(function_list, function (err, results) { /* add results to pinnacle_data[] */ }). (I have encountered problems trying to define unique functions for each element in the array.)
Similarly I have looked at other related topics:

- Asynchronous http calls with nodeJS - I have tried to mimic the suggested solutions, with no progress.
- How to do parallel async multiple requests at once with Promises in Node
EDIT - WORKING SOLUTION
The following code now does the task (taking ~80 ms per request, including having to make repeated requests using npm requestretry). Similarly this scales very well, with an average request time of ~80 ms for anywhere between 5 and 1000 requests in total.
var performance = require("performance-now");
var time_start = performance();
var async = require("async");
var request_retry = require('requestretry');

var lookup_list = [];
var total_requests = 50;

for (var i = 0; i < total_requests; i++) {
    lookup_list.push(i);
}

var pinnacle_data = {};

async.map(lookup_list, function (item, callback) {
    var file_index = Math.round(Math.random() * 495) + 1;
    var pinnacle_file_index = 'http://localhost:3001/generate?file=' + file_index;
    request_retry({
            url: pinnacle_file_index,
            maxAttempts: 20,
            retryDelay: 20,
            retryStrategy: request_retry.RetryStrategies.HTTPOrNetworkError
        },
        function (error, response, body) {
            if (!error && response.statusCode == 200) {
                body = JSON.parse(body);
                var data_array = {};
                data_array[file_index.toString()] = body;
                callback(null, data_array);
            } else {
                console.log(error);
                callback(error || response.statusCode);
            }
        });
}, function (err, results) {
    var time_finish = performance();
    console.log("It took " + (time_finish - time_start).toFixed(3) + "ms to complete " + total_requests + " requests.");
    console.log("This gives an average rate of " + ((time_finish - time_start) / total_requests).toFixed(3) + " ms/request");
    if (!err) {
        // Flatten the array of single-key objects into one object.
        for (var i = 0; i < results.length; i++) {
            for (var key in results[i]) {
                pinnacle_data[key] = results[i][key];
            }
        }
        var length_array = Object.keys(pinnacle_data).length.toString();
        console.log("We've got all the data, totalling " + length_array + " unique entries.");
    } else {
        console.log("We had an error somewhere.");
    }
});
Thanks for the help.
As you have discovered, async.parallel() can only parallelize operations that are themselves asynchronous. If the operations are synchronous, then because of the single-threaded nature of node.js the operations will run one after another, not in parallel. But if the operations are themselves asynchronous, then async.parallel() (or other async methods) will start them all at once and coordinate the results for you.

Here's a general idea using async.map(). I used async.map() because it takes an array as input and produces an array of results in the same order as the original, but runs all the requests in parallel, which seems to line up with what you want:
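Something along these lines, for example, assuming the same localhost:3001/generate endpoint and the request module from the question:

var async = require("async");
var request = require("request");

// Build the list of URLs to fetch (assumed endpoint from the question).
var urls = [];
for (var i = 1; i <= 20; i++) {
    urls.push('http://localhost:3001/generate?file=' + i);
}

async.map(urls, function (url, callback) {
    // Each iterator call starts an asynchronous request immediately,
    // so all the requests are in flight at the same time.
    request(url, function (error, response, body) {
        if (!error && response.statusCode === 200) {
            var item = {};
            item[url] = JSON.parse(body);   // one {"url": response} object
            callback(null, item);
        } else {
            callback(error || new Error("HTTP " + response.statusCode));
        }
    });
}, function (err, results) {
    if (err) {
        console.log(err);
    } else {
        // results is an array of {"url": response} objects, in the same
        // order as the urls array.
        console.log(results);
    }
});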
And here's a version using Bluebird promises, somewhat similarly using Promise.map() to iterate the initial array:
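For example, a sketch assuming the npm modules bluebird and request-promise:

var Promise = require("bluebird");
var rp = require("request-promise");

var urls = [];
for (var i = 1; i <= 20; i++) {
    urls.push('http://localhost:3001/generate?file=' + i);
}

// Promise.map runs the requests in parallel (concurrency is configurable)
// and resolves with an array of results in input order.
Promise.map(urls, function (url) {
    return rp(url).then(function (body) {
        var item = {};
        item[url] = JSON.parse(body);
        return item;
    });
}, {concurrency: 10}).then(function (results) {
    console.log(results);
}).catch(function (err) {
    console.log(err);
});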
Try this:
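For example, a minimal rework of the original code as a sketch, reusing async.each and request; the final console.log moves into the completion callback, which runs only once every request has finished:

var async = require("async");
var request = require("request");

var pinnacle_data = {};
var lookup_list = [];
for (var i = 0; i < 20; i++) {
    lookup_list.push(i);
}

async.each(lookup_list, function (file_number, callback) {
    var file_index = Math.round(Math.random() * 495) + 1;
    var url = 'http://localhost:3001/generate?file=' + file_index;
    // The iterator now returns immediately and signals completion
    // through the callback, so async.each can run all items at once.
    request(url, function (error, response, body) {
        if (!error && response.statusCode === 200) {
            pinnacle_data["file_" + file_number] = JSON.parse(body);
            console.log("We've handled file: " + file_number);
            callback(null);
        } else {
            callback(error || new Error("HTTP " + response.statusCode));
        }
    });
}, function (err) {
    if (err) {
        console.log(err);
    } else {
        // Only now is pinnacle_data fully populated.
        console.log(pinnacle_data);
    }
});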
Sounds like you're just trying to download a bunch of URLs in parallel. This will do that:
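For example, a sketch using nothing but the request module and a counter to detect when the last response has arrived (the urls array of endpoints is an assumption):

var request = require("request");

var urls = [];
for (var i = 1; i <= 200; i++) {
    urls.push('http://localhost:3001/generate?file=' + i);
}

var results = [];
var pending = urls.length;

urls.forEach(function (url) {
    // Fire off all the requests at once; the callbacks arrive in
    // whatever order the responses complete.
    request(url, function (error, response, body) {
        if (!error && response.statusCode === 200) {
            var item = {};
            item[url] = body;
            results.push(item);     // filled in completion order
        }
        if (--pending === 0) {
            // Every request has either succeeded or failed.
            console.log("Downloaded " + results.length + " of " + urls.length + " URLs.");
        }
    });
});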
or even simpler, using async.map:
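For example, a sketch assuming the same urls array:

var async = require("async");
var request = require("request");

async.map(urls, function (url, callback) {
    request(url, function (error, response, body) {
        // Pass each body straight through; async.map collects them
        // into an array in the same order as urls.
        callback(error, body);
    });
}, function (err, bodies) {
    if (err) {
        console.log(err);
    } else {
        console.log("Got " + bodies.length + " responses.");
    }
});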