Downloading N number of remote files using Node.js

2019-07-27 11:47发布

问题:

I'm working on a simple app using Node.js which needs to do the following when given a valid URL

  1. Retrieve the HTML of the remote page, save it locally.
  2. Spider the HTML (using cheerio) and record all JS and CSS file references.
  3. Make HTTP request for each JS/CSS file and save it to the server by file name.
  4. Zip up the html, css, and js files and stream the resulting file to the browser.

I've got 1 and 2 working, and the first half of #3, but I'm running into issues with the asynchronous nature of the downloads. My code runs ahead of the downloads: it generates the file names for the CSS and JS files, but none of the content. I'm guessing this is because my code isn't synchronous. The problem is that I don't know in advance how many files there might be, and all of them have to be there before the ZIP file can be generated.

Here's the flow of my app as it currently exists. I've left out the helper methods as they don't affect synchronicity. Can any of you provide input as to what I should do?

// Fetch the page at `fullurl`, save its HTML plus the JS/CSS files it references
// into a fresh directory, then zip that directory.
// getJS, getCSS, pw, rewritePaths and writeZip are helpers not shown here;
// presumably getJS/getCSS return arrays of URLs (they are iterated with forEach).
http.get(fullurl, function(res) {
    // NOTE(review): 'data' can fire several times per response (once per chunk),
    // so this handler may run on partial HTML and more than once — confirm
    // whether the body should be buffered until 'end' instead.
    res.on('data', function (chunk) {
        var $source = $(''+chunk),
            js = getJS($source, domain),
            css = getCSS($source, domain),
            uniqueName = pw(),
            dir = [baseDir,'jsd-', uniqueName, '/'].join(''),
            jsdir = dir + 'js/',
            cssdir = dir + 'css/',
            html = rewritePaths($source);

        // create tmp directory
        fs.mkdirSync(dir);

        console.log('creating index.html');

        // save index file
        fs.writeFileSync(dir + 'index.html', html);

        // create js directory
        fs.mkdirSync(jsdir);

        // Save JS files
        js.forEach(function(jsfile){
            // last path segment of the URL is used as the local file name
            var filename = jsfile.split('/').reverse()[0];
            // pipe() only starts the transfer; nothing here waits for the
            // file content to be written before moving on.
            request(jsfile).pipe(fs.createWriteStream(jsdir + filename));
            console.log('creating ' + filename);
        });

        // create css directory
        fs.mkdirSync(cssdir);

        // Save CSS files
        css.forEach(function(cssfile){
            // last path segment of the URL is used as the local file name
            var filename = cssfile.split('/').reverse()[0];
            // same as for JS: the download is started but not awaited
            request(cssfile).pipe(fs.createWriteStream(cssdir + filename));
            console.log('creating ' + filename);
        });

        // write zip file to /tmp
        // NOTE(review): called right after the downloads are started, so the
        // JS/CSS files may still be empty at this point.
        writeZip(dir,uniqueName);

        // https://npmjs.org/package/node-zip
        // http://stuk.github.com/jszip/

    });
}).on('error', function(e) {
    // only errors from the initial page request are reported here
    console.log("Got error: " + e.message);
});

回答1:

The way you are downloading files through the request module is asynchronous: the following line returns immediately, before the file has been written.

request(cssfile).pipe(fs.createWriteStream(cssdir + filename));

Instead of downloading like that, create a separate function that invokes a callback once the file has been written:

/**
 * Download a remote file to disk and report completion through a callback.
 *
 * The callback is invoked exactly once:
 *   - callback(null, localFile) after the local file has been closed, or
 *   - callback(err, null) on a request error, a non-200 response, or a
 *     failure while writing the local file.
 *
 * @param {string} localFile - path of the file to create
 * @param {string} remotePath - URL to download
 * @param {function(?Error, ?string)} callback - completion handler
 */
function download (localFile, remotePath, callback) {
    var finished = false;

    // Several events can signal completion (e.g. a stream 'error' followed
    // by 'close'); make sure the caller only hears about the first one.
    function done(err, result) {
        if (finished) {
            return;
        }
        finished = true;
        callback(err, result);
    }

    var out = request({ uri: remotePath });

    // Without this listener a network failure would never reach the caller
    // (and an unhandled 'error' event would throw).
    out.on('error', function (err) {
        done(err, null);
    });

    out.on('response', function (resp) {
        if (resp.statusCode !== 200) {
            done(new Error("No file found at given url."), null);
            return;
        }

        // Create the file only once we know there is content to store, so a
        // failed download does not leave an empty file behind.
        var localStream = fs.createWriteStream(localFile);
        localStream.on('error', function (err) {
            done(err, null);
        });
        localStream.on('close', function () {
            done(null, localFile);
        });
        out.pipe(localStream);
    });
}

Then use the async module by caolan (https://github.com/caolan/async) so that the next step runs only after every download in a batch has called back:

// Save JS files, then CSS files, and build the zip only after every
// download has reported completion.
async.forEach(js, function (jsfile, cb) {
    // last path segment of the URL is used as the local file name
    var filename = jsfile.split('/').reverse()[0];
    download(jsdir + filename, jsfile, function (err) {
        if (err) {
            // hand the failure to the final callback instead of dropping it
            return cb(err);
        }
        console.log('creating ' + filename);
        cb();
    });
}, function (err) {
    if (err) {
        // a missing file would produce an incomplete archive, so stop here
        console.log("Got error: " + err.message);
        return;
    }

    // create css directory
    fs.mkdirSync(cssdir);

    // Save CSS files the same way, so the zip step waits for them too
    async.forEach(css, function (cssfile, cb) {
        var filename = cssfile.split('/').reverse()[0];
        download(cssdir + filename, cssfile, function (err) {
            if (err) {
                return cb(err);
            }
            console.log('creating ' + filename);
            cb();
        });
    }, function (err) {
        if (err) {
            console.log("Got error: " + err.message);
            return;
        }

        // write zip file to /tmp
        writeZip(dir, uniqueName);
    });
});