Node.js Azure SDK getBlobToStream uses lots of memory

Posted 2019-08-20 17:25

Question:

I am writing a backup script that simply downloads all the blobs in all the blob containers of a specific Azure account.

The script uses async.js to make sure only so many workers can run at the same time, so it doesn't overload the server. The script works fine, but when it hits large files it runs out of memory. I'm guessing the download runs faster than the disk can write, and the in-memory buffer eventually fills up so badly that I run out of memory entirely, but debugging the exact cause has been impossible so far.

The specific function which appears to use a lot of memory is called as follows:

blobService.getBlobToStream(
  containerName,
  blob.name,
  fs.createWriteStream(fullPath),
  function(error) {
    if(error) { //Something went wrong, write it to the console but finish the queue item and continue.
      console.log("Failed writing " + blob.name + " (" + error + ")");
      callback();
    }
    else { //Write the last modified date and finish the queue item silently
      fs.writeFile(fullPath + ".date", blobLastModified.toString(), function(err) {
        if(err) console.log("Couldn't write .date file: " + err);
      });
      callback();
    }
  });

Even a single 700MB download will easily fill up 1GB of memory on my side.

Is there any way around this? Am I missing a parameter which magically prevents the Azure SDK from buffering everything and the kitchen sink?

Full code:

#!/usr/bin/env node

//Requires
var azure = require('azure');
var fs    = require('fs');
var mkdirp = require('mkdirp');
var path  = require('path');
var async = require('async');

var maxconcurrency = 1; //Max number of getBlobsAndSaveThem() workers running simultaneously through async.js.

var blobService = azure.createBlobService();

var backupPrefix = '/backups/azurebackup/'; //Always end with a '/'!!

//Main flow of the script is near the bottom of the file.
var containerProcessingQueue = async.queue(
  function getBlobsAndSaveThem(containerName) {
    console.log(containerName); //DEBUG
    blobService.listBlobs(containerName, function(error, blobs) {
      if(!error) {
        var blobProcessingQueue = async.queue(function(index, callback) {
          var blob = blobs[index];
          console.log(blob); //DEBUG
          var fullPath = backupPrefix + containerName + '/' + blob.name;
          var blobLastModified = new Date(blob.properties['last-modified']);

          //Only create the directory if it doesn't exist yet. Do it sync, because otherwise it'll check 99999 times if the directory exists simultaneously, not find it, then fail to create it 99998 times.
          //Note: mkdirp.sync takes no callback; it throws on failure.
          if(!fs.existsSync(path.dirname(fullPath))) {
            try {
              mkdirp.sync(path.dirname(fullPath));
            } catch(err) {
              console.log('Failed to create directory ' + path.dirname(fullPath) + " (" + err + ")");
            }
          }

          if(fs.existsSync(fullPath + ".date")) {
            if(blobLastModified.toString() == fs.readFileSync(fullPath + ".date").toString()) {
              callback();
              return; //If the file is unmodified, return. No this won't exit the program, because it's called within a function definition (async.queue(function ...))
            }
          }

          blobService.getBlobToStream(
            containerName,
            blob.name,
            fs.createWriteStream(fullPath),
            function(error) {
              if(error) { //Something went wrong, write it to the console but finish the queue item and continue.
                console.log("Failed writing " + blob.name + " (" + error + ")");
                callback();
              }
              else { //Write the last modified date and finish the queue item silently
                fs.writeFile(fullPath + ".date", blobLastModified.toString(), function(err) {
                  if(err) console.log("Couldn't write .date file: " + err);
                });
                callback();
              }
            });
        }, maxconcurrency);

        //Push new items to the queue for processing
        for(var blobindex in blobs) {
          blobProcessingQueue.push(blobindex);
        }
      }
      else {
        console.log("An error occurred listing the blobs: " + error);
      }
    });
  }, 1);

blobService.listContainers(function(err, result) {
  if(err) {
    console.log("An error occurred listing the containers: " + err);
    return;
  }
  for(var i = 0; i < result.length; i++) {
    containerProcessingQueue.push(result[i].name);
  }
});

Answer 1:

One thing you could do is read only a chunk of data into the stream at a time instead of the whole blob: append each chunk to the file, then read the next one. The Blob Storage service supports that. If you look at the source code for getBlobToStream (https://github.com/WindowsAzure/azure-sdk-for-node/blob/master/lib/services/blob/blobservice.js), you can specify from/to bytes in the options: rangeStartHeader and rangeEndHeader. See if that helps.

I have hacked together some code which does just that (as you can see from my code, my knowledge of node.js is quite primitive :)). [Please use this code just to get an idea of how you can do a chunked download, as I think it still has some glitches.]

var azure = require('azure');
var fs = require('fs');

var blobService = azure.createBlobService("account", "accountkey");
var containerName = "container name";
var blobName = "blob name";
var blobSize;
var chunkSize = 1024 * 512;//chunk size -- we'll read 512 KB at a time.
var startPos = 0;
var fullPath = "D:\\node\\";
blobService.getBlobProperties(containerName, blobName, null, function (error, blob) {
    if (error) {
        throw error;
    }
    else {
        blobSize = blob.contentLength;
        fullPath = fullPath + blobName;
        console.log(fullPath);
        doDownload();
    }
});

function doDownload() {
    var stream = fs.createWriteStream(fullPath, {flags: 'a'});
    var endPos = startPos + chunkSize;
    if (endPos > blobSize) {
        endPos = blobSize;
    }
    console.log("Downloading " + (endPos - startPos) + " bytes starting from " + startPos + " marker.");
    blobService.getBlobToStream("test", blobName, stream, 
        { "rangeStartHeader": startPos, "rangeEndHeader": endPos-1 }, function(error) {
        if (error) {
            throw error;
        }
        else if (!error) {
            startPos = endPos;
            if (startPos <= blobSize - 1) {
                doDownload();
            }
        }
    });
}
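To plug this into the original queue-based script, one option is to wrap the chunk loop in a function that takes a completion callback, so it can stand in for the single getBlobToStream call inside the async.queue worker. A minimal sketch under the same assumptions as the code above (same blobService and fs, same per-chunk append streams, same old rangeStartHeader/rangeEndHeader option names); the function name getBlobToFileInChunks is just for illustration:

function getBlobToFileInChunks(containerName, blobName, blobSize, fullPath, done) {
    var chunkSize = 1024 * 512; //512 KB per request.
    var startPos = 0;

    function nextChunk() {
        var endPos = Math.min(startPos + chunkSize, blobSize);
        //One append-mode stream per chunk, as in the code above.
        //Assumes fullPath doesn't exist yet, or appending would duplicate data.
        var stream = fs.createWriteStream(fullPath, { flags: 'a' });
        blobService.getBlobToStream(containerName, blobName, stream,
            { "rangeStartHeader": startPos, "rangeEndHeader": endPos - 1 },
            function (error) {
                if (error) { return done(error); } //Give up on the first failed chunk.
                startPos = endPos;
                if (startPos <= blobSize - 1) {
                    nextChunk(); //Bytes left: fetch the next range.
                } else {
                    done(null); //Whole blob written.
                }
            });
    }

    nextChunk();
}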


Answer 2:

For all those now curious: the option names for the start and end of the range have since changed. They are now just rangeStart and rangeEnd. See the Azure Node storage documentation for more help: http://dl.windowsazure.com/nodestoragedocs/BlobService.html
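As a minimal sketch of what that looks like, assuming getBlobToStream keeps the same (container, blob, stream, options, callback) shape as above and only the option names changed (the container, blob, and file path here are placeholders):

var azure = require('azure');
var fs = require('fs');

var blobService = azure.createBlobService();

//Download only the first 512 KB of the blob using the renamed options.
blobService.getBlobToStream(
    'container name',
    'blob name',
    fs.createWriteStream('/tmp/blob.part', { flags: 'a' }),
    { rangeStart: 0, rangeEnd: 1024 * 512 - 1 },
    function (error) {
        if (error) {
            console.log('Ranged download failed: ' + error);
        }
    });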