Download and split a large file into 100 MB chunks

Published 2020-07-27 04:54

Question:

I have a 2GB file in blob storage and am building a console application that will download this file to the desktop. The requirement is to split it into 100MB chunks and append a number to each filename. I do not need to re-combine those files again. What I need is only the chunks of files.

I currently have this code from Azure download blob part

But I cannot figure out how to stop downloading when the file size is already 100MB and create a new one.

Any help will be appreciated.

Update: Here is my code

// Download a blob to a pre-allocated local file by fetching it in 1 MB ranges.
CloudStorageAccount account = CloudStorageAccount.Parse(connectionString);
var blobClient = account.CreateCloudBlobClient();
var container = blobClient.GetContainerReference(containerName);
var file = uri;
var blob = container.GetBlockBlobReference(file);

// Fetch the blob's metadata so we know its total size up front.
blob.FetchAttributes();
var blobSize = blob.Properties.Length;
long blockSize = 1 * 1024 * 1024; // download-range size: 1 MB
blockSize = Math.Min(blobSize, blockSize);

var blobRequestOptions = new BlobRequestOptions
{
    RetryPolicy = new ExponentialRetry(TimeSpan.FromSeconds(5), 3),
    MaximumExecutionTime = TimeSpan.FromMinutes(60),
    ServerTimeout = TimeSpan.FromMinutes(60)
};

long currentPointer = 0;
long bytesRemaining = blobSize;

// Open (and pre-size) the target file ONCE. The original created the file,
// closed it, then re-opened a new FileStream for every single 1 MB range.
using (FileStream fs = new FileStream(file, FileMode.Create))
{
    fs.SetLength(blobSize); // pre-allocate so ranged writes land at the right offsets

    do
    {
        var bytesToFetch = Math.Min(blockSize, bytesRemaining);
        using (MemoryStream ms = new MemoryStream())
        {
            // Download the next range (at most 1 MB) into memory.
            blob.DownloadRangeToStream(ms, currentPointer, bytesToFetch, null, blobRequestOptions);
            var contents = ms.ToArray();

            fs.Position = currentPointer; // seek to this range's offset (NOT the end of the file)
            fs.Write(contents, 0, contents.Length);

            currentPointer += contents.Length;
            bytesRemaining -= contents.Length;

            // Report progress from bytes actually written; the original used a
            // separate startPosition counter that could overshoot the real total
            // on the final (short) range.
            Console.WriteLine(fileName + dateTimeStamp + ".csv " + (currentPointer / 1024 / 1024) + "/" + (blob.Properties.Length / 1024 / 1024) + " MB downloaded...");
        }
    }
    while (bytesRemaining > 0);
}

Answer 1:

Per my understanding, you could break your blob file into your expected pieces (100MB), then leverage CloudBlockBlob.DownloadRangeToStream to download each of your chunks of files. Here is my code snippet, you could refer to it:

ParallelDownloadBlob

/// <summary>
/// Downloads the byte range [startRange, endRange) of <paramref name="blob"/> into
/// <paramref name="outPutStream"/>, fetching up to five 1 MB sub-ranges in parallel.
/// </summary>
/// <param name="outPutStream">Seekable, writable destination; writes are serialized via a lock.</param>
/// <param name="blob">Source block blob.</param>
/// <param name="startRange">Inclusive start offset within the blob.</param>
/// <param name="endRange">Exclusive end offset within the blob.</param>
private static void ParallelDownloadBlob(Stream outPutStream, CloudBlockBlob blob,long startRange,long endRange)
{
    // NOTE: the original called blob.FetchAttributes() here on every invocation
    // but never used the result — dropped to save one round trip per chunk file.
    int bufferLength = 1 * 1024 * 1024;//1 MB chunk for download
    long blobRemainingLength = endRange - startRange;

    // Pre-compute the (offset, length) sub-ranges to download.
    Queue<KeyValuePair<long, long>> queues = new Queue<KeyValuePair<long, long>>();
    long offset = startRange;
    while (blobRemainingLength > 0)
    {
        long chunkLength = Math.Min(bufferLength, blobRemainingLength);
        queues.Enqueue(new KeyValuePair<long, long>(offset, chunkLength));
        offset += chunkLength;
        blobRemainingLength -= chunkLength;
    }

    Parallel.ForEach(queues,
        new ParallelOptions()
        {
            MaxDegreeOfParallelism = 5
        }, (queue) =>
        {
            using (var ms = new MemoryStream())
            {
                blob.DownloadRangeToStream(ms, queue.Key, queue.Value);
                // Seek + write must be atomic with respect to the other workers,
                // so both happen under the same lock.
                lock (outPutStream)
                {
                    outPutStream.Position = queue.Key - startRange;
                    var bytes = ms.ToArray();
                    outPutStream.Write(bytes, 0, bytes.Length);
                }
            }
        });
}

Program Main

// Split the blob into fixed-size chunk files, downloading each range in parallel.
var container = storageAccount.CreateCloudBlobClient().GetContainerReference(defaultContainerName);
var blob = container.GetBlockBlobReference("code.txt");
blob.FetchAttributes(); // populates blob.Properties.Length
long blobTotalLength = blob.Properties.Length;
long chunkLength = 10 * 1024; //divide blob file into each file with 10KB in size

// Use '<' rather than the original '<=': with '<=', a blob whose size is an
// exact multiple of chunkLength produced one extra, zero-byte chunk file for
// the empty range [total, total).
for (long i = 0; i < blobTotalLength; i += chunkLength)
{
    long startRange = i;
    long endRange = Math.Min(i + chunkLength, blobTotalLength); // clamp the final chunk

    using (var fs = new FileStream(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, $"resources\\code_[{startRange}]_[{endRange}].txt"), FileMode.Create))
    {
        Console.WriteLine($"\nParallelDownloadBlob from range [{startRange}] to [{endRange}] start...");
        Stopwatch sp = new Stopwatch();
        sp.Start();

        ParallelDownloadBlob(fs, blob, startRange, endRange);
        sp.Stop();
        Console.WriteLine($"download done, time cost:{sp.ElapsedMilliseconds / 1000.0}s");
    }
}

RESULT

UPDATE:

Based on your requirement, I recommend that you could download your blob into a single file, then leverage LumenWorks.Framework.IO to read your large file records line by line, then check the byte size you have read and save into a new csv file with the size up to 100MB. Here is a code snippet, you could refer to it:

// Read data.csv with the LumenWorks CsvReader and print every field of every
// record as "header = value;" (the size-based file splitting is left as a TODO).
using (CsvReader csv = new CsvReader(new StreamReader("data.csv"), true))
{
    string[] headers = csv.GetFieldHeaders();
    int fieldCount = csv.FieldCount;

    while (csv.ReadNextRecord())
    {
        for (int col = 0; col < fieldCount; col++)
        {
            var value = csv[col];
            Console.Write(string.Format("{0} = {1};", headers[col], value == null ? "MISSING" : value));
        }
        //TODO:
        //1.Read the current record, check the total bytes you have read;
        //2.Create a new csv file if the current total bytes up to 100MB, then save the current record to the current CSV file.
    }
}

Additionally, you could refer to A Fast CSV Reader and CsvHelper for more details.

UPDATE2

Code sample for breaking a large CSV file into smaller CSV files of a fixed byte size. I used CsvHelper 2.16.3 for the following code snippet; you could refer to it:

// Split a large CSV into multiple smaller CSV files, each capped at roughly
// chunkSize bytes, repeating the header row in every output file (CsvHelper 2.16.3).
string[] headers = new string[0];
using (var sr = new StreamReader(@"C:\Users\v-brucch\Desktop\BlobHourMetrics.csv")) //83.9KB
{
    using (CsvHelper.CsvReader csvReader = new CsvHelper.CsvReader(sr,
        new CsvHelper.Configuration.CsvConfiguration()
        {
            Delimiter = ",",
            Encoding = Encoding.UTF8
        }))
    {
        // Capture the header row so it can be re-emitted at the top of each chunk.
        if (csvReader.ReadHeader())
        {
            headers = csvReader.FieldHeaders;
        }

        TextWriter writer = null;
        CsvWriter csvWriter = null;
        long readBytesCount = 0;
        long chunkSize = 30 * 1024; //divide CSV file into each CSV file with byte size up to 30KB

        try
        {
            while (csvReader.Read())
            {
                var curRecord = csvReader.CurrentRecord;
                // Approximate record size: UTF-8 bytes of all fields plus one byte
                // per separator (headers.Count() delimiters/newline).
                var curRecordByteCount = curRecord.Sum(r => Encoding.UTF8.GetByteCount(r)) + headers.Count() + 1;
                readBytesCount += curRecordByteCount;

                // Start a new chunk file on the first record, and whenever the
                // running byte count would exceed the chunk size.
                if (writer == null || readBytesCount > chunkSize)
                {
                    // Reset the counter to this record plus the header row that
                    // is about to be written into the new file.
                    readBytesCount = curRecordByteCount + headers.Sum(h => Encoding.UTF8.GetByteCount(h)) + headers.Count() + 1;
                    if (writer != null)
                    {
                        writer.Flush();
                        writer.Close();
                    }
                    string fileName = $"BlobHourMetrics_{Guid.NewGuid()}.csv";
                    writer = new StreamWriter(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, fileName), true);
                    csvWriter = new CsvWriter(writer);
                    csvWriter.Configuration.Encoding = Encoding.UTF8;
                    //output header field
                    foreach (var header in headers)
                    {
                        csvWriter.WriteField(header);
                    }
                    csvWriter.NextRecord();
                }
                //output record field
                foreach (var field in curRecord)
                {
                    csvWriter.WriteField(field);
                }
                csvWriter.NextRecord();
            }
        }
        finally
        {
            // Always flush and close the last chunk, even when reading or writing
            // throws mid-file — the original leaked the open writer on exceptions.
            if (writer != null)
            {
                writer.Flush();
                writer.Close();
            }
        }
    }
}

RESULT