JS - How to compute MD5 on binary data

2019-08-30 07:14发布

问题:

EDIT: changed title from "JS File API - write and read UTF-8 data is inconsistent" to reflect the actual question.

I have some binary content I need to calculate the MD5 of. The content is a WARC file, which means that it holds text as well as encoded images. To avoid errors when saving the file, I convert and store all the data in ArrayBuffers. All the data is put in Uint8Arrays to convert it to UTF-8.

My first attempt, for testing, is to use the saveAs library to save files from Chrome extensions. This means I was using a blob object to be passed on to the method and create the file.

// Combine the collected buffers into one Blob and save it through the saveAs library.
var b = new Blob(arrayBuffers, {type: "text/plain;charset=utf-8"});
saveAs(b,'name.warc');

I haven't found a tool to compute the MD5 from a Blob object so what I was doing was using a FileReader to read the blob file as binary data and then use an MD5 tool (I used cryptoJS as well as a tool from faultylabs) to compute the result.

// Read the blob as a binary string and log the MD5 computed over that string.
// NOTE(review): the hash input here is the binary-string form of the blob, not an ArrayBuffer.
f = new FileReader();
f.readAsBinaryString(b);
// Inside the handler, this.result is the FileReader's result for the finished read.
f.onloadend = function(a){
    console.log( 'Original file checksum: ', faultylabs.MD5(this.result) );
}

The resources (images) are downloaded directly in arraybuffer format so I have no need to convert them.

The result was wrong, meaning that computing the MD5 in the code and computing it from the file I saved on my local machine gave two different results. Reading the blob as text, obviously, throws an error.

The workaround I found consists of writing the blob object to disk using the FileSystem API, reading it back as binary data, computing the MD5, and then saving that retrieved file as the WARC file (not the blob object directly, but this "refreshed" version of the file). In this case the computed MD5 is fine (I calculate it on the "refreshed" version of the WARC file), but when I launch the WARC replay instance with the "refreshed" WARC archive, it throws errors, while with the original file I don't have any problem (but the MD5 is not correct).

// Form data object that is later passed to computeWARC_MD5 and uploaded.
var fd = new FormData();

// To compute the md5 hash and to have it correct on the server side, we need to write the file to the system, read it back and then calculate the md5 value.
// We need to send this version of the warc file to the server as well.
// Use the webkit-prefixed entry point when the unprefixed one is not defined.
window.requestFileSystem  = window.requestFileSystem || window.webkitRequestFileSystem;

/**
 * Writes the global blob `b` to the temporary sandboxed filesystem as
 * 'warc.warc', reads that file back as a binary string and hashes the string
 * with faultylabs.MD5. The re-read content is wrapped in a new Blob, saved
 * through saveAs under `o_request.file`, and appended to `formData` together
 * with the lower-cased checksum before `callback(formData)` is invoked.
 *
 * NOTE(review): the new Blob is built from a binary *string*; the Blob
 * constructor encodes string parts as UTF-8, so its bytes may not match the
 * file that was hashed. Confirm before relying on the uploaded copy.
 *
 * @param {function} callback - Invoked with `formData` once both fields are appended.
 * @param {FormData} formData - Receives the 'warc_file' and 'warc_checksum_md5' fields.
 */
function computeWARC_MD5(callback,formData) {
    var WARC_BLOB_OPTIONS = {type: "text/plain;charset=utf-8"};

    // Final step: hash the re-read content, save it and pass the form data on.
    function publish(binaryContent) {
        var checksum = faultylabs.MD5( binaryContent );
        console.log(checksum);
        var refreshed = new Blob([binaryContent], WARC_BLOB_OPTIONS);
        saveAs(refreshed, o_request.file);
        formData.append('warc_file', refreshed);
        formData.append('warc_checksum_md5', checksum.toLowerCase());
        callback(formData);
    }

    // Second step: open the file that was just written and read it as a binary string.
    function readBack(fs) {
        fs.root.getFile('warc.warc', {}, function(entry) {
            entry.file(function(storedFile) {
                var reader = new FileReader();
                reader.onloadend = function() {
                    publish(this.result);
                };
                reader.readAsBinaryString(storedFile);
            });
        });
    }

    // First step: create the file and write the blob; reading starts when the write ends.
    function writeOut(fs) {
        fs.root.getFile('warc.warc', {create: true}, function(entry) {
            entry.createWriter(function(writer) {
                writer.onwriteend = function() {
                    readBack(fs);
                };
                writer.onerror = function(err) {
                    console.error('Write failed: ' + err.toString());
                };
                writer.write(b);
            });
        });
    }

    window.requestFileSystem(window.TEMPORARY, b.size, writeOut);
}

/**
 * POSTs the given form data to `server_URL_upload` and logs upload progress.
 *
 * @param {FormData} formData - Payload to send; it is passed to jQuery untouched
 *     (processData and contentType are disabled so the browser builds the
 *     multipart body and its boundary itself).
 */
function uploadData(formData) {
    // upload
    $.ajax({
        type: 'POST',
        url: server_URL_upload,
        // Send the argument that was passed in rather than a global form object.
        data: formData,
        processData: false,
        contentType: false,
        // [SPECS] fire a progress event named progress at the XMLHttpRequestUpload object about every 50ms or for every byte transmitted, whichever is least frequent
        // The handler has to live on xhr.upload: a handler placed on the XHR itself
        // (which is what xhrFields does) reports response download progress instead.
        xhr: function () {
            var xhr = $.ajaxSettings.xhr();
            if (xhr.upload) {
                xhr.upload.onprogress = function (e) {
                    if (e.lengthComputable) {
                        console.log(e.loaded / e.total * 100 + '%');
                    }
                };
            }
            return xhr;
        }
    }).done(function(data) {
       console.log('done uploading!');
       //displayMessage(port_to_page, 'Upload finished!', 'normal')
       //port_to_page.postMessage( { method:"doneUpload" } );
    }).fail(function(jqXHR, textStatus, errorThrown) {
       // Surface failed uploads instead of dropping them silently.
       console.error('Upload failed: ' + textStatus, errorThrown);
    });
}
// Start the write/read/hash round-trip; uploadData is the callback that receives fd once both fields are appended.
computeWARC_MD5(uploadData, fd);
// Separately save the original blob b (not the re-read copy) as 'warc.warc'.
saveAs(b, 'warc.warc');

Could anybody explain to me why there is this discrepancy? What am I missing in treating all the objects I am dealing with as binary data (store, read)?

回答1:

Basically I tried another route and converted the blob file back to arraybuffer and computed the MD5 on that. At that point, the file's MD5 and the arraybuffer's are the same.

// Build the WARC blob from the collected buffers.
var b = new Blob(arrayBuffers, {type: "text/plain;charset=utf-8"});
var blobHtml = new Blob( [str2ab(o_request.main_page_html)], {type: "text/plain;charset=utf-8"} );

// Read the blob back as an ArrayBuffer and hash those bytes, then upload the
// same blob together with its checksum.
var f = new FileReader();
// Use onload (success only) rather than onloadend, which also fires after a
// failed read when this.result is null.
f.onload = function(a){
  var warcMD5 = faultylabs.MD5(this.result);
  var fd = new FormData();
  fd.append('warc_file', b);
  fd.append('warc_checksum_md5', warcMD5.toLowerCase());

  uploadData(fd);
};
f.onerror = function(a){
  console.error('Could not read the WARC blob: ', this.error);
};
// Start the read only after the handlers are attached.
f.readAsArrayBuffer(b);

I guess the result from a binary string and from an ArrayBuffer is different, which is why the MD5 is also inconsistent.