EDIT: changed title from "JS File API - write and read UTF-8 data is inconsistent" to reflect the actual question.
I have some binary content i need to calculate the MD5 of. The content is a WARC file, that means that it holds text as well as encoded images. To avoid errors in the file saving, I convert and store all the data in arrayBuffers. All the data is put in UInt8Array
s to convert it to UTF-8.
My first attempt, for testing, is to use the saveAs
library to save files from Chrome extensions. This means I was using a blob object to be passed on to the method and create the file.
var b = new Blob(arrayBuffers, {type: "text/plain;charset=utf-8"});
saveAs(b,'name.warc');
I haven't found a tool to compute the MD5 from a Blob
object so what I was doing was using a FileReader
to read the blob file as binary data and then use an MD5 tool (I used cryptoJS as well as a tool from faultylabs) to compute the result.
f = new FileReader();
f.readAsBinaryString(b);
f.onloadend = function(a){
console.log( 'Original file checksum: ', faultylabs.MD5(this.result) );
}
The resources (images) are downloaded directly in arraybuffer
format so I have no need to convert them.
The result was wrong, meaning that checking the MD5 from the code and checking it from the file I saved on my local machine gave 2 different results. Reading as text, obviously shoots out an error.
The workaround I found, consists in writing the blob object on the disk using the filesystem API and then read it back as binary data, compute the MD5 and then save that retrieved file as WARC file (not directly the blob object but this "refreshed" version of the file). In this case the computed MD5 is fine ( I calculate it on the "refreshed" version of the warc file) but when I launch the WARC replay instance with the "refreshed" warc archive, it throws me errors - while with the original file I don't have any problem (but the MD5 is not correct).
var fd = new FormData();
// To compute the md5 hash and to have it correct on the server side, we need to write the file to the system, read it back and then calculate the md5 value.
// We need to send this version of the warc file to the server as well.
window.requestFileSystem = window.requestFileSystem || window.webkitRequestFileSystem;
function computeWARC_MD5(callback,formData) {
window.requestFileSystem(window.TEMPORARY, b.size, onInitFs);
function onInitFs(fs) {
fs.root.getFile('warc.warc', {create: true}, function(fileEntry) {
fileEntry.createWriter(function(fileWriter) {
fileWriter.onwriteend = function(e) {
readAndMD5();
};
fileWriter.onerror = function(e) {
console.error('Write failed: ' + e.toString());
};
fileWriter.write(b);
});
});
function readAndMD5() {
fs.root.getFile('warc.warc', {}, function(fileEntry) {
fileEntry.file( function(file) {
var reader = new FileReader();
reader.onloadend = function(e) {
var warcMD5 = faultylabs.MD5( this.result );
console.log(warcMD5);
var g = new Blob([this.result],{type: "text/plain;charset=utf-8"});
saveAs(g, o_request.file);
formData.append('warc_file', g)
formData.append('warc_checksum_md5', warcMD5.toLowerCase());
callback(formData);
};
reader.readAsBinaryString(file);
});
});
}
}
}
function uploadData(formData) {
// upload
$.ajax({
type: 'POST',
url: server_URL_upload,
data: fd,
processData: false,
contentType: false,
// [SPECS] fire a progress event named progress at the XMLHttpRequestUpload object about every 50ms or for every byte transmitted, whichever is least frequent
xhrFields: {
onprogress: function (e) {
if (e.lengthComputable) {
console.log(e.loaded / e.total * 100 + '%');
}
}
}
}).done(function(data) {
console.log('done uploading!');
//displayMessage(port_to_page, 'Upload finished!', 'normal')
//port_to_page.postMessage( { method:"doneUpload" } );
});
}
computeWARC_MD5(uploadData, fd);
saveAs(b, 'warc.warc');
Could anybody explain me why there is this discrepancy? What am I missing in treating all the objects I am dealing with as binary data (store, read)?