UTF-8: should it hold that string = encode(decode(string))?

2019-08-29 08:37发布

问题:

In a JavaScript Chrome extension, I am trying to transfer a binary string containing a PDF file to an API (Evernote) that uses a thrift.js library. After some attempts, I discovered that the string (which is already a UTF-8 file) is being UTF-8 encoded in the thrift.js library when it is shipped off, and the resulting PDF file appears botched on the server (see my previous question for details). (See below for the encode() method that the library uses.)

Trying to tweak the library and just skip the encode() call leads to the file being somehow lost on the server side or in transport. So I realized that I need to decode() my string before I send it off, and I copied into my code a decode() function from that library. So my string is essentially subjected to decode() in my code and then to encode() in the library code.

It got better, but the file is still botched at higher-order characters. I am attaching an image with the original and resulting files. I made a simple check to see whether string = encode(decode(string)) holds, and it appears that it does not.

Please advise me on what to do with the decode() function in order to achieve string = encode(decode(string)).


Decode() function:

/**
 * Decodes a string of UTF-8 byte values (one byte per character code) into
 * a regular JavaScript (UTF-16) string.
 *
 * The lead byte selects the sequence length: below 128 is a single byte,
 * 192-223 starts a 2-byte sequence, 240-247 starts a 4-byte sequence, and
 * every other value is handled as the start of a 3-byte sequence.
 * Continuation bytes are only masked to their low six bits and are never
 * validated, so input that is not well-formed UTF-8 (for example raw binary
 * data) is decoded lossily instead of being rejected.
 *
 * @param {string} utftext - String whose character codes are UTF-8 bytes.
 * @returns {string} The decoded string.
 */
function decode1(utftext) {
    var string = "";
    var i = 0;
    // Every working variable is declared locally; assigning to undeclared
    // names would leak globals and throws in strict mode / ES modules.
    var c;
    var c2;
    var c3;
    var c4;
    var codePoint;

    while (i < utftext.length) {

        c = utftext.charCodeAt(i);

        if (c < 128) {
            // Single-byte (ASCII) character.
            string += String.fromCharCode(c);
            i++;
        }
        else if ((c > 191) && (c < 224)) {
            // 2-byte sequence: 5 payload bits + 6 payload bits.
            // charCodeAt past the end yields NaN, which the bitwise
            // operators treat as 0, so a truncated tail does not throw.
            c2 = utftext.charCodeAt(i + 1);
            string += String.fromCharCode(((c & 31) << 6) | (c2 & 63));
            i += 2;
        }
        else if ((c > 239) && (c < 248)) {
            // 4-byte sequence: 3 + 6 + 6 + 6 payload bits.
            c2 = utftext.charCodeAt(i + 1);
            c3 = utftext.charCodeAt(i + 2);
            c4 = utftext.charCodeAt(i + 3);
            codePoint = ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
            if (codePoint > 0xFFFF) {
                // Outside the BMP: emit a UTF-16 surrogate pair.
                codePoint -= 0x10000;
                string += String.fromCharCode(0xD800 + (codePoint >> 10), 0xDC00 + (codePoint & 0x3FF));
            }
            else {
                string += String.fromCharCode(codePoint);
            }
            i += 4;
        }
        else {
            // 3-byte sequence: 4 + 6 + 6 payload bits. This is also the
            // fallback for any lead value not matched above.
            c2 = utftext.charCodeAt(i + 1);
            c3 = utftext.charCodeAt(i + 2);
            string += String.fromCharCode(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
            i += 3;
        }
    }
    return string;
}

Encode() function:

/**
 * Encodes a JavaScript string as UTF-8, returning a string in which every
 * character code is one output byte.
 *
 * Each "\r\n" pair in the input is collapsed to "\n" before encoding.
 * The input is processed one UTF-16 code unit at a time, so the two halves
 * of a surrogate pair are each written as their own 3-byte sequence.
 *
 * @param {string} string - Text to encode.
 * @returns {string} The encoded bytes, one per character.
 */
function encode1(string) {
    var normalized = string.replace(/\r\n/g, "\n");
    var pieces = [];
    var index = 0;

    while (index < normalized.length) {
        var unit = normalized.charCodeAt(index);
        index += 1;

        if (unit >= 2048) {
            // Three bytes: 4 + 6 + 6 payload bits.
            pieces.push(String.fromCharCode(
                (unit >> 12) | 224,
                ((unit >> 6) & 63) | 128,
                (unit & 63) | 128
            ));
        }
        else if (unit >= 128) {
            // Two bytes: 5 + 6 payload bits.
            pieces.push(String.fromCharCode(
                (unit >> 6) | 192,
                (unit & 63) | 128
            ));
        }
        else {
            // ASCII passes through unchanged.
            pieces.push(String.fromCharCode(unit));
        }
    }

    return pieces.join("");
}

Original PDF file (first 20 lines):

2550 4446 2d31 2e34 0a25 c7ec 8fa2 0a35
2030 206f 626a 0a3c 3c2f 4c65 6e67 7468
2036 2030 2052 2f46 696c 7465 7220 2f46
6c61 7465 4465 636f 6465 3e3e 0a73 7472
6561 6d0a 789c 6d4e 310e c230 10db ef15
1e61 39e2 a45c 722b 120f 00e5 0715 2021
6528 ff1f 482a a176 c05e acb3 65df 82a0
8c08 833f 3137 215e 421a 2c97 ac96 710e
2027 7c1e f214 daff 7bdc e527 8bb6 39a1
b72d 5234 0dac 137b 3d37 5caa 9cee 19ae
6ea8 3daf ee9e 8aad 36c1 a41e 5158 c683
b5c9 81c7 fa96 6b95 5be7 1787 6026 7465
6e64 7374 7265 616d 0a65 6e64 6f62 6a0a
3620 3020 6f62 6a0a 3132 330a 656e 646f
626a 0a34 2030 206f 626a 0a3c 3c2f 5479
7065 2f50 6167 652f 4d65 6469 6142 6f78
205b 3020 3020 3539 3520 3834 325d 0a2f
526f 7461 7465 2030 2f50 6172 656e 7420
3320 3020 520a 2f52 6573 6f75 7263 6573
3c3c 2f50 726f 6353 6574 5b2f 5044 4620

Resulting PDF (first 20 lines):

2550 4446 2d31 2e34 0a25 e7ac 8fe2 8ab5
2030 206f 626a 0a3c 3c2f 4c65 6e67 7468
2036 2030 2052 2f46 696c 7465 7220 2f46
6c61 7465 4465 636f 6465 3e3e 0a73 7472
6561 6d0a 78ec ad8e 310e e2b0 90eb af95
1e61 39e2 a49c 722b 120f 00e5 8795 2021
6528 ef9f 882a e1b6 805e ecb3 a5ef 82a0
ec88 833f 3137 215e 421a 2ce7 ac96 710e
2027 7c1e e294 9aef bb9c e5a7 8be6 b9a1
e7ad 9234 0dec 93bb 3d37 5cea 9cae 19ee
aea8 3def ae9e eaad b6e1 a49e 5158 e683
b5e9 8187 ea96 abe5 9ba7 17e7 a0a6 7465
6e64 7374 7265 616d 0a65 6e64 6f62 6a0a
3620 3020 6f62 6a0a 3132 330a 656e 646f
626a 0a34 2030 206f 626a 0a3c 3c2f 5479
7065 2f50 6167 652f 4d65 6469 6142 6f78
205b 3020 3020 3539 3520 3834 325d 0a2f
526f 7461 7465 2030 2f50 6172 656e 7420
3320 3020 520a 2f52 6573 6f75 7263 6573
3c3c 2f50 726f 6353 6574 5b2f 5044 4620