String length in bytes in JavaScript

2019-01-01 14:57发布

In my JavaScript code I need to compose a message to server in this format:

<size in bytes>CRLF
<data>CRLF

Example:

3
foo

The data may contain unicode characters. I need to send them as UTF-8.

I'm looking for the most cross-browser way to calculate the length of the string in bytes in JavaScript.

I've tried this to compose my payload:

return unescape(encodeURIComponent(str)).length + "\n" + str + "\n"

But it does not give me accurate results for the older browsers (or, maybe the strings in those browsers in UTF-16?).

Any clues?

Update:

Example: length in bytes of the string ЭЭХ! Naïve? in UTF-8 is 15 bytes, but some browsers report 23 bytes instead.

11条回答
时光乱了年华
2楼-- · 2019-01-01 15:34

There is no way to do it in JavaScript natively.

If you know the character encoding, you can calculate it yourself though.

encodeURIComponent assumes UTF-8 as the character encoding, so if you need that encoding, you can do,

function lengthInUtf8Bytes(str) {
  // Matches only the 10.. bytes that are non-initial characters in a multi-byte sequence.
  var m = encodeURIComponent(str).match(/%[89ABab]/g);
  return str.length + (m ? m.length : 0);
}

This should work because of the way UTF-8 encodes multi-byte sequences. The first encoded byte always starts with either a high bit of zero for a single byte sequence, or a byte whose first hex digit is C, D, E, or F. The second and subsequent bytes are the ones whose first two bits are 10. Those are the extra bytes you want to count in UTF-8.

The table in wikipedia makes it clearer

Bits        Last code point Byte 1          Byte 2          Byte 3
  7         U+007F          0xxxxxxx
 11         U+07FF          110xxxxx        10xxxxxx
 16         U+FFFF          1110xxxx        10xxxxxx        10xxxxxx
...

If instead you need to understand the page encoding, you can use this trick:

function lengthInPageEncoding(s) {
  var a = document.createElement('A');
  a.href = '#' + s;
  var sEncoded = a.href;
  sEncoded = sEncoded.substring(sEncoded.indexOf('#') + 1);
  var m = sEncoded.match(/%[0-9a-f]{2}/g);
  return sEncoded.length - (m ? m.length * 2 : 0);
}
查看更多
临风纵饮
3楼-- · 2019-01-01 15:35

Here is an independent and efficient method to count UTF-8 bytes of a string.

//count UTF-8 bytes of a string
function byteLengthOf(s){
	//assuming the String is UCS-2(aka UTF-16) encoded
	var n=0;
	for(var i=0,l=s.length; i<l; i++){
		var hi=s.charCodeAt(i);
		if(hi<0x0080){ //[0x0000, 0x007F]
			n+=1;
		}else if(hi<0x0800){ //[0x0080, 0x07FF]
			n+=2;
		}else if(hi<0xD800){ //[0x0800, 0xD7FF]
			n+=3;
		}else if(hi<0xDC00){ //[0xD800, 0xDBFF]
			var lo=s.charCodeAt(++i);
			if(i<l&&lo>=0xDC00&&lo<=0xDFFF){ //followed by [0xDC00, 0xDFFF]
				n+=4;
			}else{
				throw new Error("UCS-2 String malformed");
			}
		}else if(hi<0xE000){ //[0xDC00, 0xDFFF]
			throw new Error("UCS-2 String malformed");
		}else{ //[0xE000, 0xFFFF]
			n+=3;
		}
	}
	return n;
}

var s="\u0000\u007F\u07FF\uD7FF\uDBFF\uDFFF\uFFFF";
console.log("expect byteLengthOf(s) to be 14, actually it is %s.",byteLengthOf(s));

Note that the method may throw error if an input string is UCS-2 malformed

查看更多
临风纵饮
4楼-- · 2019-01-01 15:35

This would work for BMP and SIP/SMP characters.

    String.prototype.lengthInUtf8 = function() {
        var asciiLength = this.match(/[\u0000-\u007f]/g) ? this.match(/[\u0000-\u007f]/g).length : 0;
        var multiByteLength = encodeURI(this.replace(/[\u0000-\u007f]/g)).match(/%/g) ? encodeURI(this.replace(/[\u0000-\u007f]/g, '')).match(/%/g).length : 0;
        return asciiLength + multiByteLength;
    }

    'test'.lengthInUtf8();
    // returns 4
    '\u{2f894}'.lengthInUtf8();
    // returns 4
    'سلام علیکم'.lengthInUtf8();
    // returns 19, each Arabic/Persian alphabet character takes 2 bytes. 
    '你好,JavaScript 世界'.lengthInUtf8();
    // returns 26, each Chinese character/punctuation takes 3 bytes. 
查看更多
高级女魔头
5楼-- · 2019-01-01 15:38

Actually, I figured out what's wrong. For the code to work the page <head> should have this tag:

<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

Or, as suggested in comments, if server sends HTTP Content-Encoding header, it should work as well.

Then results from different browsers are consistent.

Here is an example:

<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 
  <title>mini string length test</title>
</head>
<body>

<script type="text/javascript">
document.write('<div style="font-size:100px">' 
    + (unescape(encodeURIComponent("ЭЭХ! Naïve?")).length) + '</div>'
  );
</script>
</body>
</html>

Note: I suspect that specifying any (accurate) encoding would fix the encoding problem. It is just a coincidence that I need UTF-8.

查看更多
爱死公子算了
6楼-- · 2019-01-01 15:44

Here is a much faster version, which doesn't use regular expressions, nor encodeURIComponent:

function byteLength(str) {
  // returns the byte length of an utf8 string
  var s = str.length;
  for (var i=str.length-1; i>=0; i--) {
    var code = str.charCodeAt(i);
    if (code > 0x7f && code <= 0x7ff) s++;
    else if (code > 0x7ff && code <= 0xffff) s+=2;
    if (code >= 0xDC00 && code <= 0xDFFF) i--; //trail surrogate
  }
  return s;
}

Here is a performance comparison.

It just computes the length in UTF8 of each unicode codepoints returned by charCodeAt (based on wikipedia's descriptions of UTF8, and UTF16 surrogate characters).

It follows RFC3629 (where UTF-8 characters are at most 4-bytes long).

查看更多
一个人的天荒地老
7楼-- · 2019-01-01 15:46

This function will return the byte size of any UTF-8 string you pass to it.

function byteCount(s) {
    return encodeURI(s).split(/%..|./).length - 1;
}

Source

查看更多
登录 后发表回答