I am working on a project that involves encoding/decoding UTF-8 bytes from/to JavaScript strings. I’ve been searching the web and there are a couple of UTF-8 codec implementations. Unfortunately, none of them handle supplementary code points correctly, meaning all of them produce invalid UTF-8 bytes/JS strings when they encounter characters belonging to the supplementary code points.
In JavaScript, characters are encoded using UCS-2, which is a subset of UTF-16 and is encoded the same way as UTF-16 for characters in the Basic Multilingual Plane (BMP). UCS-2, however, doesn’t have surrogate pair support, which is how UTF-16 uses two 16-bit code units to represent one character in the supplementary code points. We can emulate a surrogate pair in JavaScript by outputting two characters, corresponding to the high and low surrogates. Most browsers will print the string correctly, with the side effect that String.length returns 2 instead of 1 for a surrogate pair.
I came up with the following code:
// Namespace UTF8: a codec between JavaScript (UCS-2/UTF-16) strings and
// UTF-8 byte arrays, with full support for supplementary code points
// (encoded in JS strings as surrogate pairs).
var UTF8 = (function() {
  // U+FFFD REPLACEMENT CHARACTER, emitted for unpaired surrogates so the
  // encoder never produces invalid UTF-8.
  var REPLACEMENT = 0xfffd;

  return {
    // Encodes a JS (UCS-2/UTF-16) string into UTF-8.
    // str: the string to encode.
    // Returns an array of numbers, each a byte in the range 0x00-0xFF.
    encode : function(str) {
      var len = str.length;
      var result = [];
      var code;
      var next;
      var i;
      for (i = 0; i < len; i++) {
        code = str.charCodeAt(i);
        if (code <= 0x7f) { // 1 byte: ASCII
          result.push(code);
        } else if (code <= 0x7ff) { // 2 bytes
          result.push(0xc0 | (code >>> 6 & 0x1f),
                      0x80 | (code & 0x3f));
        } else if (code <= 0xd7ff || code >= 0xe000) { // 3 bytes: BMP, non-surrogate
          // BUGFIX: the boundary was previously 0xd700, which wrongly sent
          // valid BMP characters U+D701..U+D7FF into the surrogate branch.
          // The surrogate range is exactly 0xD800..0xDFFF.
          result.push(0xe0 | (code >>> 12 & 0x0f),
                      0x80 | (code >>> 6 & 0x3f),
                      0x80 | (code & 0x3f));
        } else if (code <= 0xdbff &&
                   (next = str.charCodeAt(i + 1)) >= 0xdc00 && next <= 0xdfff) {
          // 4 bytes: a valid high surrogate followed by a low surrogate.
          // Combine the pair into a supplementary code point (>= 0x10000).
          i++; // consume the low surrogate
          code = (((code - 0xd800) << 10) | (next - 0xdc00)) + 0x10000;
          result.push(0xf0 | (code >>> 18 & 0x07),
                      0x80 | (code >>> 12 & 0x3f),
                      0x80 | (code >>> 6 & 0x3f),
                      0x80 | (code & 0x3f));
        } else {
          // Unpaired surrogate (lone high surrogate, high surrogate at end
          // of string, or stray low surrogate): emit U+FFFD instead of
          // producing invalid UTF-8. (charCodeAt past the end yields NaN,
          // which fails the range check above and lands here.)
          result.push(0xe0 | (REPLACEMENT >>> 12 & 0x0f),
                      0x80 | (REPLACEMENT >>> 6 & 0x3f),
                      0x80 | (REPLACEMENT & 0x3f));
        }
      }
      return result;
    },
    // Decodes UTF-8 bytes into a JS (UCS-2/UTF-16) string.
    // bytes: an array of numbers (bytes 0x00-0xFF).
    // Returns a string; supplementary code points become surrogate pairs
    // (so String.length counts them as 2).
    decode : function(bytes) {
      var len = bytes.length;
      var result = "";
      var code;
      var i;
      for (i = 0; i < len; i++) {
        if (bytes[i] <= 0x7f) { // 1 byte: ASCII
          result += String.fromCharCode(bytes[i]);
        } else if (bytes[i] >= 0xc0) { // multi-byte lead byte
          if (bytes[i] < 0xe0) { // 2 bytes
            code = ((bytes[i++] & 0x1f) << 6) |
                   (bytes[i] & 0x3f);
          } else if (bytes[i] < 0xf0) { // 3 bytes
            code = ((bytes[i++] & 0x0f) << 12) |
                   ((bytes[i++] & 0x3f) << 6) |
                   (bytes[i] & 0x3f);
          } else { // 4 bytes: becomes a surrogate pair in JS
            code = (((bytes[i++] & 0x07) << 18) |
                    ((bytes[i++] & 0x3f) << 12) |
                    ((bytes[i++] & 0x3f) << 6) |
                    (bytes[i] & 0x3f)) - 0x10000;
            // High surrogate: top 10 bits of the offset from 0x10000.
            result += String.fromCharCode((code >>> 10) + 0xd800);
            // Low surrogate: bottom 10 bits.
            code = (code & 0x3ff) + 0xdc00;
          }
          result += String.fromCharCode(code);
        }
        // Bytes in 0x80..0xbf outside a sequence are invalid UTF-8; skipped.
      }
      return result;
    }
  };
}());
Feel free to use it if you find it useful.
References:
Pingback: Javascript UTF-8 codec that supports supplementary code points … | Tutorial4bd.com
Nice! Now I can display Brazilian accented characters, like áéíóú, even if they are returned as UTF-8 from the server API.