I am working on a project that involves encoding and decoding UTF-8 bytes from/to JavaScript strings. I’ve been searching the web and found a couple of UTF-8 coder implementations. Unfortunately, none of them handles supplementary code points correctly: all of them produce invalid UTF-8 bytes/JS strings when they encounter characters that belong to the supplementary code points.
In JavaScript, characters are encoded using UCS-2, which is a subset of UTF-16 and is encoded the same way as UTF-16 for characters in the Basic Multilingual Plane (BMP). UCS-2, however, does not support surrogate pairs, which is how UTF-16 uses two 16-bit code units to represent one character in the supplementary code points. We can emulate a surrogate pair in JavaScript by outputting two characters that correspond to the high and low surrogates. Most browsers are able to print the string correctly, with the side effect that String.length
returns 2 instead of 1 for a surrogate pair.
I came up with the following code:
// Namespace UTF8
// Converts between JavaScript strings (sequences of UTF-16 code units) and
// arrays of UTF-8 byte values, including supplementary code points that are
// represented in strings as surrogate pairs.
var UTF8 = (function() {
  return {
    // Encodes a JavaScript string into UTF-8.
    //
    // str: the string to encode.
    // Returns an array of numbers (bytes, each 0x00-0xff).
    //
    // A high surrogate (0xd800-0xdbff) immediately followed by a low
    // surrogate (0xdc00-0xdfff) is combined into one supplementary code point
    // and written as 4 bytes. A surrogate that is not part of such a pair
    // cannot be represented in UTF-8 and is written as U+FFFD (ef bf bd).
    encode : function(str) {
      var len = str.length;
      var result = [];
      var code;
      var low;
      var i;

      for (i = 0; i < len; i++) {
        code = str.charCodeAt(i);

        // Combine a well-formed surrogate pair into a single code point.
        if (code >= 0xd800 && code <= 0xdbff && i + 1 < len) {
          low = str.charCodeAt(i + 1);
          if (low >= 0xdc00 && low <= 0xdfff) {
            code = (((code - 0xd800) << 10) | (low - 0xdc00)) + 0x10000;
            i++; // the low surrogate has been consumed
          }
        }

        // Anything still in the surrogate range here is unpaired.
        if (code >= 0xd800 && code <= 0xdfff) {
          code = 0xfffd;
        }

        if (code <= 0x7f) {
          // 1 byte
          result.push(code);
        } else if (code <= 0x7ff) {
          // 2 bytes
          result.push(0xc0 | (code >>> 6 & 0x1f),
                      0x80 | (code & 0x3f));
        } else if (code <= 0xffff) {
          // 3 bytes: the rest of the BMP, which includes 0xd701-0xd7ff
          // (the surrogate range only starts at 0xd800).
          result.push(0xe0 | (code >>> 12 & 0x0f),
                      0x80 | (code >>> 6 & 0x3f),
                      0x80 | (code & 0x3f));
        } else {
          // 4 bytes, supplementary code point
          result.push(0xf0 | (code >>> 18 & 0x07),
                      0x80 | (code >>> 12 & 0x3f),
                      0x80 | (code >>> 6 & 0x3f),
                      0x80 | (code & 0x3f));
        }
      }

      return result;
    },

    // Decodes UTF-8 into a JavaScript string.
    //
    // bytes: an array (or array-like) of byte values.
    // Returns a string; 4-byte sequences become a surrogate pair, i.e. two
    // UTF-16 code units.
    //
    // The length of a sequence is taken from its lead byte only; continuation
    // bytes are masked with 0x3f but not validated, and a missing trailing
    // byte contributes zero bits. Bytes 0x80-0xbf that appear where a lead
    // byte is expected are skipped.
    decode : function(bytes) {
      var len = bytes.length;
      var result = "";
      var code;
      var lead;
      var i;

      for (i = 0; i < len; i++) {
        lead = bytes[i];

        if (lead <= 0x7f) {
          result += String.fromCharCode(lead);
        } else if (lead >= 0xc0) {
          // Multibyte sequence
          if (lead < 0xe0) {
            // 2 bytes
            code = ((lead & 0x1f) << 6) |
                   (bytes[i + 1] & 0x3f);
            i += 1;
          } else if (lead < 0xf0) {
            // 3 bytes
            code = ((lead & 0x0f) << 12) |
                   ((bytes[i + 1] & 0x3f) << 6) |
                   (bytes[i + 2] & 0x3f);
            i += 2;
          } else {
            // 4 bytes, turned into two characters in JS as a surrogate pair
            code = (((lead & 0x07) << 18) |
                    ((bytes[i + 1] & 0x3f) << 12) |
                    ((bytes[i + 2] & 0x3f) << 6) |
                    (bytes[i + 3] & 0x3f)) - 0x10000;
            i += 3;

            // High surrogate
            result += String.fromCharCode(((code & 0xffc00) >>> 10) + 0xd800);

            // Low surrogate, appended below
            code = (code & 0x3ff) + 0xdc00;
          }

          result += String.fromCharCode(code);
        }
        // Otherwise it's an unexpected continuation byte; skipped.
      }

      return result;
    }
  };
}());
Feel free to use it if you find it useful.
References: