04 | March | 2011 | Programming collection

I am working on a project that involves encoding and decoding UTF-8 bytes from and to JavaScript strings. I’ve been searching the web and there are a couple of UTF-8 coder implementations. Unfortunately, none of them handles supplementary code points correctly, meaning all of them produce invalid UTF-8 bytes or JS strings when they encounter characters that belong to the supplementary code points.

In JavaScript, characters are encoded using UCS-2, which is a subset of UTF-16 and is encoded the same way as UTF-16 for characters in the Basic Multilingual Plane (BMP). UCS-2, however, has no support for surrogate pairs, which is how UTF-16 uses two 16-bit units to represent one character in the supplementary code points. We can emulate a surrogate pair in JavaScript by outputting two characters that correspond to the high and low surrogates. Most browsers are able to print the string correctly, with the side effect that String.length returns 2 instead of 1 for a surrogate pair.

I came up with the following code:

// Namespace UTF8
//
// Converts between JavaScript strings (sequences of 16-bit code units) and
// UTF-8 byte arrays, including code points above U+FFFF, which JavaScript
// represents as a high/low surrogate pair.
var UTF8 = (function() {
    return {
        // Encodes a JavaScript string into UTF-8.
        //
        // str: the string to encode.
        // Returns an array of numbers (bytes, each 0..255).
        //
        // A high surrogate (0xd800..0xdbff) immediately followed by a low
        // surrogate (0xdc00..0xdfff) is combined into one supplementary code
        // point and emitted as 4 bytes. Any surrogate that is not part of
        // such a pair cannot be represented in UTF-8 and is emitted as
        // U+FFFD (REPLACEMENT CHARACTER) instead.
        encode : function(str) {
            var len = str.length;
            var result = [];
            var code;
            var low;
            var i;
            for (i = 0; i < len; i++) {
                code = str.charCodeAt(i);

                // Try to combine a high surrogate with the following low surrogate.
                if (code >= 0xd800 && code <= 0xdbff && i + 1 < len) {
                    low = str.charCodeAt(i + 1);
                    if (low >= 0xdc00 && low <= 0xdfff) {
                        code = (((code - 0xd800) << 10) | (low - 0xdc00)) + 0x10000;
                        i++;                                        // consumed the low surrogate
                    }
                }

                // Still in the surrogate range: unpaired, so substitute U+FFFD.
                if (code >= 0xd800 && code <= 0xdfff) {
                    code = 0xfffd;
                }

                if (code <= 0x7f) {                                 // 1 byte
                    result.push(code);
                } else if (code <= 0x7ff) {                         // 2 bytes
                    result.push(0xc0 | (code >>> 6 & 0x1f),
                                0x80 | (code & 0x3f));
                } else if (code <= 0xffff) {                        // 3 bytes
                    result.push(0xe0 | (code >>> 12 & 0x0f),
                                0x80 | (code >>> 6 & 0x3f),
                                0x80 | (code & 0x3f));
                } else {                                            // 4 bytes
                    result.push(0xf0 | (code >>> 18 & 0x07),
                                0x80 | (code >>> 12 & 0x3f),
                                0x80 | (code >>> 6 & 0x3f),
                                0x80 | (code & 0x3f));
                }
            }
            return result;
        },

        // Decodes UTF-8 into a JavaScript string.
        //
        // bytes: an array (or array-like with numeric indices and length) of
        //        byte values.
        // Returns a string; code points above U+FFFF come back as a
        // surrogate pair (two characters).
        //
        // The length of a sequence is taken from its lead byte only;
        // continuation bytes are masked with 0x3f but not validated. A byte
        // in 0x80..0xbf found where a lead byte is expected is skipped.
        decode : function(bytes) {
            var len = bytes.length;
            var result = "";
            var code;
            var i;
            for (i = 0; i < len; i++) {
                if (bytes[i] <= 0x7f) {
                    result += String.fromCharCode(bytes[i]);
                } else if (bytes[i] >= 0xc0) {                                   // Multibyte
                    if (bytes[i] < 0xe0) {                                       // 2 bytes
                        code = ((bytes[i++] & 0x1f) << 6) |
                                (bytes[i] & 0x3f);
                    } else if (bytes[i] < 0xf0) {                                // 3 bytes
                        code = ((bytes[i++] & 0x0f) << 12) |
                               ((bytes[i++] & 0x3f) << 6)  |
                                (bytes[i] & 0x3f);
                    } else {                                                     // 4 bytes
                        // turned into two characters in JS as surrogate pair
                        code = (((bytes[i++] & 0x07) << 18) |
                                ((bytes[i++] & 0x3f) << 12) |
                                ((bytes[i++] & 0x3f) << 6) |
                                 (bytes[i] & 0x3f)) - 0x10000;
                        // High surrogate: top 10 bits of the 20-bit offset
                        result += String.fromCharCode(((code & 0xffc00) >>> 10) + 0xd800);
                        // Low surrogate: bottom 10 bits, appended below
                        code = (code & 0x3ff) + 0xdc00;
                    }
                    result += String.fromCharCode(code);
                } // Otherwise it's an invalid UTF-8, skipped.
            }
            return result;
        }
    };
}());

Feel free to use it if you find it useful.

References:

S	M	T	W	T	F	S
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

Programming collection

Programming collection

Day: March 4, 2011

Javascript UTF-8 codec that supports supplementary code points

Rate this:

Share this: