Javascript UTF-8 codec that supports supplementary code points

I am working on a project that involves encode/decode UTF-8 bytes from/to Javascript String. I’ve been searching through the web and there are couple of UTF-8 coder implementations. Unfortunately none of them handle supplementary code points correctly, meaning all of them would produce invalid UTF-8 bytes/JS strings when it encounter characters belong to the supplementary code points.

In Javascript, characters are encoded using UCS-2, which is a subset of UTF-16, and is encoded the same way as UTF-16 for characters in Basic Multilingual Plane (BMP). In UCS-2, however, doesn’t have surrogate pair supports, which is how UTF-16 using two 16bits to represent one character in the supplementary code points. We can emulate the surrogate pair in Javascript by outputting two characters, correspond to the high and low surrogates. Most browsers would be able to print out the string correctly, having a side effect that if you do a String.length, it will return 2 instead of 1 for a surrogate pair.

I come up with the following code:

// Namespace UTF8
var UTF8 = (function() {
    return {
        // Encodes UCS2 into UTF8
        // Returns an array of numbers (bytes)
        encode : function(str) {
            var len = str.length;
            var result = [];
            var code;
            var i;
            for (i = 0; i < len; i++) {
                code = str.charCodeAt(i);
                if (code <= 0x7f) {
                    result.push(code);
                } else if (code <= 0x7ff) {                         // 2 bytes                     
                    result.push(0xc0 | (code >>> 6 & 0x1f),
                                0x80 | (code & 0x3f));
                } else if (code <= 0xd700 || code >= 0xe000) {      // 3 bytes
                    result.push(0xe0 | (code >>> 12 & 0x0f),
                                0x80 | (code >>> 6 & 0x3f),
                                0x80 | (code & 0x3f));
                } else {                                            // 4 bytes, surrogate pair
                    code = (((code - 0xd800) << 10) | (str.charCodeAt(++i) - 0xdc00)) + 0x10000;
                    result.push(0xf0 | (code >>> 18 & 0x07),
                                0x80 | (code >>> 12 & 0x3f),
                                0x80 | (code >>> 6 & 0x3f),
                                0x80 | (code & 0x3f));
                }
            }
            return result;
        },

        // Decodes UTF8 into UCS2
        // Returns a string
        decode : function(bytes) {
            var len = bytes.length;
            var result = "";
            var code;
            var i;
            for (i = 0; i < len; i++) {
                if (bytes[i] <= 0x7f) {                     
                    result += String.fromCharCode(bytes[i]);
                } else if (bytes[i] >= 0xc0) {                                   // Mutlibytes
                    if (bytes[i] < 0xe0) {                                       // 2 bytes
                        code = ((bytes[i++] & 0x1f) << 6) |
                                (bytes[i] & 0x3f);
                    } else if (bytes[i] < 0xf0) {                                // 3 bytes
                        code = ((bytes[i++] & 0x0f) << 12) |
                               ((bytes[i++] & 0x3f) << 6)  |
                                (bytes[i] & 0x3f);
                    } else {                                                     // 4 bytes
                        // turned into two characters in JS as surrogate pair
                        code = (((bytes[i++] & 0x07) << 18) |
                                ((bytes[i++] & 0x3f) << 12) |
                                ((bytes[i++] & 0x3f) << 6) |                                  
                                 (bytes[i] & 0x3f)) - 0x10000;
                        // High surrogate
                        result += String.fromCharCode(((code & 0xffc00) >>> 10) + 0xd800);
                        // Low surrogate
                        code = (code & 0x3ff) + 0xdc00;
                    }
                    result += String.fromCharCode(code);
                } // Otherwise it's an invalid UTF-8, skipped.
            }
            return result;
        }
    };
}());

Feel free to use it if you find it useful.

References:

Advertisements

2 thoughts on “Javascript UTF-8 codec that supports supplementary code points

  1. Pingback: Javascript UTF-8 codec that supports supplementary code points … | Tutorial4bd.com

  2. Nice! Now i can display brazilian accent characters, like áéíóú, even if they are returned as utf8 from the server api

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s