From 1d5d44d63d2c07ac51f9886eb426c8ee8f9a22ab Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Thu, 12 Jan 2017 11:45:47 +0000 Subject: [PATCH] TextEncoder polyfill Apparently Safari doesn't sport a TextEncoder, so here's a polyfill for it. --- src/utils/TextDecoderPolyfill.js | 131 +++++++++++++++++++++++++ src/utils/TextEncoderPolyfill.js | 78 +++++++++++++++ test/utils/TextDecoderPolyfill-test.js | 85 ++++++++++++++++ test/utils/TextEncoderPolyfill-test.js | 39 ++++++++ 4 files changed, 333 insertions(+) create mode 100644 src/utils/TextDecoderPolyfill.js create mode 100644 src/utils/TextEncoderPolyfill.js create mode 100644 test/utils/TextDecoderPolyfill-test.js create mode 100644 test/utils/TextEncoderPolyfill-test.js diff --git a/src/utils/TextDecoderPolyfill.js b/src/utils/TextDecoderPolyfill.js new file mode 100644 index 0000000000..e203676bb7 --- /dev/null +++ b/src/utils/TextDecoderPolyfill.js @@ -0,0 +1,131 @@ +/* +Copyright 2017 Vector Creations Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +"use strict"; + +// Polyfill for TextDecoder. 
// Polyfill for TextDecoder (UTF-8 only).

// U+FFFD REPLACEMENT CHARACTER, emitted in place of undecodable input.
const REPLACEMENT_CHAR = '\uFFFD';

// Largest code point defined by Unicode.
const MAX_CODE_POINT = 0x10FFFF;

export default class TextDecoder {
    /**
     * Decode a UTF-8 byte array as a javascript string.
     *
     * Malformed input never throws. Every byte consumed as part of an
     * invalid, over-long or truncated sequence is replaced by one U+FFFD.
     * Note that a byte which turns out not to be a valid continuation byte
     * is consumed together with the sequence it interrupted.
     *
     * A byte-order mark at the very start of the input is dropped; anywhere
     * else it is decoded as U+FEFF.
     *
     * @param {Uint8Array} u8Array UTF-8-encoded input
     * @return {string} the decoded text
     */
    decode(u8Array) {
        let str = '';
        let idx = 0;
        while (idx < u8Array.length) {
            const u0 = u8Array[idx++];
            if (!(u0 & 0x80)) {
                // single byte (ASCII)
                str += String.fromCharCode(u0);
                continue;
            }

            if ((u0 & 0xC0) !== 0xC0) {
                // continuation byte where we expect a leading byte
                str += REPLACEMENT_CHAR;
                continue;
            }

            if (u0 > 0xF4) {
                // this would imply a 5-byte or longer encoding (or a
                // code point past U+10FFFF), which is invalid.
                str += REPLACEMENT_CHAR;
                continue;
            }

            let u1 = u8Array[idx++];
            if (u1 === undefined) {
                // input ended in the middle of a sequence
                str += REPLACEMENT_CHAR;
                continue;
            }

            if ((u1 & 0xC0) !== 0x80) {
                // leading byte where we expect a continuation byte
                str += REPLACEMENT_CHAR.repeat(2);
                continue;
            }
            u1 &= 0x3F;
            if (!(u0 & 0x20)) {
                // two-byte sequence: 110xxxxx 10xxxxxx
                const u = ((u0 & 0x1F) << 6) | u1;
                if (u < 0x80) {
                    // over-long
                    str += REPLACEMENT_CHAR.repeat(2);
                } else {
                    str += String.fromCharCode(u);
                }
                continue;
            }

            let u2 = u8Array[idx++];
            if (u2 === undefined) {
                str += REPLACEMENT_CHAR.repeat(2);
                continue;
            }
            if ((u2 & 0xC0) !== 0x80) {
                // leading byte where we expect a continuation byte
                str += REPLACEMENT_CHAR.repeat(3);
                continue;
            }
            u2 &= 0x3F;
            if (!(u0 & 0x10)) {
                // three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
                const u = ((u0 & 0x0F) << 12) | (u1 << 6) | u2;
                if (u < 0x800) {
                    // over-long
                    str += REPLACEMENT_CHAR.repeat(3);
                } else if (u >= 0xD800 && u <= 0xDFFF) {
                    // UTF-16 surrogates are not valid in UTF-8; passing
                    // them through would yield an ill-formed string.
                    str += REPLACEMENT_CHAR.repeat(3);
                } else if (u === 0xFEFF && idx === 3) {
                    // byte-order mark at the start: do not add to output
                } else {
                    str += String.fromCharCode(u);
                }
                continue;
            }

            let u3 = u8Array[idx++];
            if (u3 === undefined) {
                str += REPLACEMENT_CHAR.repeat(3);
                continue;
            }
            if ((u3 & 0xC0) !== 0x80) {
                // leading byte where we expect a continuation byte
                str += REPLACEMENT_CHAR.repeat(4);
                continue;
            }
            u3 &= 0x3F;

            // four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            const u = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | u3;
            if (u < 0x10000) {
                // over-long
                str += REPLACEMENT_CHAR.repeat(4);
                continue;
            }
            if (u > MAX_CODE_POINT) {
                // unicode stops here.
                str += REPLACEMENT_CHAR.repeat(4);
                continue;
            }

            // encode as a utf-16 surrogate pair
            const v = u - 0x10000;
            str += String.fromCharCode(0xD800 | (v >> 10), 0xDC00 | (v & 0x3FF));
        }
        return str;
    }
}
// Polyfill for TextEncoder. Based on emscripten's stringToUTF8Array.

// Code point substituted for unpaired surrogates (U+FFFD).
const REPLACEMENT_CODE_POINT = 0xFFFD;

/**
 * Read the Unicode scalar value that starts at UTF-16 code unit `i`.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary code point (> 0xFFFF); the caller must then skip the
 * following code unit. Any unpaired surrogate yields U+FFFD, so the result
 * is always encodable as well-formed UTF-8.
 *
 * @param {String} str source string
 * @param {number} i index of the code unit to read
 * @return {number} scalar value at that position
 */
function scalarValueAt(str, i) {
    const unit = str.charCodeAt(i);
    if (unit < 0xD800 || unit > 0xDFFF) {
        return unit;
    }
    if (unit <= 0xDBFF && i + 1 < str.length) {
        const next = str.charCodeAt(i + 1);
        if (next >= 0xDC00 && next <= 0xDFFF) {
            return 0x10000 + ((unit & 0x3FF) << 10) + (next & 0x3FF);
        }
    }
    // lone low surrogate, or high surrogate without a partner
    return REPLACEMENT_CODE_POINT;
}

/**
 * Count the bytes needed to encode a string as UTF-8.
 *
 * @param {String} str String to measure
 * @return {number} length in bytes of the UTF-8 encoding
 */
function utf8len(str) {
    let len = 0;
    for (let i = 0; i < str.length; ++i) {
        const u = scalarValueAt(str, i);
        if (u <= 0x7F) {
            ++len;
        } else if (u <= 0x7FF) {
            len += 2;
        } else if (u <= 0xFFFF) {
            len += 3;
        } else {
            len += 4;
            // a surrogate pair occupied two code units
            ++i;
        }
    }
    return len;
}

export default class TextEncoder {
    /**
     * Encode a javascript string as utf-8.
     *
     * Unpaired surrogates are encoded as U+FFFD (EF BF BD), so the output
     * is always well-formed UTF-8.
     *
     * @param {String} str String to encode
     * @return {Uint8Array} UTF-8-encoded output
     */
    encode(str) {
        const outU8Array = new Uint8Array(utf8len(str));
        let outIdx = 0;
        for (let i = 0; i < str.length; ++i) {
            const u = scalarValueAt(str, i);
            if (u <= 0x7F) {
                outU8Array[outIdx++] = u;
            } else if (u <= 0x7FF) {
                outU8Array[outIdx++] = 0xC0 | (u >> 6);
                outU8Array[outIdx++] = 0x80 | (u & 63);
            } else if (u <= 0xFFFF) {
                outU8Array[outIdx++] = 0xE0 | (u >> 12);
                outU8Array[outIdx++] = 0x80 | ((u >> 6) & 63);
                outU8Array[outIdx++] = 0x80 | (u & 63);
            } else {
                outU8Array[outIdx++] = 0xF0 | (u >> 18);
                outU8Array[outIdx++] = 0x80 | ((u >> 12) & 63);
                outU8Array[outIdx++] = 0x80 | ((u >> 6) & 63);
                outU8Array[outIdx++] = 0x80 | (u & 63);
                // a surrogate pair occupied two code units
                ++i;
            }
        }
        return outU8Array;
    }
}
"use strict";

import TextDecoderPolyfill from 'utils/TextDecoderPolyfill';

import * as testUtils from '../test-utils';
import expect from 'expect';

// Unit tests for the UTF-8 TextDecoder polyfill.
describe('textDecoderPolyfill', function() {
    beforeEach(function() {
        testUtils.beforeEach(this);
    });

    it('should correctly decode a range of strings', function() {
        const decoder = new TextDecoderPolyfill();

        // one-, two-, three- and four-byte sequences
        expect(decoder.decode(Uint8Array.of(65, 66, 67))).toEqual('ABC');
        expect(decoder.decode(Uint8Array.of(0xC3, 0xA6))).toEqual('æ');
        expect(decoder.decode(Uint8Array.of(0xE2, 0x82, 0xAC))).toEqual('€');
        expect(decoder.decode(Uint8Array.of(0xF0, 0x9F, 0x92, 0xA9))).toEqual('\uD83D\uDCA9');
    });

    it('should ignore byte-order marks', function() {
        const decoder = new TextDecoderPolyfill();
        expect(decoder.decode(Uint8Array.of(0xEF, 0xBB, 0xBF, 65)))
            .toEqual('A');
    });

    it('should not ignore byte-order marks in the middle of the array', function() {
        const decoder = new TextDecoderPolyfill();
        expect(decoder.decode(Uint8Array.of(65, 0xEF, 0xBB, 0xBF, 66)))
            .toEqual('A\uFEFFB');
    });

    it('should reject overlong encodings', function() {
        const decoder = new TextDecoderPolyfill();

        // euro, as 4 bytes
        expect(decoder.decode(Uint8Array.of(65, 0xF0, 0x82, 0x82, 0xAC, 67)))
            .toEqual('A\uFFFD\uFFFD\uFFFD\uFFFDC');
    });

    it('should reject 5 and 6-byte encodings', function() {
        const decoder = new TextDecoderPolyfill();

        expect(decoder.decode(Uint8Array.of(65, 0xF8, 0x82, 0x82, 0x82, 0x82, 67)))
            .toEqual('A\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDC');
    });

    it('should reject code points beyond 0x10FFFF', function() {
        const decoder = new TextDecoderPolyfill();

        // F4 A0 80 80 would be U+120000, past the end of Unicode
        expect(decoder.decode(Uint8Array.of(0xF4, 0xA0, 0x80, 0x80)))
            .toEqual('\uFFFD\uFFFD\uFFFD\uFFFD');
    });

    it('should cope with end-of-string', function() {
        const decoder = new TextDecoderPolyfill();

        // truncated two-, three- and four-byte sequences
        expect(decoder.decode(Uint8Array.of(65, 0xC3)))
            .toEqual('A\uFFFD');

        expect(decoder.decode(Uint8Array.of(65, 0xE2, 0x82)))
            .toEqual('A\uFFFD\uFFFD');

        expect(decoder.decode(Uint8Array.of(65, 0xF0, 0x9F, 0x92)))
            .toEqual('A\uFFFD\uFFFD\uFFFD');
    });
});
"use strict";

import TextEncoderPolyfill from 'utils/TextEncoderPolyfill';

import * as testUtils from '../test-utils';
import expect from 'expect';

// Unit tests for the UTF-8 TextEncoder polyfill.
describe('textEncoderPolyfill', function() {
    beforeEach(function() {
        testUtils.beforeEach(this);
    });

    it('should correctly encode a range of strings', function() {
        const encoder = new TextEncoderPolyfill();

        // [input string, expected UTF-8 bytes]
        const samples = [
            ['ABC', [65, 66, 67]],
            ['æ', [0xC3, 0xA6]],
            ['€', [0xE2, 0x82, 0xAC]],
            // PILE OF POO (💩)
            ['\uD83D\uDCA9', [0xF0, 0x9F, 0x92, 0xA9]],
        ];

        for (const [text, bytes] of samples) {
            expect(encoder.encode(text)).toEqual(Uint8Array.from(bytes));
        }
    });
});