diff --git a/binary/decoder.js b/binary/decoder.js index 1186add..c44053f 100644 --- a/binary/decoder.js +++ b/binary/decoder.js @@ -47,7 +47,7 @@ goog.provide('jspb.BinaryDecoder'); goog.require('jspb.asserts'); -goog.require('goog.crypt'); +goog.require('jspb.binary.utf8'); goog.require('jspb.utils'); @@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) { */ jspb.BinaryDecoder.prototype.advance = function(count) { this.cursor_ += count; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); }; @@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) { return convert(lowBits, highBits); }; +/** + * Fails via jspb.asserts.fail when the cursor has moved past this.end_. + * + * @private + * @return {void} + */ +jspb.BinaryDecoder.prototype.checkCursor = function () { + if (this.cursor_ > this.end_) { + jspb.asserts.fail('Read past the end ' + this.cursor_ + ' > ' + this.end_); + } +}; /** * Skips over a varint in the block without decoding it. @@ -452,7 +463,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { var x = (temp & 0x7F); if (temp < 128) { this.cursor_ += 1; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -460,7 +471,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { x |= (temp & 0x7F) << 7; if (temp < 128) { this.cursor_ += 2; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -468,7 +479,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { x |= (temp & 0x7F) << 14; if (temp < 128) { this.cursor_ += 3; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -476,7 +487,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { x |= (temp & 0x7F) << 21; if (temp < 128) { this.cursor_ += 4; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { 
// We're reading the high bits of an unsigned varint. The byte we just read // also contains bits 33 through 35, which we're going to discard. this.cursor_ += 5; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x >>> 0; } @@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { jspb.asserts.assert(false); } - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; }; @@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() { jspb.BinaryDecoder.prototype.readUint8 = function() { var a = this.bytes_[this.cursor_ + 0]; this.cursor_ += 1; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return a; }; @@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() { var a = this.bytes_[this.cursor_ + 0]; var b = this.bytes_[this.cursor_ + 1]; this.cursor_ += 2; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (a << 0) | (b << 8); }; @@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() { var c = this.bytes_[this.cursor_ + 2]; var d = this.bytes_[this.cursor_ + 3]; this.cursor_ += 4; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0; }; @@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() { jspb.BinaryDecoder.prototype.readInt8 = function() { var a = this.bytes_[this.cursor_ + 0]; this.cursor_ += 1; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (a << 24) >> 24; }; @@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() { var a = this.bytes_[this.cursor_ + 0]; var b = this.bytes_[this.cursor_ + 1]; this.cursor_ += 2; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (((a << 0) | (b << 8)) << 16) >> 16; }; @@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() { var c = 
this.bytes_[this.cursor_ + 2]; var d = this.bytes_[this.cursor_ + 3]; this.cursor_ += 4; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (a << 0) | (b << 8) | (c << 16) | (d << 24); }; @@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() { * @export */ jspb.BinaryDecoder.prototype.readBool = function() { - return !!this.bytes_[this.cursor_++]; + const b = !!this.bytes_[this.cursor_++]; + this.checkCursor(); + return b; }; @@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() { * Supports codepoints from U+0000 up to U+10FFFF. * (http://en.wikipedia.org/wiki/UTF-8). * @param {number} length The length of the string to read. + * @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found. * @return {string} The decoded string. * @export */ -jspb.BinaryDecoder.prototype.readString = function(length) { - var bytes = this.bytes_; - var cursor = this.cursor_; - var end = cursor + length; - var codeUnits = []; - - var result = ''; - while (cursor < end) { - var c = bytes[cursor++]; - if (c < 128) { // Regular 7-bit ASCII. - codeUnits.push(c); - } else if (c < 192) { - // UTF-8 continuation mark. We are out of sync. This - // might happen if we attempted to read a character - // with more than four bytes. - continue; - } else if (c < 224) { // UTF-8 with two bytes. - var c2 = bytes[cursor++]; - codeUnits.push(((c & 31) << 6) | (c2 & 63)); - } else if (c < 240) { // UTF-8 with three bytes. - var c2 = bytes[cursor++]; - var c3 = bytes[cursor++]; - codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); - } else if (c < 248) { // UTF-8 with 4 bytes. - var c2 = bytes[cursor++]; - var c3 = bytes[cursor++]; - var c4 = bytes[cursor++]; - // Characters written on 4 bytes have 21 bits for a codepoint. - // We can't fit that on 16bit characters, so we use surrogates. - var codepoint = - ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63); - // Surrogates formula from wikipedia. 
- // 1. Subtract 0x10000 from codepoint - codepoint -= 0x10000; - // 2. Split this into the high 10-bit value and the low 10-bit value - // 3. Add 0xD800 to the high value to form the high surrogate - // 4. Add 0xDC00 to the low value to form the low surrogate: - var low = (codepoint & 1023) + 0xDC00; - var high = ((codepoint >> 10) & 1023) + 0xD800; - codeUnits.push(high, low); - } - // Avoid exceeding the maximum stack size when calling `apply`. - if (codeUnits.length >= 8192) { - result += String.fromCharCode.apply(null, codeUnits); - codeUnits.length = 0; - } - } - result += goog.crypt.byteArrayToString(codeUnits); - this.cursor_ = cursor; +jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) { + const cursor = this.cursor_; + this.cursor_ += length; + this.checkCursor(); + const result = + jspb.binary.utf8.decodeUtf8(jspb.asserts.assert(this.bytes_), cursor, length, requireUtf8); return result; }; @@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) { var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length); this.cursor_ += length; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return result; }; diff --git a/binary/decoder_test.js b/binary/decoder_test.js index 77f6877..7a71c0d 100644 --- a/binary/decoder_test.js +++ b/binary/decoder_test.js @@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => { const decoder = jspb.BinaryDecoder.alloc(encoder.end()); - expect(decoder.readString(len)).toEqual(long_string); + expect(decoder.readString(len, true)).toEqual(long_string); }); /** @@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => { const decoder = jspb.BinaryDecoder.alloc(encoder.end()); - expect(decoder.readString(ascii.length)).toEqual(ascii); - expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length)); + expect(decoder.readString(ascii.length, /* enforceUtf8= */ true)).toEqual(ascii); + expect(utf8_two_bytes).toEqual(decoder.readString(2, /* enforceUtf8= 
*/ true)); expect(utf8_three_bytes) - .toEqual(decoder.readString(utf8_three_bytes.length)); - expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length)); + .toEqual(decoder.readString(3, /* enforceUtf8= */ true)); + expect(utf8_four_bytes).toEqual(decoder.readString(4, /* enforceUtf8= */ true)); }); /** diff --git a/binary/reader.js b/binary/reader.js index 7be3b58..0f8c961 100644 --- a/binary/reader.js +++ b/binary/reader.js @@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants'); goog.require('jspb.BinaryDecoder'); goog.require('jspb.utils'); +/** + * Whether to enforce that string fields are valid utf8. + * + *

Currently set to `ALWAYS`, can be set to `DEPRECATED_PROTO3_ONLY` to only + * enforce utf8 for proto3 string fields, for proto2 string fields it will use + * replacement characters when encoding errors are found. + * + *

TODO: Remove the flag, simplify BinaryReader to remove + * readStringRequireUtf8 and related support in the code generator et. al. + * + * @define {string} + */ +const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS'); + +// Constrain the set of values to only these two. +jspb.asserts.assert( + ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY' || ENFORCE_UTF8 === 'ALWAYS'); + +const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS'; + /** @@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() { * @export */ jspb.BinaryReader.prototype.readString = function() { + // delegate to the other reader so that inlining can eliminate this method + // in the common case. + if (UTF8_PARSING_ERRORS_ARE_FATAL) { + return this.readStringRequireUtf8(); + } + jspb.asserts.assert( this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED); var length = this.decoder_.readUnsignedVarint32(); - return this.decoder_.readString(length); + return this.decoder_.readString(length, /*requireUtf8=*/ false); +}; + +/** + * Reads a string field from the binary stream, or throws an error if the next + * field in the stream is not of the correct wire type, or if the string is + * not valid utf8. + * + * @return {string} The value of the string field. + */ +jspb.BinaryReader.prototype.readStringRequireUtf8 = function () { + jspb.asserts.assert(this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED); + const length = this.decoder_.readUnsignedVarint32(); + return this.decoder_.readString(length, /*requireUtf8=*/ true); }; diff --git a/binary/utf8.js b/binary/utf8.js new file mode 100644 index 0000000..b77602b --- /dev/null +++ b/binary/utf8.js @@ -0,0 +1,426 @@ +/** + * @fileoverview UTF8 encoding and decoding routines + */ +goog.provide('jspb.binary.utf8'); + +goog.require('jspb.asserts'); + + +/** + * Whether to use the browser based `TextEncoder` and `TextDecoder` APIs for + * handling utf8. + * + *

Enabled by default for `goog.FEATURESET_YEAR >= 2020`. The code also + * performs feature detection for this API and will always use it if available, + * this variable enables us to not ship the polyfill. + * + *

See http://go/jscompiler-flags#browser-featureset-year-options for the + * behavior here. + * + * @define {boolean} + */ +const USE_TEXT_ENCODING = + goog.define('jspb.binary.USE_TEXTENCODING', goog.FEATURESET_YEAR >= 2020); + +const /** number */ MIN_SURROGATE = 0xD800; +const /** number */ MIN_HIGH_SURROGATE = MIN_SURROGATE; +const /** number */ MAX_HIGH_SURROGATE = 0xDBFF; +const /** number */ MIN_LOW_SURROGATE = 0xDC00; +const /** number */ MAX_LOW_SURROGATE = 0xDFFF; +const /** number */ MAX_SURROGATE = MAX_LOW_SURROGATE; + +/** + * Returns whether the byte is not a valid continuation of the form + * '10XXXXXX'. + * @return {boolean} + */ +function isNotTrailingByte(/** number */ byte) { + // 0xC0 is '11000000' in binary + // 0x80 is '10000000' in binary + return (byte & 0xC0) !== 0x80; +} + + +/** + * Either throws an error or appends a replacement codepoint of invalid utf8 + */ +function invalid( + /** boolean */ parsingErrorsAreFatal, /** !Array */ codeUnits) { + if (parsingErrorsAreFatal) { + throw new Error('Invalid UTF8'); + } + codeUnits.push(0xFFFD); // utf8 replacement character +} + +/** @return {string} */ +function codeUnitsToString( + /** string? */ accum, /** !Array */ utf16CodeUnits) { + const suffix = String.fromCharCode.apply(null, utf16CodeUnits); + return accum == null ? suffix : accum + suffix; +} + +/** + * Our handwritten UTF8 decoder. + * + * https://en.wikipedia.org/wiki/UTF-8#Encoding describes the bit layout + * + * https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling + * describes important cases to check for which are namely: + * - overlong encodings, meaning a value expressable in N bytes could have been + * expressed in fewer bytes + * - invalid bytes, meaning bytes that are generally out of range + * - surrogate codepoints, utf8 never encodes directly a utf16 surrogate value + * - underflow where there aren't enough bytes for the sequence we are parsing + * - out of range codepoints. 
+ * + * @return {string} + */ +jspb.binary.utf8.polyfillDecodeUtf8 = function ( + /** !Uint8Array */ bytes, /** number */ offset, /** number */ length, + /** boolean */ parsingErrorsAreFatal) { + let cursor = offset; + const end = cursor + length; + const codeUnits = []; + let result = null; + + // This is significantly slower than the TextDecoder implementation. + // Ideas for improving performance: + // 1. Reduce branching with non-shortcircuting operators, e.g. + // https://stackoverflow.com/q/5652363 + // 2. improve isNotTrailingByte using xor? + // 3. consider having a dedicate ascii loop (java impls do this) + let c1, c2, c3, c4; + while (cursor < end) { + c1 = bytes[cursor++]; + if (c1 < 0x80) { // Regular 7-bit ASCII. + codeUnits.push(c1); + } else if (c1 < 0xE0) { // UTF-8 with two bytes. + if (cursor >= end) { + invalid(parsingErrorsAreFatal, codeUnits); + } else { + c2 = bytes[cursor++]; + // Make sure that c1 is a valid leading byte and c2 is a valid + // trailing byte + // 0xC2 is '11000010', if c1 is less than this then we have an overlong + // encoding because there would only be 7 significant bits. + if (c1 < 0xC2 || isNotTrailingByte(c2)) { + cursor--; // push c2 back since it isn't 'accepted' + invalid(parsingErrorsAreFatal, codeUnits); + } else { + // The codeUnit is the lower 6 bits from c2 and the lower 5 bits from + // c1 + const codeUnit = ((c1 & 0x1F) << 6) | (c2 & 0x3F); + // Consistency check that the computed code is in range for a 2 byte + // sequence. + jspb.asserts.assert(codeUnit >= 0x80 && codeUnit <= 0x07FF); + codeUnits.push(codeUnit); + } + } + } else if (c1 < 0xF0) { // UTF-8 with three bytes. + if (cursor >= end - 1) { + invalid(parsingErrorsAreFatal, codeUnits); + } else { + c2 = bytes[cursor++]; + if (isNotTrailingByte(c2) || + // These checks were taken from + // java/com/google/protobuf/Utf8.java + // overlong? 
5 most significant bits must not all be zero + (c1 === 0xE0 && c2 < 0xA0) + // check for illegal surrogate codepoints + || (c1 === 0xED && c2 >= 0xA0) || + // We delay reading c3 until now so than an error in c2 or c1 will + // preserve c3 for the next loop iteration + isNotTrailingByte(c3 = bytes[cursor++])) { + cursor--; // push back c2 or c3, depending on how far we made it + invalid(parsingErrorsAreFatal, codeUnits); + } else { + // 4 bits from the first byte + // 6 bits from each of the two lower bytes + // == 16 bits total + const codeUnit = + ((c1 & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + // Consistency check, this is the valid range for a 3 byte character + jspb.asserts.assert(codeUnit >= 0x800 && codeUnit <= 0xFFFF); + // And that Utf16 surrogates are disallowed + jspb.asserts.assert(codeUnit < MIN_SURROGATE || codeUnit > MAX_SURROGATE); + codeUnits.push(codeUnit); + } + } + } else if (c1 <= 0xF4) { // UTF-8 with 4 bytes. + // 0xF8 matches the bitpattern for utf8 with 4 bytes, but all leading + // bytes > 0xF4 are either overlong encodings or exceed the valid range. + if (cursor >= end - 2) { + invalid(parsingErrorsAreFatal, codeUnits); + } else { + c2 = bytes[cursor++]; + if (isNotTrailingByte(c2) || + // This check was inspired by + // java/com/google/protobuf/Utf8.java + // Tricky optimized form of: + // valid 4-byte leading byte? + // if (byte1 > (byte) 0xF4 || + // overlong? 4 most significant bits must not all be zero + // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || + // codepoint larger than the highest code point (U+10FFFF)? + // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) + (((c1 << 28) + (c2 - 0x90)) >> 30) !== 0 || + // We delay reading c3 and c4 until now so than an error in c2 or c1 + // will preserve them for the next loop iteration. 
+ isNotTrailingByte(c3 = bytes[cursor++]) || + isNotTrailingByte(c4 = bytes[cursor++])) { + cursor--; // push back c2, c3 or c4 depending on how far we made it + invalid(parsingErrorsAreFatal, codeUnits); + } else { + // Characters written on 4 bytes have 21 bits for a codepoint. + // We can't fit that on 16bit characters, so we use surrogates. + // 3 bits from the uppermost byte, 6 bits from each of the lower 3 + // bytes. This is 21 bits which is too big for a 16 bit utf16 code + // unit so we use surrogates. + let codepoint = ((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12) | + ((c3 & 0x3F) << 6) | (c4 & 0x3F); + // Consistency check, this is the valid range for a 4 byte character. + jspb.asserts.assert(codepoint >= 0x10000 && codepoint <= 0x10FFFF); + // Surrogates formula from wikipedia. + // 1. Subtract 0x10000 from codepoint + codepoint -= 0x10000; + // 2. Split this into the high 10-bit value and the low 10-bit value + // 3. Add 0xD800 to the high value to form the high surrogate + // 4. Add 0xDC00 to the low value to form the low surrogate: + const low = (codepoint & 0x3FF) + MIN_LOW_SURROGATE; + const high = ((codepoint >> 10) & 0x3FF) + MIN_HIGH_SURROGATE; + codeUnits.push(high, low); + } + } + } else { + // initial byte is too large for utf8 + invalid(parsingErrorsAreFatal, codeUnits); + } + // Accumulate as we go to avoid exceeding the maximum stack size when + // calling `apply`. + if (codeUnits.length >= 8192) { + result = codeUnitsToString(result, codeUnits); + codeUnits.length = 0; + } + } + // ensure we don't overflow or underflow + jspb.asserts.assert(cursor === end, `expected ${cursor} === ${end}`); + return codeUnitsToString(result, codeUnits); +} + + +/** @type {boolean|undefined} */ +let isFatalTextDecoderCachableAfterThrowing_ = + // chrome version >= 2020 are not subject to https://crbug.com/910292 + goog.FEATURESET_YEAR >= 2020 ? 
true : undefined; + +/** @return {boolean} */ +function isFatalTextDecoderCachableAfterThrowing(/** !TextDecoder */ decoder) { + // Test if the decoder is subject to https://crbug.com/910292 + // chrome versions with this bug cause one failed decode to cause all later + // decodes to throw. + if (isFatalTextDecoderCachableAfterThrowing_ === undefined) { + // In theory we shouldn't need to generate an error here since this function + // is only called in the context of a failed decode. However, the buggy + // chrome versions are not 'consistent' in corrupting their internal state + // since it depends on where in the decode stream the error occurs. This + // error however does consistently trigger the bug based on manual testing. + try { + // A lonely continuation byte + decoder.decode(new Uint8Array([0x80])); + } catch (e) { + // expected + } + try { + // 'a' in hex + decoder.decode(new Uint8Array([0x61])); + isFatalTextDecoderCachableAfterThrowing_ = true; + } catch (e) { + // This decode should not throw, if it does it means our chrome version + // is buggy and we need to flush our cached decoder when failures occur + isFatalTextDecoderCachableAfterThrowing_ = false; + } + } + return isFatalTextDecoderCachableAfterThrowing_; +} + +/** @type {!TextDecoder|undefined} */ +let fatalDecoderInstance; + +/** @return {!TextDecoder}*/ +function getFatalDecoderInstance() { + let instance = fatalDecoderInstance; + if (!instance) { + instance = fatalDecoderInstance = new TextDecoder('utf-8', { fatal: true }); + } + return instance; +} + +/** @type {!TextDecoder|undefined} */ +let nonFatalDecoderInstance; + +/** @return {!TextDecoder}*/ +function getNonFatalDecoderInstance() { + let instance = nonFatalDecoderInstance; + if (!instance) { + instance = nonFatalDecoderInstance = + new TextDecoder('utf-8', { fatal: false }); + } + return instance; +} + +/** + * A `subarray` implementation that avoids calling `subarray` if it isn't needed + * + * `subarray` tends to be surprisingly 
slow. + * @return {!Uint8Array} + */ +function subarray( + /** !Uint8Array*/ bytes, /** number */ offset, /** number */ end) { + return offset === 0 && end === bytes.length ? bytes : + bytes.subarray(offset, end); +} + +/** + * @return {string} + */ +jspb.binary.utf8.textDecoderDecodeUtf8 = function ( + /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length, + /** boolean*/ parsingErrorsAreFatal) { + const /** !TextDecoder */ decoder = parsingErrorsAreFatal ? + getFatalDecoderInstance() : + getNonFatalDecoderInstance(); + + bytes = subarray(bytes, offset, offset + length); + try { + return decoder.decode(bytes); + } catch (e) { + if (parsingErrorsAreFatal && + !isFatalTextDecoderCachableAfterThrowing(decoder)) { + fatalDecoderInstance = undefined; + } + throw e; + } +} + +/** @const {boolean} */ +const useTextDecoderDecode = + USE_TEXT_ENCODING || typeof TextDecoder !== 'undefined'; + +/** + * A utf8 decoding routine either based upon TextDecoder if available or using + * our polyfill implementation + * @return {string} + */ +jspb.binary.utf8.decodeUtf8 = function ( + /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length, + /** boolean*/ parsingErrorsAreFatal) { + return useTextDecoderDecode ? 
+ jspb.binary.utf8.textDecoderDecodeUtf8(bytes, offset, length, parsingErrorsAreFatal) : + jspb.binary.utf8.polyfillDecodeUtf8(bytes, offset, length, parsingErrorsAreFatal); +} + +/** @type {!TextEncoder|undefined} */ +let textEncoderInstance; + +/** @return {!Uint8Array} */ +jspb.binary.utf8.textEncoderEncode = function ( + /** string */ s, /** boolean */ rejectUnpairedSurrogates) { + if (rejectUnpairedSurrogates) { + jspb.binary.utf8.checkWellFormed(s); + } + + if (!textEncoderInstance) { + textEncoderInstance = new TextEncoder(); + } + return textEncoderInstance.encode(s); +} + +// isWellFormed landed in major browsers in early 2023 so it will only be +// definitely available in 2024. See +// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/isWellFormed +const /** boolean */ HAS_WELL_FORMED_METHOD = goog.FEATURESET_YEAR > 2023 || + typeof String.prototype.isWellFormed === 'function'; + +jspb.binary.utf8.checkWellFormed = function (/** string */ text) { + if (HAS_WELL_FORMED_METHOD ? + // Externs don't contain the definition of this function yet. + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/isWellFormed + !(/** @type{{isWellFormed:function():boolean}}*/ ( + /** @type {?} */ (text)) + .isWellFormed()) : + /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/ + .test(text)) { + throw new Error('Found an unpaired surrogate'); + } +} + + +/** @return {!Uint8Array} */ +jspb.binary.utf8.polyfillEncode = function ( + /** string */ s, /** boolean */ rejectUnpairedSurrogates) { + let bi = 0; + // The worse case is that every character requires 3 output bytes, so we + // allocate for this. This assumes that the buffer will be short lived. 
+ // Callers can always `slice` if needed + const buffer = new Uint8Array(3 * s.length); + for (let ci = 0; ci < s.length; ci++) { + let c = s.charCodeAt(ci); + if (c < 0x80) { + buffer[bi++] = c; + } else if (c < 0x800) { + buffer[bi++] = (c >> 6) | 0xC0; + buffer[bi++] = (c & 63) | 0x80; + } else { + jspb.asserts.assert(c < 65536); + // Look for surrogates + // First check if it is surrogate range + if (c >= MIN_SURROGATE && c <= MAX_SURROGATE) { + // is it a high surrogate? + if (c <= MAX_HIGH_SURROGATE && ci < s.length) { + const c2 = s.charCodeAt(++ci); + if (c2 >= MIN_LOW_SURROGATE && c2 <= MAX_LOW_SURROGATE) { + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + const codePoint = + (c - MIN_SURROGATE) * 0x400 + c2 - MIN_LOW_SURROGATE + 0x10000; + buffer[bi++] = (codePoint >> 18) | 0xF0; + buffer[bi++] = ((codePoint >> 12) & 63) | 0x80; + buffer[bi++] = ((codePoint >> 6) & 63) | 0x80; + buffer[bi++] = (codePoint & 63) | 0x80; + continue; + } else { + // else c2 not in low surrogate range, treat c as a lone surrogate + // and back up ci so we process c2 on the next loop as an + // independent character + ci--; + } + } // else c not a high surrogate + if (rejectUnpairedSurrogates) { + throw new Error('Found an unpaired surrogate'); + } + c = 0xFFFD; // Error! Unpaired surrogate + } + buffer[bi++] = (c >> 12) | 0xE0; + buffer[bi++] = ((c >> 6) & 63) | 0x80; + buffer[bi++] = (c & 63) | 0x80; + } + } + return subarray(buffer, 0, bi); +} + +/** @const {boolean} */ +const useTextEncoderEncode = + (USE_TEXT_ENCODING || typeof TextEncoder !== 'undefined'); + +/** + * A utf8 encoding routine either based upon TextEncoder if available or using + * our polyfill implementation + * @return {!Uint8Array} + */ +jspb.binary.utf8.encodeUtf8 = function ( + /**string*/ string, /** boolean=*/ rejectUnpairedSurrogates = false) { + jspb.asserts.assertString(string); + return useTextEncoderEncode ? 
+ jspb.binary.utf8.textEncoderEncode(string, rejectUnpairedSurrogates) : + jspb.binary.utf8.polyfillEncode(string, rejectUnpairedSurrogates); +} + diff --git a/generator/js_generator.cc b/generator/js_generator.cc index 84365dc..6ca0ad3 100644 --- a/generator/js_generator.cc +++ b/generator/js_generator.cc @@ -1073,17 +1073,21 @@ std::string JSFieldTypeAnnotation(const GeneratorOptions& options, return jstype; } -std::string JSBinaryReaderMethodType(const FieldDescriptor* field) { +std::string JSBinaryMethodType(const FieldDescriptor* field, bool is_writer) { std::string name = field->type_name(); if (name[0] >= 'a' && name[0] <= 'z') { name[0] = (name[0] - 'a') + 'A'; } + if (!is_writer && field->type() == FieldDescriptor::TYPE_STRING && + field->requires_utf8_validation()) { + name = name + "RequireUtf8"; + } return IsIntegralFieldWithStringJSType(field) ? (name + "String") : name; } std::string JSBinaryReadWriteMethodName(const FieldDescriptor* field, bool is_writer) { - std::string name = JSBinaryReaderMethodType(field); + std::string name = JSBinaryMethodType(field, is_writer); if (field->is_packed()) { name = "Packed" + name; } else if (is_writer && field->is_repeated()) { @@ -3128,11 +3132,11 @@ void Generator::GenerateClassDeserializeBinaryField( printer->Print( " var values = /** @type {$fieldtype$} */ " "(reader.isDelimited() " - "? reader.readPacked$reader$() : [reader.read$reader$()]);\n", + "? 
reader.readPacked$reader$() : [reader.read$reader$()]);\n", "fieldtype", JSFieldTypeAnnotation(options, field, false, true, /* singular_if_not_packed */ false, BYTES_U8), - "reader", JSBinaryReaderMethodType(field)); + "reader", JSBinaryMethodType(field, /* is_writer=*/false)); } else { printer->Print( " var value = /** @type {$fieldtype$} */ " diff --git a/gulpfile.js b/gulpfile.js index e7f7511..426e0f1 100644 --- a/gulpfile.js +++ b/gulpfile.js @@ -145,6 +145,7 @@ function getClosureCompilerCommand(exportsFile, outputFile) { '--js=binary/decoder.js', '--js=binary/encoder.js', '--js=binary/reader.js', + '--js=binary/utf8.js', '--js=binary/utils.js', '--js=binary/writer.js', `--js=${exportsFile}`, @@ -194,7 +195,7 @@ function commonjs_out(cb) { function closure_make_deps(cb) { exec( - './node_modules/.bin/closure-make-deps --closure-path=. --file=node_modules/google-closure-library/closure/goog/deps.js binary/arith.js binary/constants.js binary/decoder.js binary/encoder.js binary/reader.js binary/utils.js binary/writer.js asserts.js debug.js map.js message.js node_loader.js test_bootstrap.js > deps.js', + './node_modules/.bin/closure-make-deps --closure-path=. --file=node_modules/google-closure-library/closure/goog/deps.js binary/arith.js binary/constants.js binary/decoder.js binary/encoder.js binary/reader.js binary/utf8.js binary/utils.js binary/writer.js asserts.js debug.js map.js message.js node_loader.js test_bootstrap.js > deps.js', make_exec_logging_callback(cb)); }