From 993b707305b6339042a2e7127880c6836ad00ecb Mon Sep 17 00:00:00 2001 From: Stephanie DiBenedetto Date: Thu, 15 Feb 2024 17:05:28 +0000 Subject: [PATCH 1/2] Port fix: Fix JSPB binary utf8 decoding to be spec compliant. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Our prior behavior was extremely undefined when confronted with errors, it would read out of bounds, accept overlong encodings, skip over out of range bytes, compose out of range codepoints. The new implementation always detects and handles errors consistently by either throwing or using replacement characters (� aka \uFFFD) This also adds support for aligning with the proto3 spec to the code generator which requires that parsing fail for proto3 messages with invalid utf8 payloads for string fields. For now, actual failing is disabled via the goog.define jspb.binary.ENFORCE_UTF8 which is set to NEVER. A future change will flip this to DEFAULT. --- binary/decoder.js | 101 ++++----- binary/decoder_test.js | 10 +- binary/reader.js | 41 +++- binary/utf8.js | 426 ++++++++++++++++++++++++++++++++++++++ generator/js_generator.cc | 12 +- gulpfile.js | 3 +- 6 files changed, 517 insertions(+), 76 deletions(-) create mode 100644 binary/utf8.js diff --git a/binary/decoder.js b/binary/decoder.js index 1186add..c44053f 100644 --- a/binary/decoder.js +++ b/binary/decoder.js @@ -47,7 +47,7 @@ goog.provide('jspb.BinaryDecoder'); goog.require('jspb.asserts'); -goog.require('goog.crypt'); +goog.require('jspb.binary.utf8'); goog.require('jspb.utils'); @@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) { */ jspb.BinaryDecoder.prototype.advance = function(count) { this.cursor_ += count; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); }; @@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) { return convert(lowBits, highBits); }; +/** + * Asserts that our cursor is in bounds. 
+ * + * @private + * @return {void} + */ +jspb.BinaryDecoder.prototype.checkCursor = function () { + if (this.cursor_ > this.end_) { + jspb.asserts.fail('Read past the end ' + this.cursor_ + ' > ' + this.end_); + } +} /** * Skips over a varint in the block without decoding it. @@ -452,7 +463,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { var x = (temp & 0x7F); if (temp < 128) { this.cursor_ += 1; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -460,7 +471,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { x |= (temp & 0x7F) << 7; if (temp < 128) { this.cursor_ += 2; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -468,7 +479,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { x |= (temp & 0x7F) << 14; if (temp < 128) { this.cursor_ += 3; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -476,7 +487,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { x |= (temp & 0x7F) << 21; if (temp < 128) { this.cursor_ += 4; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; } @@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { // We're reading the high bits of an unsigned varint. The byte we just read // also contains bits 33 through 35, which we're going to discard. 
this.cursor_ += 5; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x >>> 0; } @@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() { jspb.asserts.assert(false); } - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return x; }; @@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() { jspb.BinaryDecoder.prototype.readUint8 = function() { var a = this.bytes_[this.cursor_ + 0]; this.cursor_ += 1; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return a; }; @@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() { var a = this.bytes_[this.cursor_ + 0]; var b = this.bytes_[this.cursor_ + 1]; this.cursor_ += 2; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (a << 0) | (b << 8); }; @@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() { var c = this.bytes_[this.cursor_ + 2]; var d = this.bytes_[this.cursor_ + 3]; this.cursor_ += 4; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0; }; @@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() { jspb.BinaryDecoder.prototype.readInt8 = function() { var a = this.bytes_[this.cursor_ + 0]; this.cursor_ += 1; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (a << 24) >> 24; }; @@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() { var a = this.bytes_[this.cursor_ + 0]; var b = this.bytes_[this.cursor_ + 1]; this.cursor_ += 2; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return (((a << 0) | (b << 8)) << 16) >> 16; }; @@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() { var c = this.bytes_[this.cursor_ + 2]; var d = this.bytes_[this.cursor_ + 3]; this.cursor_ += 4; - jspb.asserts.assert(this.cursor_ <= this.end_); + 
this.checkCursor(); return (a << 0) | (b << 8) | (c << 16) | (d << 24); }; @@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() { * @export */ jspb.BinaryDecoder.prototype.readBool = function() { - return !!this.bytes_[this.cursor_++]; + const b = !!this.bytes_[this.cursor_++]; + this.checkCursor(); + return b; }; @@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() { * Supports codepoints from U+0000 up to U+10FFFF. * (http://en.wikipedia.org/wiki/UTF-8). * @param {number} length The length of the string to read. + * @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found. * @return {string} The decoded string. * @export */ -jspb.BinaryDecoder.prototype.readString = function(length) { - var bytes = this.bytes_; - var cursor = this.cursor_; - var end = cursor + length; - var codeUnits = []; - - var result = ''; - while (cursor < end) { - var c = bytes[cursor++]; - if (c < 128) { // Regular 7-bit ASCII. - codeUnits.push(c); - } else if (c < 192) { - // UTF-8 continuation mark. We are out of sync. This - // might happen if we attempted to read a character - // with more than four bytes. - continue; - } else if (c < 224) { // UTF-8 with two bytes. - var c2 = bytes[cursor++]; - codeUnits.push(((c & 31) << 6) | (c2 & 63)); - } else if (c < 240) { // UTF-8 with three bytes. - var c2 = bytes[cursor++]; - var c3 = bytes[cursor++]; - codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63)); - } else if (c < 248) { // UTF-8 with 4 bytes. - var c2 = bytes[cursor++]; - var c3 = bytes[cursor++]; - var c4 = bytes[cursor++]; - // Characters written on 4 bytes have 21 bits for a codepoint. - // We can't fit that on 16bit characters, so we use surrogates. - var codepoint = - ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63); - // Surrogates formula from wikipedia. - // 1. Subtract 0x10000 from codepoint - codepoint -= 0x10000; - // 2. 
Split this into the high 10-bit value and the low 10-bit value - // 3. Add 0xD800 to the high value to form the high surrogate - // 4. Add 0xDC00 to the low value to form the low surrogate: - var low = (codepoint & 1023) + 0xDC00; - var high = ((codepoint >> 10) & 1023) + 0xD800; - codeUnits.push(high, low); - } - // Avoid exceeding the maximum stack size when calling `apply`. - if (codeUnits.length >= 8192) { - result += String.fromCharCode.apply(null, codeUnits); - codeUnits.length = 0; - } - } - result += goog.crypt.byteArrayToString(codeUnits); - this.cursor_ = cursor; +jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) { + const cursor = this.cursor_; + this.cursor_ += length; + this.checkCursor(); + const result = + jspb.binary.utf8.decodeUtf8(jspb.asserts.assert(this.bytes_), cursor, length, requireUtf8); return result; }; @@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) { var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length); this.cursor_ += length; - jspb.asserts.assert(this.cursor_ <= this.end_); + this.checkCursor(); return result; }; diff --git a/binary/decoder_test.js b/binary/decoder_test.js index 77f6877..b8e1084 100644 --- a/binary/decoder_test.js +++ b/binary/decoder_test.js @@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => { const decoder = jspb.BinaryDecoder.alloc(encoder.end()); - expect(decoder.readString(len)).toEqual(long_string); + expect(decoder.readString(len, true)).toEqual(long_string); }); /** @@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => { const decoder = jspb.BinaryDecoder.alloc(encoder.end()); - expect(decoder.readString(ascii.length)).toEqual(ascii); - expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length)); + expect(decoder.readString(ascii.length, true)).toEqual(ascii); + expect(utf8_two_bytes).toEqual(decoder.readString(2, true)); expect(utf8_three_bytes) - .toEqual(decoder.readString(utf8_three_bytes.length)); - 
expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length)); + .toEqual(decoder.readString(3, true)); + expect(utf8_four_bytes).toEqual(decoder.readString(4, true)); }); /** diff --git a/binary/reader.js b/binary/reader.js index 7be3b58..0f8c961 100644 --- a/binary/reader.js +++ b/binary/reader.js @@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants'); goog.require('jspb.BinaryDecoder'); goog.require('jspb.utils'); +/** + * Whether to enforce that string fields are valid utf8. + * + *

Currently set to `ALWAYS`; can be set to `DEPRECATED_PROTO3_ONLY` to only + * enforce utf8 for proto3 string fields. In that mode, proto2 string fields use + * replacement characters when encoding errors are found. + * + *

TODO: Remove the flag, simplify BinaryReader to remove + * readStringRequireUtf8 and related support in the code generator et. al. + * + * @define {string} + */ +const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS'); + +// Constrain the set of values to only these two. +jspb.asserts.assert( + ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY' || ENFORCE_UTF8 === 'ALWAYS'); + +const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS'; + /** @@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() { * @export */ jspb.BinaryReader.prototype.readString = function() { + // delegate to the other reader so that inlining can eliminate this method + // in the common case. + if (UTF8_PARSING_ERRORS_ARE_FATAL) { + return this.readStringRequireUtf8(); + } + jspb.asserts.assert( this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED); var length = this.decoder_.readUnsignedVarint32(); - return this.decoder_.readString(length); + return this.decoder_.readString(length, /*requireUtf8=*/ false); +}; + +/** + * Reads a string field from the binary stream, or throws an error if the next + * field in the stream is not of the correct wire type, or if the string is + * not valid utf8. + * + * @return {string} The value of the string field. + */ +jspb.BinaryReader.prototype.readStringRequireUtf8 = function () { + jspb.asserts.assert(this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED); + const length = this.decoder_.readUnsignedVarint32(); + return this.decoder_.readString(length, /*requireUtf8=*/ true); }; diff --git a/binary/utf8.js b/binary/utf8.js new file mode 100644 index 0000000..b77602b --- /dev/null +++ b/binary/utf8.js @@ -0,0 +1,426 @@ +/** + * @fileoverview UTF8 encoding and decoding routines + */ +goog.provide('jspb.binary.utf8'); + +goog.require('jspb.asserts'); + + +/** + * Whether to use the browser based `TextEncoder` and `TextDecoder` APIs for + * handling utf8. + * + *

Enabled by default for `goog.FEATURESET_YEAR >= 2020`. The code also + * performs feature detection for this API and will always use it if available; + * this variable enables us to not ship the polyfill. + * + *

See http://go/jscompiler-flags#browser-featureset-year-options for the + * behavior here. + * + * @define {boolean} + */ +const USE_TEXT_ENCODING = + goog.define('jspb.binary.USE_TEXTENCODING', goog.FEATURESET_YEAR >= 2020); + +const /** number */ MIN_SURROGATE = 0xD800; +const /** number */ MIN_HIGH_SURROGATE = MIN_SURROGATE; +const /** number */ MAX_HIGH_SURROGATE = 0xDBFF; +const /** number */ MIN_LOW_SURROGATE = 0xDC00; +const /** number */ MAX_LOW_SURROGATE = 0xDFFF; +const /** number */ MAX_SURROGATE = MAX_LOW_SURROGATE; + +/** + * Returns whether the byte is not a valid continuation of the form + * '10XXXXXX'. + * @return {boolean} + */ +function isNotTrailingByte(/** number */ byte) { + // 0xC0 is '11000000' in binary + // 0x80 is '10000000' in binary + return (byte & 0xC0) !== 0x80; +} + + +/** + * Either throws an error or appends a replacement codepoint of invalid utf8 + */ +function invalid( + /** boolean */ parsingErrorsAreFatal, /** !Array */ codeUnits) { + if (parsingErrorsAreFatal) { + throw new Error('Invalid UTF8'); + } + codeUnits.push(0xFFFD); // utf8 replacement character +} + +/** @return {string} */ +function codeUnitsToString( + /** string? */ accum, /** !Array */ utf16CodeUnits) { + const suffix = String.fromCharCode.apply(null, utf16CodeUnits); + return accum == null ? suffix : accum + suffix; +} + +/** + * Our handwritten UTF8 decoder. + * + * https://en.wikipedia.org/wiki/UTF-8#Encoding describes the bit layout + * + * https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling + * describes important cases to check for which are namely: + * - overlong encodings, meaning a value expressable in N bytes could have been + * expressed in fewer bytes + * - invalid bytes, meaning bytes that are generally out of range + * - surrogate codepoints, utf8 never encodes directly a utf16 surrogate value + * - underflow where there aren't enough bytes for the sequence we are parsing + * - out of range codepoints. 
+ * + * @return {string} + */ +jspb.binary.utf8.polyfillDecodeUtf8 = function ( + /** !Uint8Array */ bytes, /** number */ offset, /** number */ length, + /** boolean */ parsingErrorsAreFatal) { + let cursor = offset; + const end = cursor + length; + const codeUnits = []; + let result = null; + + // This is significantly slower than the TextDecoder implementation. + // Ideas for improving performance: + // 1. Reduce branching with non-shortcircuting operators, e.g. + // https://stackoverflow.com/q/5652363 + // 2. improve isNotTrailingByte using xor? + // 3. consider having a dedicate ascii loop (java impls do this) + let c1, c2, c3, c4; + while (cursor < end) { + c1 = bytes[cursor++]; + if (c1 < 0x80) { // Regular 7-bit ASCII. + codeUnits.push(c1); + } else if (c1 < 0xE0) { // UTF-8 with two bytes. + if (cursor >= end) { + invalid(parsingErrorsAreFatal, codeUnits); + } else { + c2 = bytes[cursor++]; + // Make sure that c1 is a valid leading byte and c2 is a valid + // trailing byte + // 0xC2 is '11000010', if c1 is less than this then we have an overlong + // encoding because there would only be 7 significant bits. + if (c1 < 0xC2 || isNotTrailingByte(c2)) { + cursor--; // push c2 back since it isn't 'accepted' + invalid(parsingErrorsAreFatal, codeUnits); + } else { + // The codeUnit is the lower 6 bits from c2 and the lower 5 bits from + // c1 + const codeUnit = ((c1 & 0x1F) << 6) | (c2 & 0x3F); + // Consistency check that the computed code is in range for a 2 byte + // sequence. + jspb.asserts.assert(codeUnit >= 0x80 && codeUnit <= 0x07FF); + codeUnits.push(codeUnit); + } + } + } else if (c1 < 0xF0) { // UTF-8 with three bytes. + if (cursor >= end - 1) { + invalid(parsingErrorsAreFatal, codeUnits); + } else { + c2 = bytes[cursor++]; + if (isNotTrailingByte(c2) || + // These checks were taken from + // java/com/google/protobuf/Utf8.java + // overlong? 
5 most significant bits must not all be zero + (c1 === 0xE0 && c2 < 0xA0) + // check for illegal surrogate codepoints + || (c1 === 0xED && c2 >= 0xA0) || + // We delay reading c3 until now so than an error in c2 or c1 will + // preserve c3 for the next loop iteration + isNotTrailingByte(c3 = bytes[cursor++])) { + cursor--; // push back c2 or c3, depending on how far we made it + invalid(parsingErrorsAreFatal, codeUnits); + } else { + // 4 bits from the first byte + // 6 bits from each of the two lower bytes + // == 16 bits total + const codeUnit = + ((c1 & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + // Consistency check, this is the valid range for a 3 byte character + jspb.asserts.assert(codeUnit >= 0x800 && codeUnit <= 0xFFFF); + // And that Utf16 surrogates are disallowed + jspb.asserts.assert(codeUnit < MIN_SURROGATE || codeUnit > MAX_SURROGATE); + codeUnits.push(codeUnit); + } + } + } else if (c1 <= 0xF4) { // UTF-8 with 4 bytes. + // 0xF8 matches the bitpattern for utf8 with 4 bytes, but all leading + // bytes > 0xF4 are either overlong encodings or exceed the valid range. + if (cursor >= end - 2) { + invalid(parsingErrorsAreFatal, codeUnits); + } else { + c2 = bytes[cursor++]; + if (isNotTrailingByte(c2) || + // This check was inspired by + // java/com/google/protobuf/Utf8.java + // Tricky optimized form of: + // valid 4-byte leading byte? + // if (byte1 > (byte) 0xF4 || + // overlong? 4 most significant bits must not all be zero + // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || + // codepoint larger than the highest code point (U+10FFFF)? + // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) + (((c1 << 28) + (c2 - 0x90)) >> 30) !== 0 || + // We delay reading c3 and c4 until now so than an error in c2 or c1 + // will preserve them for the next loop iteration. 
+ isNotTrailingByte(c3 = bytes[cursor++]) || + isNotTrailingByte(c4 = bytes[cursor++])) { + cursor--; // push back c2, c3 or c4 depending on how far we made it + invalid(parsingErrorsAreFatal, codeUnits); + } else { + // Characters written on 4 bytes have 21 bits for a codepoint. + // We can't fit that on 16bit characters, so we use surrogates. + // 3 bits from the uppermost byte, 6 bits from each of the lower 3 + // bytes. This is 21 bits which is too big for a 16 bit utf16 code + // unit so we use surrogates. + let codepoint = ((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12) | + ((c3 & 0x3F) << 6) | (c4 & 0x3F); + // Consistency check, this is the valid range for a 4 byte character. + jspb.asserts.assert(codepoint >= 0x10000 && codepoint <= 0x10FFFF); + // Surrogates formula from wikipedia. + // 1. Subtract 0x10000 from codepoint + codepoint -= 0x10000; + // 2. Split this into the high 10-bit value and the low 10-bit value + // 3. Add 0xD800 to the high value to form the high surrogate + // 4. Add 0xDC00 to the low value to form the low surrogate: + const low = (codepoint & 0x3FF) + MIN_LOW_SURROGATE; + const high = ((codepoint >> 10) & 0x3FF) + MIN_HIGH_SURROGATE; + codeUnits.push(high, low); + } + } + } else { + // initial byte is too large for utf8 + invalid(parsingErrorsAreFatal, codeUnits); + } + // Accumulate as we go to avoid exceeding the maximum stack size when + // calling `apply`. + if (codeUnits.length >= 8192) { + result = codeUnitsToString(result, codeUnits); + codeUnits.length = 0; + } + } + // ensure we don't overflow or underflow + jspb.asserts.assert(cursor === end, `expected ${cursor} === ${end}`); + return codeUnitsToString(result, codeUnits); +} + + +/** @type {boolean|undefined} */ +let isFatalTextDecoderCachableAfterThrowing_ = + // chrome version >= 2020 are not subject to https://crbug.com/910292 + goog.FEATURESET_YEAR >= 2020 ? 
true : undefined; + +/** @return {boolean} */ +function isFatalTextDecoderCachableAfterThrowing(/** !TextDecoder */ decoder) { + // Test if the decoder is subject to https://crbug.com/910292 + // chrome versions with this bug cause one failed decode to cause all later + // decodes to throw. + if (isFatalTextDecoderCachableAfterThrowing_ === undefined) { + // In theory we shouldn't need to generate an error here since this function + // is only called in the context of a failed decode. However, the buggy + // chrome versions are not 'consistent' in corrupting their internal state + // since it depends on where in the decode stream the error occurs. This + // error however does consistently trigger the bug based on manual testing. + try { + // A lonely continuation byte + decoder.decode(new Uint8Array([0x80])); + } catch (e) { + // expected + } + try { + // 'a' in hex + decoder.decode(new Uint8Array([0x61])); + isFatalTextDecoderCachableAfterThrowing_ = true; + } catch (e) { + // This decode should not throw, if it does it means our chrome version + // is buggy and we need to flush our cached decoder when failures occur + isFatalTextDecoderCachableAfterThrowing_ = false; + } + } + return isFatalTextDecoderCachableAfterThrowing_; +} + +/** @type {!TextDecoder|undefined} */ +let fatalDecoderInstance; + +/** @return {!TextDecoder}*/ +function getFatalDecoderInstance() { + let instance = fatalDecoderInstance; + if (!instance) { + instance = fatalDecoderInstance = new TextDecoder('utf-8', { fatal: true }); + } + return instance; +} + +/** @type {!TextDecoder|undefined} */ +let nonFatalDecoderInstance; + +/** @return {!TextDecoder}*/ +function getNonFatalDecoderInstance() { + let instance = nonFatalDecoderInstance; + if (!instance) { + instance = nonFatalDecoderInstance = + new TextDecoder('utf-8', { fatal: false }); + } + return instance; +} + +/** + * A `subarray` implementation that avoids calling `subarray` if it isn't needed + * + * `subarray` tends to be surprisingly 
slow. + * @return {!Uint8Array} + */ +function subarray( + /** !Uint8Array*/ bytes, /** number */ offset, /** number */ end) { + return offset === 0 && end === bytes.length ? bytes : + bytes.subarray(offset, end); +} + +/** + * @return {string} + */ +jspb.binary.utf8.textDecoderDecodeUtf8 = function ( + /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length, + /** boolean*/ parsingErrorsAreFatal) { + const /** !TextDecoder */ decoder = parsingErrorsAreFatal ? + getFatalDecoderInstance() : + getNonFatalDecoderInstance(); + + bytes = subarray(bytes, offset, offset + length); + try { + return decoder.decode(bytes); + } catch (e) { + if (parsingErrorsAreFatal && + !isFatalTextDecoderCachableAfterThrowing(decoder)) { + fatalDecoderInstance = undefined; + } + throw e; + } +} + +/** @const {boolean} */ +const useTextDecoderDecode = + USE_TEXT_ENCODING || typeof TextDecoder !== 'undefined'; + +/** + * A utf8 decoding routine either based upon TextDecoder if available or using + * our polyfill implementation + * @return {string} + */ +jspb.binary.utf8.decodeUtf8 = function ( + /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length, + /** boolean*/ parsingErrorsAreFatal) { + return useTextDecoderDecode ? 
+ jspb.binary.utf8.textDecoderDecodeUtf8(bytes, offset, length, parsingErrorsAreFatal) : + jspb.binary.utf8.polyfillDecodeUtf8(bytes, offset, length, parsingErrorsAreFatal); +} + +/** @type {!TextEncoder|undefined} */ +let textEncoderInstance; + +/** @return {!Uint8Array} */ +jspb.binary.utf8.textEncoderEncode = function ( + /** string */ s, /** boolean */ rejectUnpairedSurrogates) { + if (rejectUnpairedSurrogates) { + jspb.binary.utf8.checkWellFormed(s); + } + + if (!textEncoderInstance) { + textEncoderInstance = new TextEncoder(); + } + return textEncoderInstance.encode(s); +} + +// isWellFormed landed in major browsers in early 2023 so it will only be +// definitely available in 2024. See +// http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed +const /** boolean */ HAS_WELL_FORMED_METHOD = goog.FEATURESET_YEAR > 2023 || + typeof String.prototype.isWellFormed === 'function'; + +jspb.binary.utf8.checkWellFormed = function (/** string */ text) { + if (HAS_WELL_FORMED_METHOD ? + // Externs don't contain the definition of this function yet. + // http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed + !(/** @type{{isWellFormed:function():boolean}}*/ ( + /** @type {?} */ (text)) + .isWellFormed()) : + /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/ + .test(text)) { + throw new Error('Found an unpaired surrogate'); + } +} + + +/** @return {!Uint8Array} */ +jspb.binary.utf8.polyfillEncode = function ( + /** string */ s, /** boolean */ rejectUnpairedSurrogates) { + let bi = 0; + // The worst case is that every character requires 3 output bytes, so we + // allocate for this. This assumes that the buffer will be short lived. 
+ // Callers can always `slice` if needed + const buffer = new Uint8Array(3 * s.length); + for (let ci = 0; ci < s.length; ci++) { + let c = s.charCodeAt(ci); + if (c < 0x80) { + buffer[bi++] = c; + } else if (c < 0x800) { + buffer[bi++] = (c >> 6) | 0xC0; + buffer[bi++] = (c & 63) | 0x80; + } else { + jspb.asserts.assert(c < 65536); + // Look for surrogates + // First check if it is surrogate range + if (c >= MIN_SURROGATE && c <= MAX_SURROGATE) { + // is it a high surrogate? + if (c <= MAX_HIGH_SURROGATE && ci < s.length) { + const c2 = s.charCodeAt(++ci); + if (c2 >= MIN_LOW_SURROGATE && c2 <= MAX_LOW_SURROGATE) { + // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae + const codePoint = + (c - MIN_SURROGATE) * 0x400 + c2 - MIN_LOW_SURROGATE + 0x10000; + buffer[bi++] = (codePoint >> 18) | 0xF0; + buffer[bi++] = ((codePoint >> 12) & 63) | 0x80; + buffer[bi++] = ((codePoint >> 6) & 63) | 0x80; + buffer[bi++] = (codePoint & 63) | 0x80; + continue; + } else { + // else c2 not in low surrogate range, treat c as a lone surrogate + // and back up ci so we process c2 on the next loop as an + // independent character + ci--; + } + } // else c not a high surrogate + if (rejectUnpairedSurrogates) { + throw new Error('Found an unpaired surrogate'); + } + c = 0xFFFD; // Error! Unpaired surrogate + } + buffer[bi++] = (c >> 12) | 0xE0; + buffer[bi++] = ((c >> 6) & 63) | 0x80; + buffer[bi++] = (c & 63) | 0x80; + } + } + return subarray(buffer, 0, bi); +} + +/** @const {boolean} */ +const useTextEncoderEncode = + (USE_TEXT_ENCODING || typeof TextEncoder !== 'undefined'); + +/** + * A utf8 encoding routine either based upon TextEncoder if available or using + * our polyfill implementation + * @return {!Uint8Array} + */ +jspb.binary.utf8.encodeUtf8 = function ( + /**string*/ string, /** boolean=*/ rejectUnpairedSurrogates = false) { + jspb.asserts.assertString(string); + return useTextEncoderEncode ? 
+ jspb.binary.utf8.textEncoderEncode(string, rejectUnpairedSurrogates) : + jspb.binary.utf8.polyfillEncode(string, rejectUnpairedSurrogates); +} + diff --git a/generator/js_generator.cc b/generator/js_generator.cc index 84365dc..8136f58 100644 --- a/generator/js_generator.cc +++ b/generator/js_generator.cc @@ -1073,17 +1073,21 @@ std::string JSFieldTypeAnnotation(const GeneratorOptions& options, return jstype; } -std::string JSBinaryReaderMethodType(const FieldDescriptor* field) { +std::string JSBinaryMethodType(const FieldDescriptor* field, bool is_writer) { std::string name = field->type_name(); if (name[0] >= 'a' && name[0] <= 'z') { name[0] = (name[0] - 'a') + 'A'; } + if (!is_writer && field->type() == FieldDescriptor::TYPE_STRING && + field->file()->syntax() == FileDescriptor::SYNTAX_PROTO3) { + name = name + "RequireUtf8"; + } return IsIntegralFieldWithStringJSType(field) ? (name + "String") : name; } std::string JSBinaryReadWriteMethodName(const FieldDescriptor* field, bool is_writer) { - std::string name = JSBinaryReaderMethodType(field); + std::string name = JSBinaryMethodType(field, is_writer); if (field->is_packed()) { name = "Packed" + name; } else if (is_writer && field->is_repeated()) { @@ -3128,11 +3132,11 @@ void Generator::GenerateClassDeserializeBinaryField( printer->Print( " var values = /** @type {$fieldtype$} */ " "(reader.isDelimited() " - "? reader.readPacked$reader$() : [reader.read$reader$()]);\n", + "? 
reader.readPacked$reader$() : [reader.read$reader$()]);\n", "fieldtype", JSFieldTypeAnnotation(options, field, false, true, /* singular_if_not_packed */ false, BYTES_U8), - "reader", JSBinaryReaderMethodType(field)); + "reader", JSBinaryMethodType(field, /* is_writer=*/false)); } else { printer->Print( " var value = /** @type {$fieldtype$} */ " diff --git a/gulpfile.js b/gulpfile.js index e7f7511..426e0f1 100644 --- a/gulpfile.js +++ b/gulpfile.js @@ -145,6 +145,7 @@ function getClosureCompilerCommand(exportsFile, outputFile) { '--js=binary/decoder.js', '--js=binary/encoder.js', '--js=binary/reader.js', + '--js=binary/utf8.js', '--js=binary/utils.js', '--js=binary/writer.js', `--js=${exportsFile}`, @@ -194,7 +195,7 @@ function commonjs_out(cb) { function closure_make_deps(cb) { exec( - './node_modules/.bin/closure-make-deps --closure-path=. --file=node_modules/google-closure-library/closure/goog/deps.js binary/arith.js binary/constants.js binary/decoder.js binary/encoder.js binary/reader.js binary/utils.js binary/writer.js asserts.js debug.js map.js message.js node_loader.js test_bootstrap.js > deps.js', + './node_modules/.bin/closure-make-deps --closure-path=. 
--file=node_modules/google-closure-library/closure/goog/deps.js binary/arith.js binary/constants.js binary/decoder.js binary/encoder.js binary/reader.js binary/utf8.js binary/utils.js binary/writer.js asserts.js debug.js map.js message.js node_loader.js test_bootstrap.js > deps.js', make_exec_logging_callback(cb)); } From f7fee3dd5fb885292c55d23939946c8acc55579c Mon Sep 17 00:00:00 2001 From: Stephanie DiBenedetto Date: Mon, 17 Jun 2024 10:46:27 -0700 Subject: [PATCH 2/2] clarify enforceUtf8 argument Co-authored-by: Luke Sandberg --- binary/decoder_test.js | 8 ++++---- generator/js_generator.cc | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/binary/decoder_test.js b/binary/decoder_test.js index b8e1084..7a71c0d 100644 --- a/binary/decoder_test.js +++ b/binary/decoder_test.js @@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => { const decoder = jspb.BinaryDecoder.alloc(encoder.end()); - expect(decoder.readString(ascii.length, true)).toEqual(ascii); - expect(utf8_two_bytes).toEqual(decoder.readString(2, true)); + expect(decoder.readString(ascii.length, /* enforceUtf8= */ true)).toEqual(ascii); + expect(utf8_two_bytes).toEqual(decoder.readString(2, /* enforceUtf8= */ true)); expect(utf8_three_bytes) - .toEqual(decoder.readString(3, true)); - expect(utf8_four_bytes).toEqual(decoder.readString(4, true)); + .toEqual(decoder.readString(3, /* enforceUtf8= */ true)); + expect(utf8_four_bytes).toEqual(decoder.readString(4, /* enforceUtf8= */ true)); }); /** diff --git a/generator/js_generator.cc b/generator/js_generator.cc index 8136f58..6ca0ad3 100644 --- a/generator/js_generator.cc +++ b/generator/js_generator.cc @@ -1079,7 +1079,7 @@ std::string JSBinaryMethodType(const FieldDescriptor* field, bool is_writer) { name[0] = (name[0] - 'a') + 'A'; } if (!is_writer && field->type() == FieldDescriptor::TYPE_STRING && - field->file()->syntax() == FileDescriptor::SYNTAX_PROTO3) { + field->requires_utf8_validation()) { name = name + "RequireUtf8"; } 
return IsIntegralFieldWithStringJSType(field) ? (name + "String") : name;