diff --git a/binary/decoder.js b/binary/decoder.js
index 1186add..c44053f 100644
--- a/binary/decoder.js
+++ b/binary/decoder.js
@@ -47,7 +47,7 @@
goog.provide('jspb.BinaryDecoder');
goog.require('jspb.asserts');
-goog.require('goog.crypt');
+goog.require('jspb.binary.utf8');
goog.require('jspb.utils');
@@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) {
*/
jspb.BinaryDecoder.prototype.advance = function(count) {
this.cursor_ += count;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
};
@@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) {
return convert(lowBits, highBits);
};
+/**
+ * Fails via `jspb.asserts.fail` when the cursor has moved past `end_`.
+ *
+ * @private
+ * @return {void}
+ */
+jspb.BinaryDecoder.prototype.checkCursor = function () {
+  if (this.cursor_ > this.end_) {
+    jspb.asserts.fail('Read past the end ' + this.cursor_ + ' > ' + this.end_);
+  }
+};
/**
* Skips over a varint in the block without decoding it.
@@ -452,7 +463,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
var x = (temp & 0x7F);
if (temp < 128) {
this.cursor_ += 1;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return x;
}
@@ -460,7 +471,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
x |= (temp & 0x7F) << 7;
if (temp < 128) {
this.cursor_ += 2;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return x;
}
@@ -468,7 +479,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
x |= (temp & 0x7F) << 14;
if (temp < 128) {
this.cursor_ += 3;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return x;
}
@@ -476,7 +487,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
x |= (temp & 0x7F) << 21;
if (temp < 128) {
this.cursor_ += 4;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return x;
}
@@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
// We're reading the high bits of an unsigned varint. The byte we just read
// also contains bits 33 through 35, which we're going to discard.
this.cursor_ += 5;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return x >>> 0;
}
@@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
jspb.asserts.assert(false);
}
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return x;
};
@@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() {
jspb.BinaryDecoder.prototype.readUint8 = function() {
var a = this.bytes_[this.cursor_ + 0];
this.cursor_ += 1;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return a;
};
@@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() {
var a = this.bytes_[this.cursor_ + 0];
var b = this.bytes_[this.cursor_ + 1];
this.cursor_ += 2;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return (a << 0) | (b << 8);
};
@@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() {
var c = this.bytes_[this.cursor_ + 2];
var d = this.bytes_[this.cursor_ + 3];
this.cursor_ += 4;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0;
};
@@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() {
jspb.BinaryDecoder.prototype.readInt8 = function() {
var a = this.bytes_[this.cursor_ + 0];
this.cursor_ += 1;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return (a << 24) >> 24;
};
@@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() {
var a = this.bytes_[this.cursor_ + 0];
var b = this.bytes_[this.cursor_ + 1];
this.cursor_ += 2;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return (((a << 0) | (b << 8)) << 16) >> 16;
};
@@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() {
var c = this.bytes_[this.cursor_ + 2];
var d = this.bytes_[this.cursor_ + 3];
this.cursor_ += 4;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return (a << 0) | (b << 8) | (c << 16) | (d << 24);
};
@@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() {
* @export
*/
jspb.BinaryDecoder.prototype.readBool = function() {
- return !!this.bytes_[this.cursor_++];
+ const b = !!this.bytes_[this.cursor_++];
+ this.checkCursor();
+ return b;
};
@@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
* Supports codepoints from U+0000 up to U+10FFFF.
* (http://en.wikipedia.org/wiki/UTF-8).
* @param {number} length The length of the string to read.
+ * @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found.
* @return {string} The decoded string.
* @export
*/
-jspb.BinaryDecoder.prototype.readString = function(length) {
- var bytes = this.bytes_;
- var cursor = this.cursor_;
- var end = cursor + length;
- var codeUnits = [];
-
- var result = '';
- while (cursor < end) {
- var c = bytes[cursor++];
- if (c < 128) { // Regular 7-bit ASCII.
- codeUnits.push(c);
- } else if (c < 192) {
- // UTF-8 continuation mark. We are out of sync. This
- // might happen if we attempted to read a character
- // with more than four bytes.
- continue;
- } else if (c < 224) { // UTF-8 with two bytes.
- var c2 = bytes[cursor++];
- codeUnits.push(((c & 31) << 6) | (c2 & 63));
- } else if (c < 240) { // UTF-8 with three bytes.
- var c2 = bytes[cursor++];
- var c3 = bytes[cursor++];
- codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
- } else if (c < 248) { // UTF-8 with 4 bytes.
- var c2 = bytes[cursor++];
- var c3 = bytes[cursor++];
- var c4 = bytes[cursor++];
- // Characters written on 4 bytes have 21 bits for a codepoint.
- // We can't fit that on 16bit characters, so we use surrogates.
- var codepoint =
- ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
- // Surrogates formula from wikipedia.
- // 1. Subtract 0x10000 from codepoint
- codepoint -= 0x10000;
- // 2. Split this into the high 10-bit value and the low 10-bit value
- // 3. Add 0xD800 to the high value to form the high surrogate
- // 4. Add 0xDC00 to the low value to form the low surrogate:
- var low = (codepoint & 1023) + 0xDC00;
- var high = ((codepoint >> 10) & 1023) + 0xD800;
- codeUnits.push(high, low);
- }
- // Avoid exceeding the maximum stack size when calling `apply`.
- if (codeUnits.length >= 8192) {
- result += String.fromCharCode.apply(null, codeUnits);
- codeUnits.length = 0;
- }
- }
- result += goog.crypt.byteArrayToString(codeUnits);
- this.cursor_ = cursor;
+jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) {
+  const start = this.cursor_;
+  this.cursor_ = start + length;
+  this.checkCursor();  // reject an out-of-bounds length before decoding
+  const result = jspb.binary.utf8.decodeUtf8(
+      jspb.asserts.assert(this.bytes_), start, length, requireUtf8);
return result;
};
@@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) {
var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length);
this.cursor_ += length;
- jspb.asserts.assert(this.cursor_ <= this.end_);
+ this.checkCursor();
return result;
};
diff --git a/binary/decoder_test.js b/binary/decoder_test.js
index 77f6877..7a71c0d 100644
--- a/binary/decoder_test.js
+++ b/binary/decoder_test.js
@@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => {
const decoder = jspb.BinaryDecoder.alloc(encoder.end());
- expect(decoder.readString(len)).toEqual(long_string);
+ expect(decoder.readString(len, true)).toEqual(long_string);
});
/**
@@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => {
const decoder = jspb.BinaryDecoder.alloc(encoder.end());
- expect(decoder.readString(ascii.length)).toEqual(ascii);
- expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length));
+ expect(decoder.readString(ascii.length, /* enforceUtf8= */ true)).toEqual(ascii);
+ expect(utf8_two_bytes).toEqual(decoder.readString(2, /* enforceUtf8= */ true));
expect(utf8_three_bytes)
- .toEqual(decoder.readString(utf8_three_bytes.length));
- expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length));
+ .toEqual(decoder.readString(3, /* enforceUtf8= */ true));
+ expect(utf8_four_bytes).toEqual(decoder.readString(4, /* enforceUtf8= */ true));
});
/**
diff --git a/binary/reader.js b/binary/reader.js
index 7be3b58..0f8c961 100644
--- a/binary/reader.js
+++ b/binary/reader.js
@@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants');
goog.require('jspb.BinaryDecoder');
goog.require('jspb.utils');
+/**
+ * Whether to enforce that string fields are valid utf8.
+ *
+ * Currently set to `ALWAYS`, can be set to `DEPRECATED_PROTO3_ONLY` to only
+ * enforce utf8 for proto3 string fields, for proto2 string fields it will use
+ * replacement characters when encoding errors are found.
+ *
+ * TODO: Remove the flag, simplify BinaryReader to remove
+ * readStringRequireUtf8 and related support in the code generator et al.
+ *
+ * @define {string}
+ */
+const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS');
+
+// Constrain the set of values to only these two.
+jspb.asserts.assert(
+ ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY' || ENFORCE_UTF8 === 'ALWAYS');
+
+const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS';
+
/**
@@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() {
* @export
*/
jspb.BinaryReader.prototype.readString = function() {
+ // delegate to the other reader so that inlining can eliminate this method
+ // in the common case.
+ if (UTF8_PARSING_ERRORS_ARE_FATAL) {
+ return this.readStringRequireUtf8();
+ }
+
jspb.asserts.assert(
this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
var length = this.decoder_.readUnsignedVarint32();
- return this.decoder_.readString(length);
+ return this.decoder_.readString(length, /*requireUtf8=*/ false);
+};
+
+/**
+ * Reads a length-delimited string field and decodes it as strict utf8.
+ * Asserts that the pending field is DELIMITED; the decode is requested with
+ * requireUtf8 set, so malformed utf8 throws instead of being replaced.
+ *
+ * @return {string} The value of the string field.
+ */
+jspb.BinaryReader.prototype.readStringRequireUtf8 = function () {
+  jspb.asserts.assert(this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
+  const byteLength = this.decoder_.readUnsignedVarint32();
+  return this.decoder_.readString(byteLength, /*requireUtf8=*/ true);
};
diff --git a/binary/utf8.js b/binary/utf8.js
new file mode 100644
index 0000000..b77602b
--- /dev/null
+++ b/binary/utf8.js
@@ -0,0 +1,426 @@
+/**
+ * @fileoverview UTF8 encoding and decoding routines
+ */
+goog.provide('jspb.binary.utf8');
+
+goog.require('jspb.asserts');
+
+
+/**
+ * Whether to use the browser based `TextEncoder` and `TextDecoder` APIs for
+ * handling utf8.
+ *
+ * Enabled by default for `goog.FEATURESET_YEAR >= 2020`. The code also
+ * performs feature detection for this API and will always use it if available,
+ * this variable enables us to not ship the polyfill.
+ *
+ * See http://go/jscompiler-flags#browser-featureset-year-options for the
+ * behavior here.
+ *
+ * @define {boolean}
+ */
+const USE_TEXT_ENCODING =
+ goog.define('jspb.binary.USE_TEXTENCODING', goog.FEATURESET_YEAR >= 2020);
+
+const /** number */ MIN_SURROGATE = 0xD800; // lowest UTF-16 surrogate code unit
+const /** number */ MIN_HIGH_SURROGATE = MIN_SURROGATE; // high range starts the block
+const /** number */ MAX_HIGH_SURROGATE = 0xDBFF; // last high (leading) surrogate
+const /** number */ MIN_LOW_SURROGATE = 0xDC00; // first low (trailing) surrogate
+const /** number */ MAX_LOW_SURROGATE = 0xDFFF; // last low (trailing) surrogate
+const /** number */ MAX_SURROGATE = MAX_LOW_SURROGATE; // low range ends the block
+
+/**
+ * Tests whether a byte fails to match the continuation form '10XXXXXX'.
+ * @return {boolean} True when `byte` is not a continuation byte.
+ */
+function isNotTrailingByte(/** number */ byte) {
+  // Keep only the top two bits (mask 0xC0 is '11000000'); a continuation
+  // byte has exactly '10' there, i.e. 0x80 ('10000000').
+  const topTwoBits = byte & 0xC0;
+  return topTwoBits !== 0x80;
+}
+
+
+/** Either throws (when parsing errors are fatal) or appends U+FFFD to
+ * `codeUnits` in place of the invalid utf8 sequence. */
+function invalid(
+    /** boolean */ parsingErrorsAreFatal, /** !Array */ codeUnits) {
+  if (!parsingErrorsAreFatal) {
+    codeUnits.push(0xFFFD);  // utf8 replacement character
+    return;
+  }
+  throw new Error('Invalid UTF8');
+}
+
+/** @return {string} `accum` (if any) followed by the given code units. */
+function codeUnitsToString(
+    /** string? */ accum, /** !Array */ utf16CodeUnits) {
+  const tail = String.fromCharCode.apply(null, utf16CodeUnits);
+  return accum != null ? accum + tail : tail;
+}
+
+/**
+ * Our handwritten UTF8 decoder; invalid input either throws or becomes U+FFFD.
+ *
+ * https://en.wikipedia.org/wiki/UTF-8#Encoding describes the bit layout
+ *
+ * https://en.wikipedia.org/wiki/UTF-8#Invalid_sequences_and_error_handling
+ * describes important cases to check for which are namely:
+ * - overlong encodings, meaning a value expressible in N bytes could have been
+ * expressed in fewer bytes
+ * - invalid bytes, meaning bytes that are generally out of range
+ * - surrogate codepoints, utf8 never encodes directly a utf16 surrogate value
+ * - underflow where there aren't enough bytes for the sequence we are parsing
+ * - out of range codepoints.
+ *
+ * @return {string}
+ */
+jspb.binary.utf8.polyfillDecodeUtf8 = function (
+ /** !Uint8Array */ bytes, /** number */ offset, /** number */ length,
+ /** boolean */ parsingErrorsAreFatal) {
+ let cursor = offset;
+ const end = cursor + length;
+ const codeUnits = [];
+ let result = null;
+
+ // This is significantly slower than the TextDecoder implementation.
+ // Ideas for improving performance:
+ // 1. Reduce branching with non-short-circuiting operators, e.g.
+ // https://stackoverflow.com/q/5652363
+ // 2. improve isNotTrailingByte using xor?
+ // 3. consider having a dedicated ascii loop (java impls do this)
+ let c1, c2, c3, c4;
+ while (cursor < end) {
+ c1 = bytes[cursor++];
+ if (c1 < 0x80) { // Regular 7-bit ASCII.
+ codeUnits.push(c1);
+ } else if (c1 < 0xE0) { // UTF-8 with two bytes.
+ if (cursor >= end) {
+ invalid(parsingErrorsAreFatal, codeUnits);
+ } else {
+ c2 = bytes[cursor++];
+ // Make sure that c1 is a valid leading byte and c2 is a valid
+ // trailing byte
+ // 0xC2 is '11000010', if c1 is less than this then we have an overlong
+ // encoding because there would only be 7 significant bits.
+ if (c1 < 0xC2 || isNotTrailingByte(c2)) {
+ cursor--; // push c2 back since it isn't 'accepted'
+ invalid(parsingErrorsAreFatal, codeUnits);
+ } else {
+ // The codeUnit is the lower 6 bits from c2 and the lower 5 bits from
+ // c1
+ const codeUnit = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
+ // Consistency check that the computed code is in range for a 2 byte
+ // sequence.
+ jspb.asserts.assert(codeUnit >= 0x80 && codeUnit <= 0x07FF);
+ codeUnits.push(codeUnit);
+ }
+ }
+ } else if (c1 < 0xF0) { // UTF-8 with three bytes.
+ if (cursor >= end - 1) {
+ invalid(parsingErrorsAreFatal, codeUnits);
+ } else {
+ c2 = bytes[cursor++];
+ if (isNotTrailingByte(c2) ||
+ // These checks were taken from
+ // java/com/google/protobuf/Utf8.java
+ // overlong? 5 most significant bits must not all be zero
+ (c1 === 0xE0 && c2 < 0xA0)
+ // check for illegal surrogate codepoints
+ || (c1 === 0xED && c2 >= 0xA0) ||
+ // We delay reading c3 until now so that an error in c2 or c1 will
+ // preserve c3 for the next loop iteration
+ isNotTrailingByte(c3 = bytes[cursor++])) {
+ cursor--; // push back c2 or c3, depending on how far we made it
+ invalid(parsingErrorsAreFatal, codeUnits);
+ } else {
+ // 4 bits from the first byte
+ // 6 bits from each of the two lower bytes
+ // == 16 bits total
+ const codeUnit =
+ ((c1 & 0xF) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
+ // Consistency check, this is the valid range for a 3 byte character
+ jspb.asserts.assert(codeUnit >= 0x800 && codeUnit <= 0xFFFF);
+ // And that Utf16 surrogates are disallowed
+ jspb.asserts.assert(codeUnit < MIN_SURROGATE || codeUnit > MAX_SURROGATE);
+ codeUnits.push(codeUnit);
+ }
+ }
+ } else if (c1 <= 0xF4) { // UTF-8 with 4 bytes.
+ // Leading bytes below 0xF8 match the bit pattern for utf8 with 4 bytes, but
+ // those > 0xF4 are either overlong encodings or exceed the valid range.
+ if (cursor >= end - 2) {
+ invalid(parsingErrorsAreFatal, codeUnits);
+ } else {
+ c2 = bytes[cursor++];
+ if (isNotTrailingByte(c2) ||
+ // This check was inspired by
+ // java/com/google/protobuf/Utf8.java
+ // Tricky optimized form of:
+ // valid 4-byte leading byte?
+ // if (byte1 > (byte) 0xF4 ||
+ // overlong? 4 most significant bits must not all be zero
+ // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
+ // codepoint larger than the highest code point (U+10FFFF)?
+ // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
+ (((c1 << 28) + (c2 - 0x90)) >> 30) !== 0 ||
+ // We delay reading c3 and c4 until now so that an error in c2 or c1
+ // will preserve them for the next loop iteration.
+ isNotTrailingByte(c3 = bytes[cursor++]) ||
+ isNotTrailingByte(c4 = bytes[cursor++])) {
+ cursor--; // push back c2, c3 or c4 depending on how far we made it
+ invalid(parsingErrorsAreFatal, codeUnits);
+ } else {
+ // Characters written on 4 bytes have 21 bits for a codepoint.
+ // We can't fit that on 16bit characters, so we use surrogates.
+ // 3 bits from the uppermost byte, 6 bits from each of the lower 3
+ // bytes. This is 21 bits which is too big for a 16 bit utf16 code
+ // unit so we use surrogates.
+ let codepoint = ((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12) |
+ ((c3 & 0x3F) << 6) | (c4 & 0x3F);
+ // Consistency check, this is the valid range for a 4 byte character.
+ jspb.asserts.assert(codepoint >= 0x10000 && codepoint <= 0x10FFFF);
+ // Surrogates formula from wikipedia.
+ // 1. Subtract 0x10000 from codepoint
+ codepoint -= 0x10000;
+ // 2. Split this into the high 10-bit value and the low 10-bit value
+ // 3. Add 0xD800 to the high value to form the high surrogate
+ // 4. Add 0xDC00 to the low value to form the low surrogate:
+ const low = (codepoint & 0x3FF) + MIN_LOW_SURROGATE;
+ const high = ((codepoint >> 10) & 0x3FF) + MIN_HIGH_SURROGATE;
+ codeUnits.push(high, low);
+ }
+ }
+ } else {
+ // initial byte is too large for utf8
+ invalid(parsingErrorsAreFatal, codeUnits);
+ }
+ // Accumulate as we go to avoid exceeding the maximum stack size when
+ // calling `apply`.
+ if (codeUnits.length >= 8192) {
+ result = codeUnitsToString(result, codeUnits);
+ codeUnits.length = 0;
+ }
+ }
+ // ensure we don't overflow or underflow
+ jspb.asserts.assert(cursor === end, `expected ${cursor} === ${end}`);
+ return codeUnitsToString(result, codeUnits);
+}
+
+
+/** @type {boolean|undefined} */
+let isFatalTextDecoderCachableAfterThrowing_ =
+    // chrome version >= 2020 are not subject to https://crbug.com/910292
+    goog.FEATURESET_YEAR >= 2020 ? true : undefined;
+
+/** @return {boolean} */
+function isFatalTextDecoderCachableAfterThrowing(/** !TextDecoder */ decoder) {
+  // Probe only once; later calls reuse the cached answer.
+  if (isFatalTextDecoderCachableAfterThrowing_ !== undefined) {
+    return isFatalTextDecoderCachableAfterThrowing_;
+  }
+  // Detect https://crbug.com/910292: in affected chrome versions one failed
+  // decode makes every later decode throw. Being called after a failed decode
+  // is not enough to detect it, because the buggy versions only corrupt their
+  // state for some error positions; a lonely continuation byte triggered the
+  // bug consistently in manual testing.
+  try {
+    decoder.decode(new Uint8Array([0x80]));
+  } catch (e) {
+    // expected
+  }
+  let stillUsable;
+  try {
+    decoder.decode(new Uint8Array([0x61]));  // 'a' in hex
+    stillUsable = true;
+  } catch (e) {
+    // A valid decode threw, so this chrome version is buggy and the cached
+    // decoder must be flushed when failures occur.
+    stillUsable = false;
+  }
+  isFatalTextDecoderCachableAfterThrowing_ = stillUsable;
+  return stillUsable;
+}
+
+/** @type {!TextDecoder|undefined} */
+let fatalDecoderInstance;
+
+/** @return {!TextDecoder} The shared decoder created with `fatal: true`. */
+function getFatalDecoderInstance() {
+  const cached = fatalDecoderInstance;
+  if (cached) return cached;
+  const created = new TextDecoder('utf-8', { fatal: true });
+  fatalDecoderInstance = created;
+  return created;
+}
+
+/** @type {!TextDecoder|undefined} */
+let nonFatalDecoderInstance;
+
+/** @return {!TextDecoder} The shared decoder created with `fatal: false`. */
+function getNonFatalDecoderInstance() {
+  // Construct the decoder on first use and hand out the same one afterwards.
+  const cached = nonFatalDecoderInstance;
+  if (cached) return cached;
+  const created = new TextDecoder('utf-8', { fatal: false });
+  nonFatalDecoderInstance = created;
+  return created;
+}
+
+/**
+ * Returns the `[offset, end)` view of `bytes`, reusing `bytes` itself when the
+ * range already spans the whole array.
+ * `subarray` tends to be surprisingly slow, so the call is skipped if possible.
+ * @return {!Uint8Array}
+ */
+function subarray(
+    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ end) {
+  const spansWholeArray = offset === 0 && end === bytes.length;
+  return spansWholeArray ? bytes : bytes.subarray(offset, end);
+}
+
+/**
+ * Decodes the given byte range with a cached TextDecoder.
+ * @return {string}
+ */
+jspb.binary.utf8.textDecoderDecodeUtf8 = function (
+    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length,
+    /** boolean*/ parsingErrorsAreFatal) {
+  const /** !TextDecoder */ decoder = parsingErrorsAreFatal ?
+      getFatalDecoderInstance() : getNonFatalDecoderInstance();
+  const view = subarray(bytes, offset, offset + length);
+  try {
+    return decoder.decode(view);
+  } catch (e) {
+    // Drop the cached fatal decoder if a failure leaves it unusable.
+    if (parsingErrorsAreFatal &&
+        !isFatalTextDecoderCachableAfterThrowing(decoder)) {
+      fatalDecoderInstance = undefined;
+    }
+    throw e;
+  }
+};
+
+/** @const {boolean} */
+const useTextDecoderDecode =
+    USE_TEXT_ENCODING || typeof TextDecoder !== 'undefined';
+
+/**
+ * Decodes utf8 with TextDecoder when it is usable, else with our polyfill.
+ * @return {string}
+ */
+jspb.binary.utf8.decodeUtf8 = function (
+    /** !Uint8Array*/ bytes, /** number */ offset, /** number */ length,
+    /** boolean*/ parsingErrorsAreFatal) {
+  if (useTextDecoderDecode) {
+    return jspb.binary.utf8.textDecoderDecodeUtf8(bytes, offset, length, parsingErrorsAreFatal);
+  }
+  return jspb.binary.utf8.polyfillDecodeUtf8(bytes, offset, length, parsingErrorsAreFatal);
+};
+
+/** @type {!TextEncoder|undefined} */
+let textEncoderInstance;
+
+/** @return {!Uint8Array} The utf8 encoding of `s`, via a cached TextEncoder. */
+jspb.binary.utf8.textEncoderEncode = function (
+    /** string */ s, /** boolean */ rejectUnpairedSurrogates) {
+  if (rejectUnpairedSurrogates) {
+    jspb.binary.utf8.checkWellFormed(s);  // throws on an unpaired surrogate
+  }
+  // Create the encoder lazily and reuse it for later calls.
+  if (!textEncoderInstance) {
+    textEncoderInstance = new TextEncoder();
+  }
+  return textEncoderInstance.encode(s);
+};
+
+// isWellFormed landed in major browsers in early 2023 so it will only be
+// definitely available in 2024. See
+// http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed
+const /** boolean */ HAS_WELL_FORMED_METHOD = goog.FEATURESET_YEAR > 2023 ||
+ typeof String.prototype.isWellFormed === 'function';
+/** Throws an Error if `text` contains an unpaired surrogate. */
+jspb.binary.utf8.checkWellFormed = function (/** string */ text) {
+ if (HAS_WELL_FORMED_METHOD ?
+ // Externs don't contain the definition of this function yet.
+ // http://go/mdn/JavaScript/Reference/Global_Objects/String/isWellFormed
+ !(/** @type{{isWellFormed:function():boolean}}*/ (
+ /** @type {?} */ (text))
+ .isWellFormed()) :
+ /(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/
+ .test(text)) {
+ throw new Error('Found an unpaired surrogate');
+ }
+}
+
+
+/**
+ * Handwritten utf8 encoder for environments where TextEncoder is not used.
+ * @return {!Uint8Array}
+ */
+jspb.binary.utf8.polyfillEncode = function (
+    /** string */ s, /** boolean */ rejectUnpairedSurrogates) {
+  let bi = 0;
+  // The worst case is that every character requires 3 output bytes, so we
+  // allocate for this. This assumes that the buffer will be short lived.
+  // Callers can always `slice` if needed
+  const buffer = new Uint8Array(3 * s.length);
+  for (let ci = 0; ci < s.length; ci++) {
+    let c = s.charCodeAt(ci);
+    if (c < 0x80) {
+      buffer[bi++] = c;
+    } else if (c < 0x800) {
+      buffer[bi++] = (c >> 6) | 0xC0;
+      buffer[bi++] = (c & 63) | 0x80;
+    } else {
+      jspb.asserts.assert(c < 65536);
+      // Look for surrogates
+      // First check if it is surrogate range
+      if (c >= MIN_SURROGATE && c <= MAX_SURROGATE) {
+        // Is it a high surrogate with a following code unit to pair with?
+        if (c <= MAX_HIGH_SURROGATE && ci + 1 < s.length) {
+          const c2 = s.charCodeAt(ci + 1);
+          if (c2 >= MIN_LOW_SURROGATE && c2 <= MAX_LOW_SURROGATE) {
+            // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
+            const codePoint =
+                (c - MIN_SURROGATE) * 0x400 + c2 - MIN_LOW_SURROGATE + 0x10000;
+            buffer[bi++] = (codePoint >> 18) | 0xF0;
+            buffer[bi++] = ((codePoint >> 12) & 63) | 0x80;
+            buffer[bi++] = ((codePoint >> 6) & 63) | 0x80;
+            buffer[bi++] = (codePoint & 63) | 0x80;
+            ci++;  // the low surrogate has been consumed as well
+            continue;
+          }
+          // c2 is not a low surrogate: c is unpaired, c2 is handled next loop
+        }
+        if (rejectUnpairedSurrogates) {
+          throw new Error('Found an unpaired surrogate');
+        }
+        c = 0xFFFD;  // Error! Unpaired surrogate
+      }
+      buffer[bi++] = (c >> 12) | 0xE0;
+      buffer[bi++] = ((c >> 6) & 63) | 0x80;
+      buffer[bi++] = (c & 63) | 0x80;
+    }
+  }
+  return subarray(buffer, 0, bi);
+};
+
+/** @const {boolean} */
+const useTextEncoderEncode =
+    (USE_TEXT_ENCODING || typeof TextEncoder !== 'undefined');
+
+/**
+ * Encodes utf8 with TextEncoder when it is usable, else with our polyfill.
+ * @return {!Uint8Array}
+ */
+jspb.binary.utf8.encodeUtf8 = function (
+    /**string*/ string, /** boolean=*/ rejectUnpairedSurrogates = false) {
+  jspb.asserts.assertString(string);
+  if (useTextEncoderEncode) {
+    return jspb.binary.utf8.textEncoderEncode(string, rejectUnpairedSurrogates);
+  }
+  return jspb.binary.utf8.polyfillEncode(string, rejectUnpairedSurrogates);
+};
+
diff --git a/generator/js_generator.cc b/generator/js_generator.cc
index 84365dc..6ca0ad3 100644
--- a/generator/js_generator.cc
+++ b/generator/js_generator.cc
@@ -1073,17 +1073,21 @@ std::string JSFieldTypeAnnotation(const GeneratorOptions& options,
return jstype;
}
-std::string JSBinaryReaderMethodType(const FieldDescriptor* field) {
+std::string JSBinaryMethodType(const FieldDescriptor* field, bool is_writer) {
std::string name = field->type_name();
if (name[0] >= 'a' && name[0] <= 'z') {
name[0] = (name[0] - 'a') + 'A';
}
+ if (!is_writer && field->type() == FieldDescriptor::TYPE_STRING &&
+ field->requires_utf8_validation()) {
+ name = name + "RequireUtf8";
+ }
return IsIntegralFieldWithStringJSType(field) ? (name + "String") : name;
}
std::string JSBinaryReadWriteMethodName(const FieldDescriptor* field,
bool is_writer) {
- std::string name = JSBinaryReaderMethodType(field);
+ std::string name = JSBinaryMethodType(field, is_writer);
if (field->is_packed()) {
name = "Packed" + name;
} else if (is_writer && field->is_repeated()) {
@@ -3128,11 +3132,13 @@ void Generator::GenerateClassDeserializeBinaryField(
+      // Packable fields may be encoded packed or unpacked: read the packed form
+      // when the wire data is delimited, otherwise a single unpacked value.
printer->Print(
" var values = /** @type {$fieldtype$} */ "
"(reader.isDelimited() "
           "? reader.readPacked$reader$() : [reader.read$reader$()]);\n",
"fieldtype",
JSFieldTypeAnnotation(options, field, false, true,
/* singular_if_not_packed */ false, BYTES_U8),
- "reader", JSBinaryReaderMethodType(field));
+          "reader", JSBinaryMethodType(field, /* is_writer=*/false));
} else {
printer->Print(
" var value = /** @type {$fieldtype$} */ "
diff --git a/gulpfile.js b/gulpfile.js
index e7f7511..426e0f1 100644
--- a/gulpfile.js
+++ b/gulpfile.js
@@ -145,6 +145,7 @@ function getClosureCompilerCommand(exportsFile, outputFile) {
'--js=binary/decoder.js',
'--js=binary/encoder.js',
'--js=binary/reader.js',
+ '--js=binary/utf8.js',
'--js=binary/utils.js',
'--js=binary/writer.js',
`--js=${exportsFile}`,
@@ -194,7 +195,7 @@ function commonjs_out(cb) {
function closure_make_deps(cb) {
exec(
- './node_modules/.bin/closure-make-deps --closure-path=. --file=node_modules/google-closure-library/closure/goog/deps.js binary/arith.js binary/constants.js binary/decoder.js binary/encoder.js binary/reader.js binary/utils.js binary/writer.js asserts.js debug.js map.js message.js node_loader.js test_bootstrap.js > deps.js',
+ './node_modules/.bin/closure-make-deps --closure-path=. --file=node_modules/google-closure-library/closure/goog/deps.js binary/arith.js binary/constants.js binary/decoder.js binary/encoder.js binary/reader.js binary/utf8.js binary/utils.js binary/writer.js asserts.js debug.js map.js message.js node_loader.js test_bootstrap.js > deps.js',
make_exec_logging_callback(cb));
}