Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port fix: Fix JSPB binary utf8 decoding and validate by default (breaking change) #191

Merged
merged 2 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 36 additions & 65 deletions binary/decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
goog.provide('jspb.BinaryDecoder');

goog.require('jspb.asserts');
goog.require('goog.crypt');
goog.require('jspb.binary.utf8');
goog.require('jspb.utils');


Expand Down Expand Up @@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) {
*/
jspb.BinaryDecoder.prototype.advance = function(count) {
this.cursor_ += count;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
};


Expand Down Expand Up @@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) {
return convert(lowBits, highBits);
};

/**
* Asserts that our cursor is in bounds.
*
* @private
* @return {void}
*/
jspb.BinaryDecoder.prototype.checkCursor = function () {
if (this.cursor_ > this.end_) {
asserts.fail('Read past the end ' + this.cursor_ + ' > ' + this.end_);
}
}

/**
* Skips over a varint in the block without decoding it.
Expand Down Expand Up @@ -452,31 +463,31 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
var x = (temp & 0x7F);
if (temp < 128) {
this.cursor_ += 1;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

temp = bytes[this.cursor_ + 1];
x |= (temp & 0x7F) << 7;
if (temp < 128) {
this.cursor_ += 2;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

temp = bytes[this.cursor_ + 2];
x |= (temp & 0x7F) << 14;
if (temp < 128) {
this.cursor_ += 3;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

temp = bytes[this.cursor_ + 3];
x |= (temp & 0x7F) << 21;
if (temp < 128) {
this.cursor_ += 4;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

Expand All @@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
// We're reading the high bits of an unsigned varint. The byte we just read
// also contains bits 33 through 35, which we're going to discard.
this.cursor_ += 5;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x >>> 0;
}

Expand All @@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
jspb.asserts.assert(false);
}

jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
};

Expand Down Expand Up @@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() {
jspb.BinaryDecoder.prototype.readUint8 = function() {
var a = this.bytes_[this.cursor_ + 0];
this.cursor_ += 1;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return a;
};

Expand All @@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() {
var a = this.bytes_[this.cursor_ + 0];
var b = this.bytes_[this.cursor_ + 1];
this.cursor_ += 2;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (a << 0) | (b << 8);
};

Expand All @@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() {
var c = this.bytes_[this.cursor_ + 2];
var d = this.bytes_[this.cursor_ + 3];
this.cursor_ += 4;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0;
};

Expand Down Expand Up @@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() {
jspb.BinaryDecoder.prototype.readInt8 = function() {
var a = this.bytes_[this.cursor_ + 0];
this.cursor_ += 1;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (a << 24) >> 24;
};

Expand All @@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() {
var a = this.bytes_[this.cursor_ + 0];
var b = this.bytes_[this.cursor_ + 1];
this.cursor_ += 2;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (((a << 0) | (b << 8)) << 16) >> 16;
};

Expand All @@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() {
var c = this.bytes_[this.cursor_ + 2];
var d = this.bytes_[this.cursor_ + 3];
this.cursor_ += 4;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (a << 0) | (b << 8) | (c << 16) | (d << 24);
};

Expand Down Expand Up @@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() {
* @export
*/
jspb.BinaryDecoder.prototype.readBool = function() {
return !!this.bytes_[this.cursor_++];
const b = !!this.bytes_[this.cursor_++];
this.checkCursor();
return b;
};


Expand All @@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
* Supports codepoints from U+0000 up to U+10FFFF.
* (http://en.wikipedia.org/wiki/UTF-8).
* @param {number} length The length of the string to read.
* @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found.
* @return {string} The decoded string.
* @export
*/
jspb.BinaryDecoder.prototype.readString = function(length) {
var bytes = this.bytes_;
var cursor = this.cursor_;
var end = cursor + length;
var codeUnits = [];

var result = '';
while (cursor < end) {
var c = bytes[cursor++];
if (c < 128) { // Regular 7-bit ASCII.
codeUnits.push(c);
} else if (c < 192) {
// UTF-8 continuation mark. We are out of sync. This
// might happen if we attempted to read a character
// with more than four bytes.
continue;
} else if (c < 224) { // UTF-8 with two bytes.
var c2 = bytes[cursor++];
codeUnits.push(((c & 31) << 6) | (c2 & 63));
} else if (c < 240) { // UTF-8 with three bytes.
var c2 = bytes[cursor++];
var c3 = bytes[cursor++];
codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
} else if (c < 248) { // UTF-8 with 4 bytes.
var c2 = bytes[cursor++];
var c3 = bytes[cursor++];
var c4 = bytes[cursor++];
// Characters written on 4 bytes have 21 bits for a codepoint.
// We can't fit that on 16bit characters, so we use surrogates.
var codepoint =
((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
// Surrogates formula from wikipedia.
// 1. Subtract 0x10000 from codepoint
codepoint -= 0x10000;
// 2. Split this into the high 10-bit value and the low 10-bit value
// 3. Add 0xD800 to the high value to form the high surrogate
// 4. Add 0xDC00 to the low value to form the low surrogate:
var low = (codepoint & 1023) + 0xDC00;
var high = ((codepoint >> 10) & 1023) + 0xD800;
codeUnits.push(high, low);
}

// Avoid exceeding the maximum stack size when calling `apply`.
if (codeUnits.length >= 8192) {
result += String.fromCharCode.apply(null, codeUnits);
codeUnits.length = 0;
}
}
result += goog.crypt.byteArrayToString(codeUnits);
this.cursor_ = cursor;
jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) {
const cursor = this.cursor_;
this.cursor_ += length;
this.checkCursor();
const result =
jspb.binary.utf8.decodeUtf8(jspb.asserts.assert(this.bytes_), cursor, length, requireUtf8);
return result;
};

Expand Down Expand Up @@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) {
var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length);

this.cursor_ += length;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return result;
};

Expand Down
10 changes: 5 additions & 5 deletions binary/decoder_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => {

const decoder = jspb.BinaryDecoder.alloc(encoder.end());

expect(decoder.readString(len)).toEqual(long_string);
expect(decoder.readString(len, true)).toEqual(long_string);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
expect(decoder.readString(len, true)).toEqual(long_string);
expect(decoder.readString(len, /* enforceUtf8= */ true)).toEqual(long_string);

});

/**
Expand All @@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => {

const decoder = jspb.BinaryDecoder.alloc(encoder.end());

expect(decoder.readString(ascii.length)).toEqual(ascii);
expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length));
expect(decoder.readString(ascii.length, /* enforceUtf8= */ true)).toEqual(ascii);
expect(utf8_two_bytes).toEqual(decoder.readString(2, /* enforceUtf8= */ true));
expect(utf8_three_bytes)
.toEqual(decoder.readString(utf8_three_bytes.length));
expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length));
.toEqual(decoder.readString(3, /* enforceUtf8= */ true));
expect(utf8_four_bytes).toEqual(decoder.readString(4, /* enforceUtf8= */ true));
});

/**
Expand Down
41 changes: 40 additions & 1 deletion binary/reader.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants');
goog.require('jspb.BinaryDecoder');
goog.require('jspb.utils');

/**
* Whether to enforce that string fields are valid utf8.
*
* <p>Currently set to `ALWAYS`, can be set to `DEPRECATED_PROTO3_ONLY` to only
dibenede marked this conversation as resolved.
Show resolved Hide resolved
* enforce utf8 for proto3 string fields, for proto2 string fields it will use
* replacement characters when encoding errors are found.
*
* <p>TODO: Remove the flag, simplify BinaryReader to remove
* readStringRequireUtf8 and related support in the code generator et. al.
*
* @define {string}
*/
const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS');

// Constrain the set of values to only these two.
jspb.asserts.assert(
ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY' || ENFORCE_UTF8 === 'ALWAYS');

const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS';



/**
Expand Down Expand Up @@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() {
* @export
*/
jspb.BinaryReader.prototype.readString = function() {
// delegate to the other reader so that inlining can eliminate this method
// in the common case.
if (UTF8_PARSING_ERRORS_ARE_FATAL) {
return this.readStringRequireUtf8();
}

jspb.asserts.assert(
this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
var length = this.decoder_.readUnsignedVarint32();
return this.decoder_.readString(length);
return this.decoder_.readString(length, /*requireUtf8=*/ false);
};

/**
* Reads a string field from the binary stream, or throws an error if the next
* field in the stream is not of the correct wire type, or if the string is
* not valid utf8.
*
* @return {string} The value of the string field.
*/
jspb.BinaryReader.prototype.readStringRequireUtf8 = function () {
jspb.asserts.assert(this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
const length = this.decoder_.readUnsignedVarint32();
return this.decoder_.readString(length, /*requireUtf8=*/ true);
};


Expand Down
Loading