Skip to content

Commit

Permalink
Port fix: Fix JSPB binary utf8 decoding to be spec compliant.
Browse files Browse the repository at this point in the history
Our prior behavior was extremely undefined when confronted with errors, it would read out of bounds, accept overlong encodings, skip over out of range bytes, compose out of range codepoints. The new implementation always detects and handles errors consistently by either throwing or using replacement characters (� aka \uFFFD)

This also adds support for aligning with the proto3 spec to the code generator which requires that parsing fail for proto3 messages with invalid utf8 payloads for string fields. For now, actual failing is disabled via the goog.define jspb.binary.ENFORCE_UTF8 which is set to NEVER. A future change will flip this to DEFAULT.
  • Loading branch information
dibenede committed Jul 16, 2024
1 parent 5aee743 commit 1abea1a
Show file tree
Hide file tree
Showing 6 changed files with 517 additions and 76 deletions.
101 changes: 36 additions & 65 deletions binary/decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
goog.provide('jspb.BinaryDecoder');

goog.require('jspb.asserts');
goog.require('goog.crypt');
goog.require('jspb.binary.utf8');
goog.require('jspb.utils');


Expand Down Expand Up @@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) {
*/
jspb.BinaryDecoder.prototype.advance = function(count) {
this.cursor_ += count;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
};


Expand Down Expand Up @@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) {
return convert(lowBits, highBits);
};

/**
* Asserts that our cursor is in bounds.
*
* @private
* @return {void}
*/
jspb.BinaryDecoder.prototype.checkCursor = function () {
if (this.cursor_ > this.end_) {
asserts.fail('Read past the end ' + this.cursor_ + ' > ' + this.end_);
}
}

/**
* Skips over a varint in the block without decoding it.
Expand Down Expand Up @@ -452,31 +463,31 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
var x = (temp & 0x7F);
if (temp < 128) {
this.cursor_ += 1;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

temp = bytes[this.cursor_ + 1];
x |= (temp & 0x7F) << 7;
if (temp < 128) {
this.cursor_ += 2;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

temp = bytes[this.cursor_ + 2];
x |= (temp & 0x7F) << 14;
if (temp < 128) {
this.cursor_ += 3;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

temp = bytes[this.cursor_ + 3];
x |= (temp & 0x7F) << 21;
if (temp < 128) {
this.cursor_ += 4;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
}

Expand All @@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
// We're reading the high bits of an unsigned varint. The byte we just read
// also contains bits 33 through 35, which we're going to discard.
this.cursor_ += 5;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x >>> 0;
}

Expand All @@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
jspb.asserts.assert(false);
}

jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return x;
};

Expand Down Expand Up @@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() {
jspb.BinaryDecoder.prototype.readUint8 = function() {
var a = this.bytes_[this.cursor_ + 0];
this.cursor_ += 1;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return a;
};

Expand All @@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() {
var a = this.bytes_[this.cursor_ + 0];
var b = this.bytes_[this.cursor_ + 1];
this.cursor_ += 2;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (a << 0) | (b << 8);
};

Expand All @@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() {
var c = this.bytes_[this.cursor_ + 2];
var d = this.bytes_[this.cursor_ + 3];
this.cursor_ += 4;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0;
};

Expand Down Expand Up @@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() {
jspb.BinaryDecoder.prototype.readInt8 = function() {
var a = this.bytes_[this.cursor_ + 0];
this.cursor_ += 1;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (a << 24) >> 24;
};

Expand All @@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() {
var a = this.bytes_[this.cursor_ + 0];
var b = this.bytes_[this.cursor_ + 1];
this.cursor_ += 2;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (((a << 0) | (b << 8)) << 16) >> 16;
};

Expand All @@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() {
var c = this.bytes_[this.cursor_ + 2];
var d = this.bytes_[this.cursor_ + 3];
this.cursor_ += 4;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return (a << 0) | (b << 8) | (c << 16) | (d << 24);
};

Expand Down Expand Up @@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() {
* @export
*/
jspb.BinaryDecoder.prototype.readBool = function() {
return !!this.bytes_[this.cursor_++];
const b = !!this.bytes_[this.cursor_++];
this.checkCursor();
return b;
};


Expand All @@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
* Supports codepoints from U+0000 up to U+10FFFF.
* (http://en.wikipedia.org/wiki/UTF-8).
* @param {number} length The length of the string to read.
* @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found.
* @return {string} The decoded string.
* @export
*/
jspb.BinaryDecoder.prototype.readString = function(length) {
var bytes = this.bytes_;
var cursor = this.cursor_;
var end = cursor + length;
var codeUnits = [];

var result = '';
while (cursor < end) {
var c = bytes[cursor++];
if (c < 128) { // Regular 7-bit ASCII.
codeUnits.push(c);
} else if (c < 192) {
// UTF-8 continuation mark. We are out of sync. This
// might happen if we attempted to read a character
// with more than four bytes.
continue;
} else if (c < 224) { // UTF-8 with two bytes.
var c2 = bytes[cursor++];
codeUnits.push(((c & 31) << 6) | (c2 & 63));
} else if (c < 240) { // UTF-8 with three bytes.
var c2 = bytes[cursor++];
var c3 = bytes[cursor++];
codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
} else if (c < 248) { // UTF-8 with 4 bytes.
var c2 = bytes[cursor++];
var c3 = bytes[cursor++];
var c4 = bytes[cursor++];
// Characters written on 4 bytes have 21 bits for a codepoint.
// We can't fit that on 16bit characters, so we use surrogates.
var codepoint =
((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
// Surrogates formula from wikipedia.
// 1. Subtract 0x10000 from codepoint
codepoint -= 0x10000;
// 2. Split this into the high 10-bit value and the low 10-bit value
// 3. Add 0xD800 to the high value to form the high surrogate
// 4. Add 0xDC00 to the low value to form the low surrogate:
var low = (codepoint & 1023) + 0xDC00;
var high = ((codepoint >> 10) & 1023) + 0xD800;
codeUnits.push(high, low);
}

// Avoid exceeding the maximum stack size when calling `apply`.
if (codeUnits.length >= 8192) {
result += String.fromCharCode.apply(null, codeUnits);
codeUnits.length = 0;
}
}
result += goog.crypt.byteArrayToString(codeUnits);
this.cursor_ = cursor;
jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) {
const cursor = this.cursor_;
this.cursor_ += length;
this.checkCursor();
const result =
jspb.binary.utf8.decodeUtf8(jspb.asserts.assert(this.bytes_), cursor, length, requireUtf8);
return result;
};

Expand Down Expand Up @@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) {
var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length);

this.cursor_ += length;
jspb.asserts.assert(this.cursor_ <= this.end_);
this.checkCursor();
return result;
};

Expand Down
10 changes: 5 additions & 5 deletions binary/decoder_test.js
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => {

const decoder = jspb.BinaryDecoder.alloc(encoder.end());

expect(decoder.readString(len)).toEqual(long_string);
expect(decoder.readString(len, true)).toEqual(long_string);
});

/**
Expand All @@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => {

const decoder = jspb.BinaryDecoder.alloc(encoder.end());

expect(decoder.readString(ascii.length)).toEqual(ascii);
expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length));
expect(decoder.readString(ascii.length, true)).toEqual(ascii);
expect(utf8_two_bytes).toEqual(decoder.readString(2, true));
expect(utf8_three_bytes)
.toEqual(decoder.readString(utf8_three_bytes.length));
expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length));
.toEqual(decoder.readString(3, true));
expect(utf8_four_bytes).toEqual(decoder.readString(4, true));
});

/**
Expand Down
41 changes: 40 additions & 1 deletion binary/reader.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants');
goog.require('jspb.BinaryDecoder');
goog.require('jspb.utils');

/**
* Whether to enforce that string fields are valid utf8.
*
* <p>Currently set to `ALWAYS`, can be set to `DEPRECATED_PROTO3_ONLY` to only
* enforce utf8 for proto3 string fields, for proto2 string fields it will use
* replacement characters when encoding errors are found.
*
* <p>TODO: Remove the flag, simplify BinaryReader to remove
* readStringRequireUtf8 and related support in the code generator et. al.
*
* @define {string}
*/
const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS');

// Constrain the set of values to only these two.
jspb.asserts.assert(
ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY' || ENFORCE_UTF8 === 'ALWAYS');

const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS';



/**
Expand Down Expand Up @@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() {
* @export
*/
jspb.BinaryReader.prototype.readString = function() {
// delegate to the other reader so that inlining can eliminate this method
// in the common case.
if (UTF8_PARSING_ERRORS_ARE_FATAL) {
return this.readStringRequireUtf8();
}

jspb.asserts.assert(
this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
var length = this.decoder_.readUnsignedVarint32();
return this.decoder_.readString(length);
return this.decoder_.readString(length, /*requireUtf8=*/ false);
};

/**
* Reads a string field from the binary stream, or throws an error if the next
* field in the stream is not of the correct wire type, or if the string is
* not valid utf8.
*
* @return {string} The value of the string field.
*/
jspb.BinaryReader.prototype.readStringRequireUtf8 = function () {
jspb.asserts.assert(this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
const length = this.decoder_.readUnsignedVarint32();
return this.decoder_.readString(length, /*requireUtf8=*/ true);
};


Expand Down
Loading

0 comments on commit 1abea1a

Please sign in to comment.