Port fix: Fix JSPB binary utf8 decoding to be spec compliant.

Our prior behavior was extremely undefined when confronted with errors, it would read out of bounds, accept overlong encodings, skip over out of range bytes, compose out of range codepoints. The new implementation always detects and handles errors consistently by either throwing or using replacement characters (� aka \uFFFD) This also adds support for aligning with the proto3 spec to the code generator which requires that parsing fail for proto3 messages with invalid utf8 payloads for string fields. For now, actual failing is disabled via the goog.define jspb.binary.ENFORCE_UTF8 which is set to NEVER. A future change will flip this to DEFAULT.
protocolbuffers · Jul 16, 2024 · 1abea1a · 1abea1a
1 parent 5aee743
commit 1abea1a
Show file tree

Hide file tree

Showing 6 changed files with 517 additions and 76 deletions.
diff --git a/binary/decoder.js b/binary/decoder.js
@@ -47,7 +47,7 @@
 goog.provide('jspb.BinaryDecoder');
 
 goog.require('jspb.asserts');
-goog.require('goog.crypt');
+goog.require('jspb.binary.utf8');
 goog.require('jspb.utils');
 
 
@@ -256,7 +256,7 @@ jspb.BinaryDecoder.prototype.setCursor = function(cursor) {
  */
 jspb.BinaryDecoder.prototype.advance = function(count) {
   this.cursor_ += count;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
 };
 
 
@@ -397,6 +397,17 @@ jspb.BinaryDecoder.prototype.readSplitFixed64 = function(convert) {
   return convert(lowBits, highBits);
 };
 
+/**
+  * Asserts that our cursor is in bounds.
+  *
+  * @private
+  * @return {void}
+  */
+jspb.BinaryDecoder.prototype.checkCursor = function () {
+  if (this.cursor_ > this.end_) {
+    asserts.fail('Read past the end ' + this.cursor_ + ' > ' + this.end_);
+  }
+}
 
 /**
  * Skips over a varint in the block without decoding it.
@@ -452,31 +463,31 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
   var x = (temp & 0x7F);
   if (temp < 128) {
     this.cursor_ += 1;
-    jspb.asserts.assert(this.cursor_ <= this.end_);
+    this.checkCursor();
     return x;
   }
 
   temp = bytes[this.cursor_ + 1];
   x |= (temp & 0x7F) << 7;
   if (temp < 128) {
     this.cursor_ += 2;
-    jspb.asserts.assert(this.cursor_ <= this.end_);
+    this.checkCursor();
     return x;
   }
 
   temp = bytes[this.cursor_ + 2];
   x |= (temp & 0x7F) << 14;
   if (temp < 128) {
     this.cursor_ += 3;
-    jspb.asserts.assert(this.cursor_ <= this.end_);
+    this.checkCursor();
     return x;
   }
 
   temp = bytes[this.cursor_ + 3];
   x |= (temp & 0x7F) << 21;
   if (temp < 128) {
     this.cursor_ += 4;
-    jspb.asserts.assert(this.cursor_ <= this.end_);
+    this.checkCursor();
     return x;
   }
 
@@ -486,7 +497,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
     // We're reading the high bits of an unsigned varint. The byte we just read
     // also contains bits 33 through 35, which we're going to discard.
     this.cursor_ += 5;
-    jspb.asserts.assert(this.cursor_ <= this.end_);
+    this.checkCursor();
     return x >>> 0;
   }
 
@@ -500,7 +511,7 @@ jspb.BinaryDecoder.prototype.readUnsignedVarint32 = function() {
     jspb.asserts.assert(false);
   }
 
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return x;
 };
 
@@ -679,7 +690,7 @@ jspb.BinaryDecoder.prototype.readZigzagVarint64String = function() {
 jspb.BinaryDecoder.prototype.readUint8 = function() {
   var a = this.bytes_[this.cursor_ + 0];
   this.cursor_ += 1;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return a;
 };
 
@@ -694,7 +705,7 @@ jspb.BinaryDecoder.prototype.readUint16 = function() {
   var a = this.bytes_[this.cursor_ + 0];
   var b = this.bytes_[this.cursor_ + 1];
   this.cursor_ += 2;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return (a << 0) | (b << 8);
 };
 
@@ -711,7 +722,7 @@ jspb.BinaryDecoder.prototype.readUint32 = function() {
   var c = this.bytes_[this.cursor_ + 2];
   var d = this.bytes_[this.cursor_ + 3];
   this.cursor_ += 4;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return ((a << 0) | (b << 8) | (c << 16) | (d << 24)) >>> 0;
 };
 
@@ -756,7 +767,7 @@ jspb.BinaryDecoder.prototype.readUint64String = function() {
 jspb.BinaryDecoder.prototype.readInt8 = function() {
   var a = this.bytes_[this.cursor_ + 0];
   this.cursor_ += 1;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return (a << 24) >> 24;
 };
 
@@ -771,7 +782,7 @@ jspb.BinaryDecoder.prototype.readInt16 = function() {
   var a = this.bytes_[this.cursor_ + 0];
   var b = this.bytes_[this.cursor_ + 1];
   this.cursor_ += 2;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return (((a << 0) | (b << 8)) << 16) >> 16;
 };
 
@@ -788,7 +799,7 @@ jspb.BinaryDecoder.prototype.readInt32 = function() {
   var c = this.bytes_[this.cursor_ + 2];
   var d = this.bytes_[this.cursor_ + 3];
   this.cursor_ += 4;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return (a << 0) | (b << 8) | (c << 16) | (d << 24);
 };
 
@@ -858,7 +869,9 @@ jspb.BinaryDecoder.prototype.readDouble = function() {
  * @export
  */
 jspb.BinaryDecoder.prototype.readBool = function() {
-  return !!this.bytes_[this.cursor_++];
+  const b = !!this.bytes_[this.cursor_++];
+  this.checkCursor();
+  return b;
 };
 
 
@@ -879,59 +892,17 @@ jspb.BinaryDecoder.prototype.readEnum = function() {
  * Supports codepoints from U+0000 up to U+10FFFF.
  * (http://en.wikipedia.org/wiki/UTF-8).
  * @param {number} length The length of the string to read.
+ * @param {boolean} requireUtf8 Whether to throw when invalid utf8 is found.
  * @return {string} The decoded string.
  * @export
  */
-jspb.BinaryDecoder.prototype.readString = function(length) {
-  var bytes = this.bytes_;
-  var cursor = this.cursor_;
-  var end = cursor + length;
-  var codeUnits = [];
-
-  var result = '';
-  while (cursor < end) {
-    var c = bytes[cursor++];
-    if (c < 128) {  // Regular 7-bit ASCII.
-      codeUnits.push(c);
-    } else if (c < 192) {
-      // UTF-8 continuation mark. We are out of sync. This
-      // might happen if we attempted to read a character
-      // with more than four bytes.
-      continue;
-    } else if (c < 224) {  // UTF-8 with two bytes.
-      var c2 = bytes[cursor++];
-      codeUnits.push(((c & 31) << 6) | (c2 & 63));
-    } else if (c < 240) {  // UTF-8 with three bytes.
-      var c2 = bytes[cursor++];
-      var c3 = bytes[cursor++];
-      codeUnits.push(((c & 15) << 12) | ((c2 & 63) << 6) | (c3 & 63));
-    } else if (c < 248) {  // UTF-8 with 4 bytes.
-      var c2 = bytes[cursor++];
-      var c3 = bytes[cursor++];
-      var c4 = bytes[cursor++];
-      // Characters written on 4 bytes have 21 bits for a codepoint.
-      // We can't fit that on 16bit characters, so we use surrogates.
-      var codepoint =
-          ((c & 7) << 18) | ((c2 & 63) << 12) | ((c3 & 63) << 6) | (c4 & 63);
-      // Surrogates formula from wikipedia.
-      // 1. Subtract 0x10000 from codepoint
-      codepoint -= 0x10000;
-      // 2. Split this into the high 10-bit value and the low 10-bit value
-      // 3. Add 0xD800 to the high value to form the high surrogate
-      // 4. Add 0xDC00 to the low value to form the low surrogate:
-      var low = (codepoint & 1023) + 0xDC00;
-      var high = ((codepoint >> 10) & 1023) + 0xD800;
-      codeUnits.push(high, low);
-    }
 
-    // Avoid exceeding the maximum stack size when calling `apply`.
-    if (codeUnits.length >= 8192) {
-      result += String.fromCharCode.apply(null, codeUnits);
-      codeUnits.length = 0;
-    }
-  }
-  result += goog.crypt.byteArrayToString(codeUnits);
-  this.cursor_ = cursor;
+jspb.BinaryDecoder.prototype.readString = function (length, requireUtf8) {
+  const cursor = this.cursor_;
+  this.cursor_ += length;
+  this.checkCursor();
+  const result =
+    jspb.binary.utf8.decodeUtf8(jspb.asserts.assert(this.bytes_), cursor, length, requireUtf8);
   return result;
 };
 
@@ -966,7 +937,7 @@ jspb.BinaryDecoder.prototype.readBytes = function(length) {
   var result = this.bytes_.subarray(this.cursor_, this.cursor_ + length);
 
   this.cursor_ += length;
-  jspb.asserts.assert(this.cursor_ <= this.end_);
+  this.checkCursor();
   return result;
 };
 

diff --git a/binary/decoder_test.js b/binary/decoder_test.js
@@ -354,7 +354,7 @@ describe('binaryDecoderTest', () => {
 
     const decoder = jspb.BinaryDecoder.alloc(encoder.end());
 
-    expect(decoder.readString(len)).toEqual(long_string);
+    expect(decoder.readString(len, true)).toEqual(long_string);
   });
 
   /**
@@ -375,11 +375,11 @@ describe('binaryDecoderTest', () => {
 
     const decoder = jspb.BinaryDecoder.alloc(encoder.end());
 
-    expect(decoder.readString(ascii.length)).toEqual(ascii);
-    expect(utf8_two_bytes).toEqual(decoder.readString(utf8_two_bytes.length));
+    expect(decoder.readString(ascii.length, true)).toEqual(ascii);
+    expect(utf8_two_bytes).toEqual(decoder.readString(2, true));
     expect(utf8_three_bytes)
-        .toEqual(decoder.readString(utf8_three_bytes.length));
-    expect(utf8_four_bytes).toEqual(decoder.readString(utf8_four_bytes.length));
+      .toEqual(decoder.readString(3, true));
+    expect(utf8_four_bytes).toEqual(decoder.readString(4, true));
   });
 
   /**

diff --git a/binary/reader.js b/binary/reader.js
@@ -52,6 +52,26 @@ goog.require('jspb.BinaryConstants');
 goog.require('jspb.BinaryDecoder');
 goog.require('jspb.utils');
 
+/**
+ * Whether to enforce that string fields are valid utf8.
+ *
+ * <p>Currently set to `ALWAYS`, can be set to `DEPRECATED_PROTO3_ONLY` to only
+ * enforce utf8 for proto3 string fields, for proto2 string fields it will use
+ * replacement characters when encoding errors are found.
+ *
+ * <p>TODO: Remove the flag, simplify BinaryReader to remove
+ * readStringRequireUtf8 and related support in the code generator et. al.
+ *
+ * @define {string}
+ */
+const ENFORCE_UTF8 = goog.define('jspb.binary.ENFORCE_UTF8', 'ALWAYS');
+
+// Constrain the set of values to only these two.
+jspb.asserts.assert(
+  ENFORCE_UTF8 === 'DEPRECATED_PROTO3_ONLY' || ENFORCE_UTF8 === 'ALWAYS');
+
+const /** boolean */ UTF8_PARSING_ERRORS_ARE_FATAL = ENFORCE_UTF8 === 'ALWAYS';
+
 
 
 /**
@@ -996,10 +1016,29 @@ jspb.BinaryReader.prototype.readEnum = function() {
  * @export
  */
 jspb.BinaryReader.prototype.readString = function() {
+  // delegate to the other reader so that inlining can eliminate this method
+  // in the common case.
+  if (UTF8_PARSING_ERRORS_ARE_FATAL) {
+    return this.readStringRequireUtf8();
+  }
+
   jspb.asserts.assert(
       this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
   var length = this.decoder_.readUnsignedVarint32();
-  return this.decoder_.readString(length);
+  return this.decoder_.readString(length, /*requireUtf8=*/ false);
+};
+
+/**
+ * Reads a string field from the binary stream, or throws an error if the next
+ * field in the stream is not of the correct wire type, or if the string is
+ * not valid utf8.
+ *
+ * @return {string} The value of the string field.
+ */
+jspb.BinaryReader.prototype.readStringRequireUtf8 = function () {
+  jspb.asserts.assert(this.nextWireType_ == jspb.BinaryConstants.WireType.DELIMITED);
+  const length = this.decoder_.readUnsignedVarint32();
+  return this.decoder_.readString(length, /*requireUtf8=*/ true);
 };