GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in Unmarsha…

…lExtJSON. (#649) * GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. * Correct err handling in jsonScanner.ScanString and remove unused field in UnmarshalExtJSON test. * Explicitly write unicode.ReplacementChar for invalid surrogate and simplify test types. * Add tests for high surrogate followed by non-Unicode escape sequence and 4-byte UTF-8 extJSON marshaling. * Explicitly write the Unicode replacement character for an un-paired Unicode surrogate value.
mongodb · May 4, 2021 · 4b2077a · 4b2077a
1 parent a69cc5e
commit 4b2077a
Show file tree

Hide file tree

Showing 4 changed files with 204 additions and 22 deletions.
diff --git a/bson/bsonrw/json_scanner.go b/bson/bsonrw/json_scanner.go
@@ -13,8 +13,8 @@ import (
 	"io"
 	"math"
 	"strconv"
-	"strings"
 	"unicode"
+	"unicode/utf16"
 )
 
 type jsonTokenType byte
@@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool {
 	return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
 }
 
+// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
+// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
+// It is copied and lightly modified from the Go JSON decode function at
+// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
+func getu4(s []byte) rune {
+	if len(s) < 4 {
+		return -1
+	}
+	var r rune
+	for _, c := range s[:4] {
+		switch {
+		case '0' <= c && c <= '9':
+			c = c - '0'
+		case 'a' <= c && c <= 'f':
+			c = c - 'a' + 10
+		case 'A' <= c && c <= 'F':
+			c = c - 'A' + 10
+		default:
+			return -1
+		}
+		r = r*16 + rune(c)
+	}
+	return r
+}
+
 // scanString reads from an opening '"' to a closing '"' and handles escaped characters
 func (js *jsonScanner) scanString() (*jsonToken, error) {
 	var b bytes.Buffer
@@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
 			return nil, err
 		}
 
+	evalNextChar:
 		switch c {
 		case '\\':
 			c, err = js.readNextByte()
+			if err != nil {
+				if err == io.EOF {
+					return nil, errors.New("end of input in JSON string")
+				}
+				return nil, err
+			}
+
+		evalNextEscapeChar:
 			switch c {
 			case '"', '\\', '/':
 				b.WriteByte(c)
@@ -202,13 +236,68 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
 					return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
 				}
 
-				s := fmt.Sprintf(`\u%s`, us)
-				s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1))
-				if err != nil {
-					return nil, err
+				rn := getu4(us)
+
+				// If the rune we just decoded is the high or low value of a possible surrogate pair,
+				// try to decode the next sequence as the low value of a surrogate pair. We're
+				// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
+				// but need to handle cases where the input is not a valid surrogate pair.
+				// For more context on unicode surrogate pairs, see:
+				// https://www.christianfscott.com/rust-chars-vs-go-runes/
+				// https://www.unicode.org/glossary/#high_surrogate_code_point
+				if utf16.IsSurrogate(rn) {
+					c, err = js.readNextByte()
+					if err != nil {
+						if err == io.EOF {
+							return nil, errors.New("end of input in JSON string")
+						}
+						return nil, err
+					}
+
+					// If the next value isn't the beginning of a backslash escape sequence, write
+					// the Unicode replacement character for the surrogate value and goto the
+					// beginning of the next char eval block.
+					if c != '\\' {
+						b.WriteRune(unicode.ReplacementChar)
+						goto evalNextChar
+					}
+
+					c, err = js.readNextByte()
+					if err != nil {
+						if err == io.EOF {
+							return nil, errors.New("end of input in JSON string")
+						}
+						return nil, err
+					}
+
+					// If the next value isn't the beginning of a unicode escape sequence, write the
+					// Unicode replacement character for the surrogate value and goto the beginning
+					// of the next escape char eval block.
+					if c != 'u' {
+						b.WriteRune(unicode.ReplacementChar)
+						goto evalNextEscapeChar
+					}
+
+					err = js.readNNextBytes(us, 4, 0)
+					if err != nil {
+						return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
+					}
+
+					rn2 := getu4(us)
+
+					// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write
+					// the Unicode replacement character for the surrogate value and the 2nd decoded rune.
+					if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
+						b.WriteRune(rnPair)
+					} else {
+						b.WriteRune(unicode.ReplacementChar)
+						b.WriteRune(rn2)
+					}
+
+					break
 				}
 
-				b.WriteString(s)
+				b.WriteRune(rn)
 			default:
 				return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
 			}

diff --git a/bson/bsonrw/json_scanner_test.go b/bson/bsonrw/json_scanner_test.go
@@ -100,6 +100,31 @@ func TestJsonScannerValidInputs(t *testing.T) {
 			input:  `"\"\\\/\b\f\n\r\t"`,
 			tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}},
 		},
+		{
+			desc:   "valid string--surrogate pair",
+			input:  `"abc \uD834\uDd1e 123"`,
+			tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}},
+		},
+		{
+			desc:   "valid string--high surrogate at end of string",
+			input:  `"abc \uD834"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �"}},
+		},
+		{
+			desc:   "valid string--low surrogate at end of string",
+			input:  `"abc \uDD1E"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �"}},
+		},
+		{
+			desc:   "valid string--high surrogate with non-surrogate Unicode value",
+			input:  `"abc \uDD1E\u00BF"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �¿"}},
+		},
+		{
+			desc:   "valid string--high surrogate with non-Unicode escape sequence",
+			input:  `"abc \uDD1E\t"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �\t"}},
+		},
 		{
 			desc: "valid literal--true", input: "true",
 			tokens: []jsonToken{{t: jttBool, v: true}},
@@ -280,28 +305,28 @@ func TestJsonScannerValidInputs(t *testing.T) {
 
 			for _, token := range tc.tokens {
 				c, err := js.nextToken()
+				expectNoError(t, err, tc.desc)
 				jttDiff(t, token.t, c.t, tc.desc)
 				jtvDiff(t, token.v, c.v, tc.desc)
-				expectNoError(t, err, tc.desc)
 			}
 
 			c, err := js.nextToken()
-			jttDiff(t, jttEOF, c.t, tc.desc)
 			noerr(t, err)
+			jttDiff(t, jttEOF, c.t, tc.desc)
 
 			// testing early EOF reading
 			js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))}
 
 			for _, token := range tc.tokens {
 				c, err := js.nextToken()
+				expectNoError(t, err, tc.desc)
 				jttDiff(t, token.t, c.t, tc.desc)
 				jtvDiff(t, token.v, c.v, tc.desc)
-				expectNoError(t, err, tc.desc)
 			}
 
 			c, err = js.nextToken()
-			jttDiff(t, jttEOF, c.t, tc.desc)
 			noerr(t, err)
+			jttDiff(t, jttEOF, c.t, tc.desc)
 		})
 	}
 }

diff --git a/bson/unmarshal_test.go b/bson/unmarshal_test.go
@@ -95,18 +95,81 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) {
 }
 
 func TestUnmarshalExtJSONWithContext(t *testing.T) {
-	t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) {
-		type teststruct struct{ Foo int }
-		var got teststruct
-		data := []byte("{\"foo\":1}")
-		dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
-		err := UnmarshalExtJSONWithContext(dc, data, true, &got)
-		noerr(t, err)
-		want := teststruct{1}
-		if !cmp.Equal(got, want) {
-			t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want)
-		}
-	})
+	type fooInt struct {
+		Foo int
+	}
+
+	type fooString struct {
+		Foo string
+	}
+
+	var cases = []struct {
+		name  string
+		sType reflect.Type
+		want  interface{}
+		data  []byte
+	}{
+		{
+			name:  "Small struct",
+			sType: reflect.TypeOf(fooInt{}),
+			data:  []byte(`{"foo":1}`),
+			want:  &fooInt{Foo: 1},
+		},
+		{
+			name:  "Valid surrogate pair",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"\uD834\uDd1e"}`),
+			want:  &fooString{Foo: "𝄞"},
+		},
+		{
+			name:  "Valid surrogate pair with other values",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"abc \uD834\uDd1e 123"}`),
+			want:  &fooString{Foo: "abc 𝄞 123"},
+		},
+		{
+			name:  "High surrogate value with no following low surrogate value",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"abc \uD834 123"}`),
+			want:  &fooString{Foo: "abc � 123"},
+		},
+		{
+			name:  "High surrogate value at end of string",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"\uD834"}`),
+			want:  &fooString{Foo: "�"},
+		},
+		{
+			name:  "Low surrogate value with no preceeding high surrogate value",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"abc \uDd1e 123"}`),
+			want:  &fooString{Foo: "abc � 123"},
+		},
+		{
+			name:  "Low surrogate value at end of string",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"\uDd1e"}`),
+			want:  &fooString{Foo: "�"},
+		},
+		{
+			name:  "High surrogate value with non-surrogate unicode value",
+			sType: reflect.TypeOf(fooString{}),
+			data:  []byte(`{"foo":"\uD834\u00BF"}`),
+			want:  &fooString{Foo: "�¿"},
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := reflect.New(tc.sType).Interface()
+			dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
+			err := UnmarshalExtJSONWithContext(dc, tc.data, true, got)
+			noerr(t, err)
+			if !cmp.Equal(got, tc.want) {
+				t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want)
+			}
+		})
+	}
 }
 
 func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) {

diff --git a/data/bson-corpus/string.json b/data/bson-corpus/string.json
@@ -28,6 +28,11 @@
             "canonical_bson": "190000000261000D000000E29886E29886E29886E298860000",
             "canonical_extjson": "{\"a\" : \"\\u2606\\u2606\\u2606\\u2606\"}"
         },
+        {
+            "description": "four-byte UTF-8 (𝄞)",
+            "canonical_bson": "1100000002610005000000F09D849E0000",
+            "canonical_extjson": "{\"a\" : \"𝄞\"}"
+        },
         {
             "description": "Embedded nulls",
             "canonical_bson": "190000000261000D0000006162006261620062616261620000",