From 91263a90b603400da4524d644da5a6ce2d5b144e Mon Sep 17 00:00:00 2001
From: Matt Dale <9760375+matthewdale@users.noreply.github.com>
Date: Mon, 26 Apr 2021 10:04:34 -0700
Subject: [PATCH] GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in
 UnmarshalExtJSON.

---
 bson/bsonrw/json_scanner.go      | 93 +++++++++++++++++++++++++++++---
 bson/bsonrw/json_scanner_test.go | 28 ++++++++--
 bson/unmarshal_test.go           | 23 ++++----
 bson/unmarshaling_cases_test.go  | 91 +++++++++++++++++++++++++++++++
 4 files changed, 213 insertions(+), 22 deletions(-)

diff --git a/bson/bsonrw/json_scanner.go b/bson/bsonrw/json_scanner.go
index 212f348348..6c4c7bc916 100644
--- a/bson/bsonrw/json_scanner.go
+++ b/bson/bsonrw/json_scanner.go
@@ -13,8 +13,8 @@ import (
 	"io"
 	"math"
 	"strconv"
-	"strings"
 	"unicode"
+	"unicode/utf16"
 )
 
 type jsonTokenType byte
@@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool {
 	return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
 }
 
+// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
+// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
+// It is copied and lightly modified from the Go JSON decode function at
+// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
+func getu4(s []byte) rune {
+	if len(s) < 4 {
+		return -1
+	}
+	var r rune
+	for _, c := range s[:4] {
+		switch {
+		case '0' <= c && c <= '9':
+			c = c - '0'
+		case 'a' <= c && c <= 'f':
+			c = c - 'a' + 10
+		case 'A' <= c && c <= 'F':
+			c = c - 'A' + 10
+		default:
+			return -1
+		}
+		r = r*16 + rune(c)
+	}
+	return r
+}
+
 // scanString reads from an opening '"' to a closing '"' and handles escaped characters
 func (js *jsonScanner) scanString() (*jsonToken, error) {
 	var b bytes.Buffer
@@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
 			return nil, err
 		}
 
+	evalNextChar:
 		switch c {
 		case '\\':
 			c, err = js.readNextByte()
+			if err != nil {
+				if err == io.EOF {
+					return nil, errors.New("end of input in JSON string")
+				}
+				return nil, err
+			}
+
+		evalNextEscapeChar:
 			switch c {
 			case '"', '\\', '/':
 				b.WriteByte(c)
@@ -202,13 +236,60 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
 					return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
 				}
 
-				s := fmt.Sprintf(`\u%s`, us)
-				s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1))
-				if err != nil {
-					return nil, err
+				rn := getu4(us)
+
+				// If the rune we just decoded is the high or low value of a possible surrogate pair,
+				// try to decode the next sequence as the low value of a surrogate pair. We're
+				// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
+				// but need to handle cases where the input is not a valid surrogate pair.
+				// For more context on unicode surrogate pairs, see:
+				// https://www.christianfscott.com/rust-chars-vs-go-runes/
+				// https://www.unicode.org/glossary/#high_surrogate_code_point
+				if utf16.IsSurrogate(rn) {
+					c, err = js.readNextByte()
+					if err != nil {
+						return nil, errors.New("end of input in JSON string")
+					}
+
+					// If the next value isn't the beginning of a backslash escape sequence, write the
+					// current rune and goto the beginning of the next char eval block.
+					if c != '\\' {
+						b.WriteRune(rn)
+						goto evalNextChar
+					}
+
+					c, err = js.readNextByte()
+					if err != nil {
+						return nil, errors.New("end of input in JSON string")
+					}
+
+					// If the next value isn't the beginning of a unicode escape sequence, write the
+					// current rune and goto the beginning of the next escape char eval block.
+					if c != 'u' {
+						b.WriteRune(rn)
+						goto evalNextEscapeChar
+					}
+
+					err = js.readNNextBytes(us, 4, 0)
+					if err != nil {
+						return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
+					}
+
+					rn2 := getu4(us)
+
+					// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, just
+					// write the original two runes to the buffer.
+					if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
+						b.WriteRune(rnPair)
+					} else {
+						b.WriteRune(rn)
+						b.WriteRune(rn2)
+					}
+
+					break
 				}
 
-				b.WriteString(s)
+				b.WriteRune(rn)
 			default:
 				return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
 			}
diff --git a/bson/bsonrw/json_scanner_test.go b/bson/bsonrw/json_scanner_test.go
index c0795ed2c0..c8f6e9c59e 100644
--- a/bson/bsonrw/json_scanner_test.go
+++ b/bson/bsonrw/json_scanner_test.go
@@ -100,6 +100,26 @@ func TestJsonScannerValidInputs(t *testing.T) {
 			input:  `"\"\\\/\b\f\n\r\t"`,
 			tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}},
 		},
+		{
+			desc:   "valid string--surrogate pair",
+			input:  `"abc \uD834\uDd1e 123"`,
+			tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}},
+		},
+		{
+			desc:   "valid string--high surrogate at end of string",
+			input:  `"abc \uD834"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �"}},
+		},
+		{
+			desc:   "valid string--low surrogate at end of string",
+			input:  `"abc \uDD1E"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �"}},
+		},
+		{
+			desc:   "valid string--high surrogate with non-surrogate unicode value",
+			input:  `"abc \uDD1E\u00BF"`,
+			tokens: []jsonToken{{t: jttString, v: "abc �¿"}},
+		},
 		{
 			desc: "valid literal--true", input: "true",
 			tokens: []jsonToken{{t: jttBool, v: true}},
@@ -280,28 +300,28 @@ func TestJsonScannerValidInputs(t *testing.T) {
 
 			for _, token := range tc.tokens {
 				c, err := js.nextToken()
+				expectNoError(t, err, tc.desc)
 				jttDiff(t, token.t, c.t, tc.desc)
 				jtvDiff(t, token.v, c.v, tc.desc)
-				expectNoError(t, err, tc.desc)
 			}
 
 			c, err := js.nextToken()
-			jttDiff(t, jttEOF, c.t, tc.desc)
 			noerr(t, err)
+			jttDiff(t, jttEOF, c.t, tc.desc)
 
 			// testing early EOF reading
 			js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))}
 
 			for _, token := range tc.tokens {
 				c, err := js.nextToken()
+				expectNoError(t, err, tc.desc)
 				jttDiff(t, token.t, c.t, tc.desc)
 				jtvDiff(t, token.v, c.v, tc.desc)
-				expectNoError(t, err, tc.desc)
 			}
 
 			c, err = js.nextToken()
-			jttDiff(t, jttEOF, c.t, tc.desc)
 			noerr(t, err)
+			jttDiff(t, jttEOF, c.t, tc.desc)
 		})
 	}
 }
diff --git a/bson/unmarshal_test.go b/bson/unmarshal_test.go
index f17e04b183..a5e848eaae 100644
--- a/bson/unmarshal_test.go
+++ b/bson/unmarshal_test.go
@@ -95,18 +95,17 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) {
 }
 
 func TestUnmarshalExtJSONWithContext(t *testing.T) {
-	t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) {
-		type teststruct struct{ Foo int }
-		var got teststruct
-		data := []byte("{\"foo\":1}")
-		dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
-		err := UnmarshalExtJSONWithContext(dc, data, true, &got)
-		noerr(t, err)
-		want := teststruct{1}
-		if !cmp.Equal(got, want) {
-			t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want)
-		}
-	})
+	for _, tc := range unmarshalingExtTestCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := reflect.New(tc.sType).Interface()
+			dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
+			err := UnmarshalExtJSONWithContext(dc, tc.data, true, got)
+			noerr(t, err)
+			if !cmp.Equal(got, tc.want) {
+				t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want)
+			}
+		})
+	}
 }
 
 func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) {
diff --git a/bson/unmarshaling_cases_test.go b/bson/unmarshaling_cases_test.go
index 9a84fe6db0..13b40ecd5a 100644
--- a/bson/unmarshaling_cases_test.go
+++ b/bson/unmarshaling_cases_test.go
@@ -78,3 +78,94 @@ var unmarshalingTestCases = []unmarshalingTestCase{
 		docToBytes(D{{"fooBar", int32(10)}}),
 	},
 }
+
+var unmarshalingExtTestCases = []unmarshalingTestCase{
+	{
+		name: "Small struct",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo int
+		}{}),
+		data: []byte(`{"foo":1}`),
+		want: &struct {
+			Foo int
+		}{Foo: 1},
+	},
+	{
+		name: "Valid surrogate pair",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"\uD834\uDd1e"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "𝄞"},
+	},
+	{
+		name: "Valid surrogate pair with other values",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "abc 𝄞 123"},
+	},
+	{
+		name: "High surrogate value with no following low surrogate value",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"abc \uD834 123"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "abc � 123"},
+	},
+	{
+		name: "High surrogate value at end of string",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"\uD834"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "�"},
+	},
+	{
+		name: "Low surrogate value with no preceeding high surrogate value",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"abc \uDd1e 123"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "abc � 123"},
+	},
+	{
+		name: "Low surrogate value at end of string",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"\uDd1e"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "�"},
+	},
+	{
+		name: "High surrogate value with non-surrogate unicode value",
+		reg:  DefaultRegistry,
+		sType: reflect.TypeOf(struct {
+			Foo string
+		}{}),
+		data: []byte(`{"foo":"\uD834\u00BF"}`),
+		want: &struct {
+			Foo string
+		}{Foo: "�¿"},
+	},
+}