From 91263a90b603400da4524d644da5a6ce2d5b144e Mon Sep 17 00:00:00 2001 From: Matt Dale <9760375+matthewdale@users.noreply.github.com> Date: Mon, 26 Apr 2021 10:04:34 -0700 Subject: [PATCH] GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. --- bson/bsonrw/json_scanner.go | 93 +++++++++++++++++++++++++++++--- bson/bsonrw/json_scanner_test.go | 28 ++++++++-- bson/unmarshal_test.go | 23 ++++---- bson/unmarshaling_cases_test.go | 91 +++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+), 22 deletions(-) diff --git a/bson/bsonrw/json_scanner.go b/bson/bsonrw/json_scanner.go index 212f348348..6c4c7bc916 100644 --- a/bson/bsonrw/json_scanner.go +++ b/bson/bsonrw/json_scanner.go @@ -13,8 +13,8 @@ import ( "io" "math" "strconv" - "strings" "unicode" + "unicode/utf16" ) type jsonTokenType byte @@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool { return c == ',' || c == '}' || c == ']' || isWhiteSpace(c) } +// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune, +// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present. +// It is copied and lightly modified from the Go JSON decode function at +// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188 +func getu4(s []byte) rune { + if len(s) < 4 { + return -1 + } + var r rune + for _, c := range s[:4] { + switch { + case '0' <= c && c <= '9': + c = c - '0' + case 'a' <= c && c <= 'f': + c = c - 'a' + 10 + case 'A' <= c && c <= 'F': + c = c - 'A' + 10 + default: + return -1 + } + r = r*16 + rune(c) + } + return r +} + // scanString reads from an opening '"' to a closing '"' and handles escaped characters func (js *jsonScanner) scanString() (*jsonToken, error) { var b bytes.Buffer @@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) { return nil, err } + evalNextChar: switch c { case '\\': c, err = js.readNextByte() + if err != nil { + if err == io.EOF { + return nil, errors.New("end of input in JSON string") + } + return nil, err + } + + evalNextEscapeChar: switch c { case '"', '\\', '/': b.WriteByte(c) @@ -202,13 +236,60 @@ func (js *jsonScanner) scanString() (*jsonToken, error) { return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us) } - s := fmt.Sprintf(`\u%s`, us) - s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1)) - if err != nil { - return nil, err + rn := getu4(us) + + // If the rune we just decoded is the high or low value of a possible surrogate pair, + // try to decode the next sequence as the low value of a surrogate pair. We're + // expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"), + // but need to handle cases where the input is not a valid surrogate pair. + // For more context on unicode surrogate pairs, see: + // https://www.christianfscott.com/rust-chars-vs-go-runes/ + // https://www.unicode.org/glossary/#high_surrogate_code_point + if utf16.IsSurrogate(rn) { + c, err = js.readNextByte() + if err != nil { + return nil, errors.New("end of input in JSON string") + } + + // If the next value isn't the beginning of a backslash escape sequence, write the + // current rune and goto the beginning of the next char eval block. + if c != '\\' { + b.WriteRune(rn) + goto evalNextChar + } + + c, err = js.readNextByte() + if err != nil { + return nil, errors.New("end of input in JSON string") + } + + // If the next value isn't the beginning of a unicode escape sequence, write the + // current rune and goto the beginning of the next escape char eval block. + if c != 'u' { + b.WriteRune(rn) + goto evalNextEscapeChar + } + + err = js.readNNextBytes(us, 4, 0) + if err != nil { + return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us) + } + + rn2 := getu4(us) + + // Try to decode the pair of runes as a utf16 surrogate pair. If that fails, just + // write the original two runes to the buffer. + if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar { + b.WriteRune(rnPair) + } else { + b.WriteRune(rn) + b.WriteRune(rn2) + } + + break } - b.WriteString(s) + b.WriteRune(rn) default: return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c) } diff --git a/bson/bsonrw/json_scanner_test.go b/bson/bsonrw/json_scanner_test.go index c0795ed2c0..c8f6e9c59e 100644 --- a/bson/bsonrw/json_scanner_test.go +++ b/bson/bsonrw/json_scanner_test.go @@ -100,6 +100,26 @@ func TestJsonScannerValidInputs(t *testing.T) { input: `"\"\\\/\b\f\n\r\t"`, tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}}, }, + { + desc: "valid string--surrogate pair", + input: `"abc \uD834\uDd1e 123"`, + tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}}, + }, + { + desc: "valid string--high surrogate at end of string", + input: `"abc \uD834"`, + tokens: []jsonToken{{t: jttString, v: "abc �"}}, + }, + { + desc: "valid string--low surrogate at end of string", + input: `"abc \uDD1E"`, + tokens: []jsonToken{{t: jttString, v: "abc �"}}, + }, + { + desc: "valid string--high surrogate with non-surrogate unicode value", + input: `"abc \uDD1E\u00BF"`, + tokens: []jsonToken{{t: jttString, v: "abc �¿"}}, + }, { desc: "valid literal--true", input: "true", tokens: []jsonToken{{t: jttBool, v: true}}, @@ -280,28 +300,28 @@ func TestJsonScannerValidInputs(t *testing.T) { for _, token := range tc.tokens { c, err := js.nextToken() + expectNoError(t, err, tc.desc) jttDiff(t, token.t, c.t, tc.desc) jtvDiff(t, token.v, c.v, tc.desc) - expectNoError(t, err, tc.desc) } c, err := js.nextToken() - jttDiff(t, jttEOF, c.t, tc.desc) noerr(t, err) + jttDiff(t, jttEOF, c.t, tc.desc) // testing early EOF reading js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))} for _, token := range tc.tokens { c, err := js.nextToken() + expectNoError(t, err, tc.desc) jttDiff(t, token.t, c.t, tc.desc) jtvDiff(t, token.v, c.v, tc.desc) - expectNoError(t, err, tc.desc) } c, err = js.nextToken() - jttDiff(t, jttEOF, c.t, tc.desc) noerr(t, err) + jttDiff(t, jttEOF, c.t, tc.desc) }) } } diff --git a/bson/unmarshal_test.go b/bson/unmarshal_test.go index f17e04b183..a5e848eaae 100644 --- a/bson/unmarshal_test.go +++ b/bson/unmarshal_test.go @@ -95,18 +95,17 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) { } func TestUnmarshalExtJSONWithContext(t *testing.T) { - t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) { - type teststruct struct{ Foo int } - var got teststruct - data := []byte("{\"foo\":1}") - dc := bsoncodec.DecodeContext{Registry: DefaultRegistry} - err := UnmarshalExtJSONWithContext(dc, data, true, &got) - noerr(t, err) - want := teststruct{1} - if !cmp.Equal(got, want) { - t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want) - } - }) + for _, tc := range unmarshalingExtTestCases { + t.Run(tc.name, func(t *testing.T) { + got := reflect.New(tc.sType).Interface() + dc := bsoncodec.DecodeContext{Registry: DefaultRegistry} + err := UnmarshalExtJSONWithContext(dc, tc.data, true, got) + noerr(t, err) + if !cmp.Equal(got, tc.want) { + t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want) + } + }) + } } func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) { diff --git a/bson/unmarshaling_cases_test.go b/bson/unmarshaling_cases_test.go index 9a84fe6db0..13b40ecd5a 100644 --- a/bson/unmarshaling_cases_test.go +++ b/bson/unmarshaling_cases_test.go @@ -78,3 +78,94 @@ var unmarshalingTestCases = []unmarshalingTestCase{ docToBytes(D{{"fooBar", int32(10)}}), }, } + +var unmarshalingExtTestCases = []unmarshalingTestCase{ + { + name: "Small struct", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo int + }{}), + data: []byte(`{"foo":1}`), + want: &struct { + Foo int + }{Foo: 1}, + }, + { + name: "Valid surrogate pair", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"\uD834\uDd1e"}`), + want: &struct { + Foo string + }{Foo: "𝄞"}, + }, + { + name: "Valid surrogate pair with other values", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`), + want: &struct { + Foo string + }{Foo: "abc 𝄞 123"}, + }, + { + name: "High surrogate value with no following low surrogate value", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"abc \uD834 123"}`), + want: &struct { + Foo string + }{Foo: "abc � 123"}, + }, + { + name: "High surrogate value at end of string", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"\uD834"}`), + want: &struct { + Foo string + }{Foo: "�"}, + }, + { + name: "Low surrogate value with no preceeding high surrogate value", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"abc \uDd1e 123"}`), + want: &struct { + Foo string + }{Foo: "abc � 123"}, + }, + { + name: "Low surrogate value at end of string", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"\uDd1e"}`), + want: &struct { + Foo string + }{Foo: "�"}, + }, + { + name: "High surrogate value with non-surrogate unicode value", + reg: DefaultRegistry, + sType: reflect.TypeOf(struct { + Foo string + }{}), + data: []byte(`{"foo":"\uD834\u00BF"}`), + want: &struct { + Foo string + }{Foo: "�¿"}, + }, +}