From 3a4ead8c1c6ab68f1b4b51712c406a8cef16ec1b Mon Sep 17 00:00:00 2001 From: Matt Dale <9760375+matthewdale@users.noreply.github.com> Date: Mon, 3 May 2021 13:05:40 -0700 Subject: [PATCH] GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. (#649) * GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. * Correct err handling in jsonScanner.ScanString and remove unused field in UnmarshalExtJSON test. * Explicitly write unicode.ReplacementChar for invalid surrogate and simplify test types. * Add tests for high surrogate followed by non-Unicode escape sequence and 4-byte UTF-8 extJSON marshaling. * Explicitly write the Unicode replacement character for an un-paired Unicode surrogate value. --- bson/bsonrw/json_scanner.go | 101 +++++++++++++++++++++++++++++-- bson/bsonrw/json_scanner_test.go | 33 ++++++++-- bson/unmarshal_test.go | 87 ++++++++++++++++++++++---- data/bson-corpus/string.json | 5 ++ 4 files changed, 204 insertions(+), 22 deletions(-) diff --git a/bson/bsonrw/json_scanner.go b/bson/bsonrw/json_scanner.go index 212f348348..cd4843a3a4 100644 --- a/bson/bsonrw/json_scanner.go +++ b/bson/bsonrw/json_scanner.go @@ -13,8 +13,8 @@ import ( "io" "math" "strconv" - "strings" "unicode" + "unicode/utf16" ) type jsonTokenType byte @@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool { return c == ',' || c == '}' || c == ']' || isWhiteSpace(c) } +// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune, +// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present. +// It is copied and lightly modified from the Go JSON decode function at +// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188 +func getu4(s []byte) rune { + if len(s) < 4 { + return -1 + } + var r rune + for _, c := range s[:4] { + switch { + case '0' <= c && c <= '9': + c = c - '0' + case 'a' <= c && c <= 'f': + c = c - 'a' + 10 + case 'A' <= c && c <= 'F': + c = c - 'A' + 10 + default: + return -1 + } + r = r*16 + rune(c) + } + return r +} + // scanString reads from an opening '"' to a closing '"' and handles escaped characters func (js *jsonScanner) scanString() (*jsonToken, error) { var b bytes.Buffer @@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) { return nil, err } + evalNextChar: switch c { case '\\': c, err = js.readNextByte() + if err != nil { + if err == io.EOF { + return nil, errors.New("end of input in JSON string") + } + return nil, err + } + + evalNextEscapeChar: switch c { case '"', '\\', '/': b.WriteByte(c) @@ -202,13 +236,68 @@ func (js *jsonScanner) scanString() (*jsonToken, error) { return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us) } - s := fmt.Sprintf(`\u%s`, us) - s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1)) - if err != nil { - return nil, err + rn := getu4(us) + + // If the rune we just decoded is the high or low value of a possible surrogate pair, + // try to decode the next sequence as the low value of a surrogate pair. We're + // expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"), + // but need to handle cases where the input is not a valid surrogate pair. + // For more context on unicode surrogate pairs, see: + // https://www.christianfscott.com/rust-chars-vs-go-runes/ + // https://www.unicode.org/glossary/#high_surrogate_code_point + if utf16.IsSurrogate(rn) { + c, err = js.readNextByte() + if err != nil { + if err == io.EOF { + return nil, errors.New("end of input in JSON string") + } + return nil, err + } + + // If the next value isn't the beginning of a backslash escape sequence, write + // the Unicode replacement character for the surrogate value and goto the + // beginning of the next char eval block. + if c != '\\' { + b.WriteRune(unicode.ReplacementChar) + goto evalNextChar + } + + c, err = js.readNextByte() + if err != nil { + if err == io.EOF { + return nil, errors.New("end of input in JSON string") + } + return nil, err + } + + // If the next value isn't the beginning of a unicode escape sequence, write the + // Unicode replacement character for the surrogate value and goto the beginning + // of the next escape char eval block. + if c != 'u' { + b.WriteRune(unicode.ReplacementChar) + goto evalNextEscapeChar + } + + err = js.readNNextBytes(us, 4, 0) + if err != nil { + return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us) + } + + rn2 := getu4(us) + + // Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write + // the Unicode replacement character for the surrogate value and the 2nd decoded rune. + if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar { + b.WriteRune(rnPair) + } else { + b.WriteRune(unicode.ReplacementChar) + b.WriteRune(rn2) + } + + break } - b.WriteString(s) + b.WriteRune(rn) default: return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c) } diff --git a/bson/bsonrw/json_scanner_test.go b/bson/bsonrw/json_scanner_test.go index c0795ed2c0..abbf7d3fe5 100644 --- a/bson/bsonrw/json_scanner_test.go +++ b/bson/bsonrw/json_scanner_test.go @@ -100,6 +100,31 @@ func TestJsonScannerValidInputs(t *testing.T) { input: `"\"\\\/\b\f\n\r\t"`, tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}}, }, + { + desc: "valid string--surrogate pair", + input: `"abc \uD834\uDd1e 123"`, + tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}}, + }, + { + desc: "valid string--high surrogate at end of string", + input: `"abc \uD834"`, + tokens: []jsonToken{{t: jttString, v: "abc �"}}, + }, + { + desc: "valid string--low surrogate at end of string", + input: `"abc \uDD1E"`, + tokens: []jsonToken{{t: jttString, v: "abc �"}}, + }, + { + desc: "valid string--high surrogate with non-surrogate Unicode value", + input: `"abc \uDD1E\u00BF"`, + tokens: []jsonToken{{t: jttString, v: "abc �¿"}}, + }, + { + desc: "valid string--high surrogate with non-Unicode escape sequence", + input: `"abc \uDD1E\t"`, + tokens: []jsonToken{{t: jttString, v: "abc �\t"}}, + }, { desc: "valid literal--true", input: "true", tokens: []jsonToken{{t: jttBool, v: true}}, @@ -280,28 +305,28 @@ func TestJsonScannerValidInputs(t *testing.T) { for _, token := range tc.tokens { c, err := js.nextToken() + expectNoError(t, err, tc.desc) jttDiff(t, token.t, c.t, tc.desc) jtvDiff(t, token.v, c.v, tc.desc) - expectNoError(t, err, tc.desc) } c, err := js.nextToken() - jttDiff(t, jttEOF, c.t, tc.desc) noerr(t, err) + jttDiff(t, jttEOF, c.t, tc.desc) // testing early EOF reading js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))} for _, token := range tc.tokens { c, err := js.nextToken() + expectNoError(t, err, tc.desc) jttDiff(t, token.t, c.t, tc.desc) jtvDiff(t, token.v, c.v, tc.desc) - expectNoError(t, err, tc.desc) } c, err = js.nextToken() - jttDiff(t, jttEOF, c.t, tc.desc) noerr(t, err) + jttDiff(t, jttEOF, c.t, tc.desc) }) } } diff --git a/bson/unmarshal_test.go b/bson/unmarshal_test.go index f17e04b183..78b597ada7 100644 --- a/bson/unmarshal_test.go +++ b/bson/unmarshal_test.go @@ -95,18 +95,81 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) { } func TestUnmarshalExtJSONWithContext(t *testing.T) { - t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) { - type teststruct struct{ Foo int } - var got teststruct - data := []byte("{\"foo\":1}") - dc := bsoncodec.DecodeContext{Registry: DefaultRegistry} - err := UnmarshalExtJSONWithContext(dc, data, true, &got) - noerr(t, err) - want := teststruct{1} - if !cmp.Equal(got, want) { - t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want) - } - }) + type fooInt struct { + Foo int + } + + type fooString struct { + Foo string + } + + var cases = []struct { + name string + sType reflect.Type + want interface{} + data []byte + }{ + { + name: "Small struct", + sType: reflect.TypeOf(fooInt{}), + data: []byte(`{"foo":1}`), + want: &fooInt{Foo: 1}, + }, + { + name: "Valid surrogate pair", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"\uD834\uDd1e"}`), + want: &fooString{Foo: "𝄞"}, + }, + { + name: "Valid surrogate pair with other values", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`), + want: &fooString{Foo: "abc 𝄞 123"}, + }, + { + name: "High surrogate value with no following low surrogate value", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"abc \uD834 123"}`), + want: &fooString{Foo: "abc � 123"}, + }, + { + name: "High surrogate value at end of string", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"\uD834"}`), + want: &fooString{Foo: "�"}, + }, + { + name: "Low surrogate value with no preceeding high surrogate value", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"abc \uDd1e 123"}`), + want: &fooString{Foo: "abc � 123"}, + }, + { + name: "Low surrogate value at end of string", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"\uDd1e"}`), + want: &fooString{Foo: "�"}, + }, + { + name: "High surrogate value with non-surrogate unicode value", + sType: reflect.TypeOf(fooString{}), + data: []byte(`{"foo":"\uD834\u00BF"}`), + want: &fooString{Foo: "�¿"}, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := reflect.New(tc.sType).Interface() + dc := bsoncodec.DecodeContext{Registry: DefaultRegistry} + err := UnmarshalExtJSONWithContext(dc, tc.data, true, got) + noerr(t, err) + if !cmp.Equal(got, tc.want) { + t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want) + } + }) + } } func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) { diff --git a/data/bson-corpus/string.json b/data/bson-corpus/string.json index 148334d091..4964ddcdf5 100644 --- a/data/bson-corpus/string.json +++ b/data/bson-corpus/string.json @@ -28,6 +28,11 @@ "canonical_bson": "190000000261000D000000E29886E29886E29886E298860000", "canonical_extjson": "{\"a\" : \"\\u2606\\u2606\\u2606\\u2606\"}" }, + { + "description": "four-byte UTF-8 (𝄞)", + "canonical_bson": "1100000002610005000000F09D849E0000", + "canonical_extjson": "{\"a\" : \"𝄞\"}" + }, { "description": "Embedded nulls", "canonical_bson": "190000000261000D0000006162006261620062616261620000",