Skip to content

Commit

Permalink
GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in Unmarsha…
Browse files Browse the repository at this point in the history
…lExtJSON. (#649)

* GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON.

* Correct err handling in jsonScanner.ScanString and remove unused field in UnmarshalExtJSON test.

* Explicitly write unicode.ReplacementChar for invalid surrogate and simplify test types.

* Add tests for high surrogate followed by non-Unicode escape sequence and 4-byte UTF-8 extJSON marshaling.

* Explicitly write the Unicode replacement character for an un-paired Unicode surrogate value.
  • Loading branch information
matthewdale committed May 4, 2021
1 parent a69cc5e commit 4b2077a
Show file tree
Hide file tree
Showing 4 changed files with 204 additions and 22 deletions.
101 changes: 95 additions & 6 deletions bson/bsonrw/json_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ import (
"io"
"math"
"strconv"
"strings"
"unicode"
"unicode/utf16"
)

type jsonTokenType byte
Expand Down Expand Up @@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool {
return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
}

// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
// It is copied and lightly modified from the Go JSON decode function at
// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
func getu4(s []byte) rune {
if len(s) < 4 {
return -1
}
var r rune
for _, c := range s[:4] {
switch {
case '0' <= c && c <= '9':
c = c - '0'
case 'a' <= c && c <= 'f':
c = c - 'a' + 10
case 'A' <= c && c <= 'F':
c = c - 'A' + 10
default:
return -1
}
r = r*16 + rune(c)
}
return r
}

// scanString reads from an opening '"' to a closing '"' and handles escaped characters
func (js *jsonScanner) scanString() (*jsonToken, error) {
var b bytes.Buffer
Expand All @@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
return nil, err
}

evalNextChar:
switch c {
case '\\':
c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

evalNextEscapeChar:
switch c {
case '"', '\\', '/':
b.WriteByte(c)
Expand All @@ -202,13 +236,68 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
}

s := fmt.Sprintf(`\u%s`, us)
s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1))
if err != nil {
return nil, err
rn := getu4(us)

// If the rune we just decoded is the high or low value of a possible surrogate pair,
// try to decode the next sequence as the low value of a surrogate pair. We're
// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
// but need to handle cases where the input is not a valid surrogate pair.
// For more context on unicode surrogate pairs, see:
// https://www.christianfscott.com/rust-chars-vs-go-runes/
// https://www.unicode.org/glossary/#high_surrogate_code_point
if utf16.IsSurrogate(rn) {
c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

// If the next value isn't the beginning of a backslash escape sequence, write
// the Unicode replacement character for the surrogate value and goto the
// beginning of the next char eval block.
if c != '\\' {
b.WriteRune(unicode.ReplacementChar)
goto evalNextChar
}

c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

// If the next value isn't the beginning of a unicode escape sequence, write the
// Unicode replacement character for the surrogate value and goto the beginning
// of the next escape char eval block.
if c != 'u' {
b.WriteRune(unicode.ReplacementChar)
goto evalNextEscapeChar
}

err = js.readNNextBytes(us, 4, 0)
if err != nil {
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
}

rn2 := getu4(us)

// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write
// the Unicode replacement character for the surrogate value and the 2nd decoded rune.
if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
b.WriteRune(rnPair)
} else {
b.WriteRune(unicode.ReplacementChar)
b.WriteRune(rn2)
}

break
}

b.WriteString(s)
b.WriteRune(rn)
default:
return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
}
Expand Down
33 changes: 29 additions & 4 deletions bson/bsonrw/json_scanner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,31 @@ func TestJsonScannerValidInputs(t *testing.T) {
input: `"\"\\\/\b\f\n\r\t"`,
tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}},
},
{
desc: "valid string--surrogate pair",
input: `"abc \uD834\uDd1e 123"`,
tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}},
},
{
desc: "valid string--high surrogate at end of string",
input: `"abc \uD834"`,
tokens: []jsonToken{{t: jttString, v: "abc �"}},
},
{
desc: "valid string--low surrogate at end of string",
input: `"abc \uDD1E"`,
tokens: []jsonToken{{t: jttString, v: "abc �"}},
},
{
desc: "valid string--high surrogate with non-surrogate Unicode value",
input: `"abc \uDD1E\u00BF"`,
tokens: []jsonToken{{t: jttString, v: "abc �¿"}},
},
{
desc: "valid string--high surrogate with non-Unicode escape sequence",
input: `"abc \uDD1E\t"`,
tokens: []jsonToken{{t: jttString, v: "abc �\t"}},
},
{
desc: "valid literal--true", input: "true",
tokens: []jsonToken{{t: jttBool, v: true}},
Expand Down Expand Up @@ -280,28 +305,28 @@ func TestJsonScannerValidInputs(t *testing.T) {

for _, token := range tc.tokens {
c, err := js.nextToken()
expectNoError(t, err, tc.desc)
jttDiff(t, token.t, c.t, tc.desc)
jtvDiff(t, token.v, c.v, tc.desc)
expectNoError(t, err, tc.desc)
}

c, err := js.nextToken()
jttDiff(t, jttEOF, c.t, tc.desc)
noerr(t, err)
jttDiff(t, jttEOF, c.t, tc.desc)

// testing early EOF reading
js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))}

for _, token := range tc.tokens {
c, err := js.nextToken()
expectNoError(t, err, tc.desc)
jttDiff(t, token.t, c.t, tc.desc)
jtvDiff(t, token.v, c.v, tc.desc)
expectNoError(t, err, tc.desc)
}

c, err = js.nextToken()
jttDiff(t, jttEOF, c.t, tc.desc)
noerr(t, err)
jttDiff(t, jttEOF, c.t, tc.desc)
})
}
}
Expand Down
87 changes: 75 additions & 12 deletions bson/unmarshal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,81 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) {
}

func TestUnmarshalExtJSONWithContext(t *testing.T) {
t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) {
type teststruct struct{ Foo int }
var got teststruct
data := []byte("{\"foo\":1}")
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
err := UnmarshalExtJSONWithContext(dc, data, true, &got)
noerr(t, err)
want := teststruct{1}
if !cmp.Equal(got, want) {
t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want)
}
})
type fooInt struct {
Foo int
}

type fooString struct {
Foo string
}

var cases = []struct {
name string
sType reflect.Type
want interface{}
data []byte
}{
{
name: "Small struct",
sType: reflect.TypeOf(fooInt{}),
data: []byte(`{"foo":1}`),
want: &fooInt{Foo: 1},
},
{
name: "Valid surrogate pair",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uD834\uDd1e"}`),
want: &fooString{Foo: "𝄞"},
},
{
name: "Valid surrogate pair with other values",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`),
want: &fooString{Foo: "abc 𝄞 123"},
},
{
name: "High surrogate value with no following low surrogate value",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"abc \uD834 123"}`),
want: &fooString{Foo: "abc � 123"},
},
{
name: "High surrogate value at end of string",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uD834"}`),
want: &fooString{Foo: "�"},
},
{
name: "Low surrogate value with no preceeding high surrogate value",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"abc \uDd1e 123"}`),
want: &fooString{Foo: "abc � 123"},
},
{
name: "Low surrogate value at end of string",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uDd1e"}`),
want: &fooString{Foo: "�"},
},
{
name: "High surrogate value with non-surrogate unicode value",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uD834\u00BF"}`),
want: &fooString{Foo: "�¿"},
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := reflect.New(tc.sType).Interface()
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
err := UnmarshalExtJSONWithContext(dc, tc.data, true, got)
noerr(t, err)
if !cmp.Equal(got, tc.want) {
t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want)
}
})
}
}

func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) {
Expand Down
5 changes: 5 additions & 0 deletions data/bson-corpus/string.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
"canonical_bson": "190000000261000D000000E29886E29886E29886E298860000",
"canonical_extjson": "{\"a\" : \"\\u2606\\u2606\\u2606\\u2606\"}"
},
{
"description": "four-byte UTF-8 (𝄞)",
"canonical_bson": "1100000002610005000000F09D849E0000",
"canonical_extjson": "{\"a\" : \"𝄞\"}"
},
{
"description": "Embedded nulls",
"canonical_bson": "190000000261000D0000006162006261620062616261620000",
Expand Down

0 comments on commit 4b2077a

Please sign in to comment.