Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. #649

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 95 additions & 6 deletions bson/bsonrw/json_scanner.go
Original file line number Diff line number Diff line change
@@ -13,8 +13,8 @@ import (
"io"
"math"
"strconv"
"strings"
"unicode"
"unicode/utf16"
)

type jsonTokenType byte
@@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool {
return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
}

// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
// It is copied and lightly modified from the Go JSON decode function at
// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
func getu4(s []byte) rune {
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
if len(s) < 4 {
return -1
}
var r rune
for _, c := range s[:4] {
switch {
case '0' <= c && c <= '9':
c = c - '0'
case 'a' <= c && c <= 'f':
c = c - 'a' + 10
case 'A' <= c && c <= 'F':
c = c - 'A' + 10
default:
return -1
}
r = r*16 + rune(c)
}
return r
}

// scanString reads from an opening '"' to a closing '"' and handles escaped characters
func (js *jsonScanner) scanString() (*jsonToken, error) {
var b bytes.Buffer
@@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
return nil, err
}

evalNextChar:
switch c {
case '\\':
c, err = js.readNextByte()
if err != nil {
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

evalNextEscapeChar:
switch c {
case '"', '\\', '/':
b.WriteByte(c)
@@ -202,13 +236,68 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
}

s := fmt.Sprintf(`\u%s`, us)
s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1))
if err != nil {
return nil, err
rn := getu4(us)

// If the rune we just decoded is the high or low value of a possible surrogate pair,
// try to decode the next sequence as the low value of a surrogate pair. We're
// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
// but need to handle cases where the input is not a valid surrogate pair.
// For more context on unicode surrogate pairs, see:
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
// https://www.christianfscott.com/rust-chars-vs-go-runes/
// https://www.unicode.org/glossary/#high_surrogate_code_point
if utf16.IsSurrogate(rn) {
c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

// If the next value isn't the beginning of a backslash escape sequence, write
// the Unicode replacement character for the surrogate value and goto the
// beginning of the next char eval block.
if c != '\\' {
b.WriteRune(unicode.ReplacementChar)
goto evalNextChar
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
}

c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

// If the next value isn't the beginning of a unicode escape sequence, write the
// Unicode replacement character for the surrogate value and goto the beginning
// of the next escape char eval block.
if c != 'u' {
b.WriteRune(unicode.ReplacementChar)
goto evalNextEscapeChar
}

err = js.readNNextBytes(us, 4, 0)
if err != nil {
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
}

rn2 := getu4(us)

// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, write
// the Unicode replacement character for the surrogate value and the 2nd decoded rune.
if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
b.WriteRune(rnPair)
kevinAlbs marked this conversation as resolved.
Show resolved Hide resolved
} else {
b.WriteRune(unicode.ReplacementChar)
b.WriteRune(rn2)
kevinAlbs marked this conversation as resolved.
Show resolved Hide resolved
}

break
}

b.WriteString(s)
b.WriteRune(rn)
default:
return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
}
33 changes: 29 additions & 4 deletions bson/bsonrw/json_scanner_test.go
Original file line number Diff line number Diff line change
@@ -100,6 +100,31 @@ func TestJsonScannerValidInputs(t *testing.T) {
input: `"\"\\\/\b\f\n\r\t"`,
tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}},
},
{
desc: "valid string--surrogate pair",
input: `"abc \uD834\uDd1e 123"`,
tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}},
},
{
desc: "valid string--high surrogate at end of string",
input: `"abc \uD834"`,
tokens: []jsonToken{{t: jttString, v: "abc �"}},
},
{
desc: "valid string--low surrogate at end of string",
input: `"abc \uDD1E"`,
tokens: []jsonToken{{t: jttString, v: "abc �"}},
},
{
desc: "valid string--high surrogate with non-surrogate Unicode value",
input: `"abc \uDD1E\u00BF"`,
tokens: []jsonToken{{t: jttString, v: "abc �¿"}},
},
kevinAlbs marked this conversation as resolved.
Show resolved Hide resolved
{
desc: "valid string--high surrogate with non-Unicode escape sequence",
input: `"abc \uDD1E\t"`,
tokens: []jsonToken{{t: jttString, v: "abc �\t"}},
},
{
desc: "valid literal--true", input: "true",
tokens: []jsonToken{{t: jttBool, v: true}},
@@ -280,28 +305,28 @@ func TestJsonScannerValidInputs(t *testing.T) {

for _, token := range tc.tokens {
c, err := js.nextToken()
expectNoError(t, err, tc.desc)
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
jttDiff(t, token.t, c.t, tc.desc)
jtvDiff(t, token.v, c.v, tc.desc)
expectNoError(t, err, tc.desc)
}

c, err := js.nextToken()
jttDiff(t, jttEOF, c.t, tc.desc)
noerr(t, err)
jttDiff(t, jttEOF, c.t, tc.desc)

// testing early EOF reading
js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))}

for _, token := range tc.tokens {
c, err := js.nextToken()
expectNoError(t, err, tc.desc)
jttDiff(t, token.t, c.t, tc.desc)
jtvDiff(t, token.v, c.v, tc.desc)
expectNoError(t, err, tc.desc)
}

c, err = js.nextToken()
jttDiff(t, jttEOF, c.t, tc.desc)
noerr(t, err)
jttDiff(t, jttEOF, c.t, tc.desc)
})
}
}
87 changes: 75 additions & 12 deletions bson/unmarshal_test.go
Original file line number Diff line number Diff line change
@@ -95,18 +95,81 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) {
}

func TestUnmarshalExtJSONWithContext(t *testing.T) {
t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) {
type teststruct struct{ Foo int }
var got teststruct
data := []byte("{\"foo\":1}")
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
err := UnmarshalExtJSONWithContext(dc, data, true, &got)
noerr(t, err)
want := teststruct{1}
if !cmp.Equal(got, want) {
t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want)
}
})
type fooInt struct {
Foo int
}

type fooString struct {
Foo string
}

var cases = []struct {
name string
sType reflect.Type
want interface{}
data []byte
}{
{
name: "Small struct",
sType: reflect.TypeOf(fooInt{}),
data: []byte(`{"foo":1}`),
want: &fooInt{Foo: 1},
},
{
name: "Valid surrogate pair",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uD834\uDd1e"}`),
want: &fooString{Foo: "𝄞"},
},
{
name: "Valid surrogate pair with other values",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`),
want: &fooString{Foo: "abc 𝄞 123"},
},
{
name: "High surrogate value with no following low surrogate value",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"abc \uD834 123"}`),
want: &fooString{Foo: "abc � 123"},
},
{
name: "High surrogate value at end of string",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uD834"}`),
want: &fooString{Foo: "�"},
},
{
name: "Low surrogate value with no preceeding high surrogate value",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"abc \uDd1e 123"}`),
want: &fooString{Foo: "abc � 123"},
},
{
name: "Low surrogate value at end of string",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uDd1e"}`),
want: &fooString{Foo: "�"},
},
{
name: "High surrogate value with non-surrogate unicode value",
sType: reflect.TypeOf(fooString{}),
data: []byte(`{"foo":"\uD834\u00BF"}`),
want: &fooString{Foo: "�¿"},
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := reflect.New(tc.sType).Interface()
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
err := UnmarshalExtJSONWithContext(dc, tc.data, true, got)
noerr(t, err)
if !cmp.Equal(got, tc.want) {
t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want)
}
})
}
}

func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) {
5 changes: 5 additions & 0 deletions data/bson-corpus/string.json
Original file line number Diff line number Diff line change
@@ -28,6 +28,11 @@
"canonical_bson": "190000000261000D000000E29886E29886E29886E298860000",
"canonical_extjson": "{\"a\" : \"\\u2606\\u2606\\u2606\\u2606\"}"
},
{
"description": "four-byte UTF-8 (𝄞)",
"canonical_bson": "1100000002610005000000F09D849E0000",
"canonical_extjson": "{\"a\" : \"𝄞\"}"
},
{
"description": "Embedded nulls",
"canonical_bson": "190000000261000D0000006162006261620062616261620000",