Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GODRIVER-1947 Unmarshal unicode surrogate pairs correctly in UnmarshalExtJSON. #649

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 93 additions & 6 deletions bson/bsonrw/json_scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ import (
"io"
"math"
"strconv"
"strings"
"unicode"
"unicode/utf16"
)

type jsonTokenType byte
Expand Down Expand Up @@ -162,6 +162,31 @@ func isValueTerminator(c byte) bool {
return c == ',' || c == '}' || c == ']' || isWhiteSpace(c)
}

// getu4 decodes the 4-byte hex sequence from the beginning of s, returning the hex value as a rune,
// or it returns -1. Note that the "\u" from the unicode escape sequence should not be present.
// It is copied and lightly modified from the Go JSON decode function at
// https://github.com/golang/go/blob/1b0a0316802b8048d69da49dc23c5a5ab08e8ae8/src/encoding/json/decode.go#L1169-L1188
func getu4(s []byte) rune {
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
if len(s) < 4 {
return -1
}
var r rune
for _, c := range s[:4] {
switch {
case '0' <= c && c <= '9':
c = c - '0'
case 'a' <= c && c <= 'f':
c = c - 'a' + 10
case 'A' <= c && c <= 'F':
c = c - 'A' + 10
default:
return -1
}
r = r*16 + rune(c)
}
return r
}

// scanString reads from an opening '"' to a closing '"' and handles escaped characters
func (js *jsonScanner) scanString() (*jsonToken, error) {
var b bytes.Buffer
Expand All @@ -179,9 +204,18 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
return nil, err
}

evalNextChar:
switch c {
case '\\':
c, err = js.readNextByte()
if err != nil {
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

evalNextEscapeChar:
switch c {
case '"', '\\', '/':
b.WriteByte(c)
Expand All @@ -202,13 +236,66 @@ func (js *jsonScanner) scanString() (*jsonToken, error) {
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
}

s := fmt.Sprintf(`\u%s`, us)
s, err = strconv.Unquote(strings.Replace(strconv.Quote(s), `\\u`, `\u`, 1))
if err != nil {
return nil, err
rn := getu4(us)

// If the rune we just decoded is the high or low value of a possible surrogate pair,
// try to decode the next sequence as the low value of a surrogate pair. We're
// expecting the next sequence to be another Unicode escape sequence (e.g. "\uDD1E"),
// but need to handle cases where the input is not a valid surrogate pair.
// For more context on unicode surrogate pairs, see:
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
// https://www.christianfscott.com/rust-chars-vs-go-runes/
// https://www.unicode.org/glossary/#high_surrogate_code_point
if utf16.IsSurrogate(rn) {
c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

// If the next value isn't the beginning of a backslash escape sequence, write the
// current rune and goto the beginning of the next char eval block.
if c != '\\' {
b.WriteRune(rn)
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
goto evalNextChar
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
}

c, err = js.readNextByte()
if err != nil {
if err == io.EOF {
return nil, errors.New("end of input in JSON string")
}
return nil, err
}

// If the next value isn't the beginning of a unicode escape sequence, write the
// current rune and goto the beginning of the next escape char eval block.
if c != 'u' {
b.WriteRune(rn)
goto evalNextEscapeChar
}

err = js.readNNextBytes(us, 4, 0)
if err != nil {
return nil, fmt.Errorf("invalid unicode sequence in JSON string: %s", us)
}

rn2 := getu4(us)

// Try to decode the pair of runes as a utf16 surrogate pair. If that fails, just
// write the original two runes to the buffer.
if rnPair := utf16.DecodeRune(rn, rn2); rnPair != unicode.ReplacementChar {
b.WriteRune(rnPair)
kevinAlbs marked this conversation as resolved.
Show resolved Hide resolved
} else {
b.WriteRune(rn)
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
b.WriteRune(rn2)
kevinAlbs marked this conversation as resolved.
Show resolved Hide resolved
}

break
}

b.WriteString(s)
b.WriteRune(rn)
default:
return nil, fmt.Errorf("invalid escape sequence in JSON string '\\%c'", c)
}
Expand Down
28 changes: 24 additions & 4 deletions bson/bsonrw/json_scanner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,26 @@ func TestJsonScannerValidInputs(t *testing.T) {
input: `"\"\\\/\b\f\n\r\t"`,
tokens: []jsonToken{{t: jttString, v: "\"\\/\b\f\n\r\t"}},
},
{
desc: "valid string--surrogate pair",
input: `"abc \uD834\uDd1e 123"`,
tokens: []jsonToken{{t: jttString, v: "abc 𝄞 123"}},
},
{
desc: "valid string--high surrogate at end of string",
input: `"abc \uD834"`,
tokens: []jsonToken{{t: jttString, v: "abc �"}},
},
{
desc: "valid string--low surrogate at end of string",
input: `"abc \uDD1E"`,
tokens: []jsonToken{{t: jttString, v: "abc �"}},
},
{
desc: "valid string--high surrogate with non-surrogate unicode value",
input: `"abc \uDD1E\u00BF"`,
tokens: []jsonToken{{t: jttString, v: "abc �¿"}},
},
kevinAlbs marked this conversation as resolved.
Show resolved Hide resolved
{
desc: "valid literal--true", input: "true",
tokens: []jsonToken{{t: jttBool, v: true}},
Expand Down Expand Up @@ -280,28 +300,28 @@ func TestJsonScannerValidInputs(t *testing.T) {

for _, token := range tc.tokens {
c, err := js.nextToken()
expectNoError(t, err, tc.desc)
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
jttDiff(t, token.t, c.t, tc.desc)
jtvDiff(t, token.v, c.v, tc.desc)
expectNoError(t, err, tc.desc)
}

c, err := js.nextToken()
jttDiff(t, jttEOF, c.t, tc.desc)
noerr(t, err)
jttDiff(t, jttEOF, c.t, tc.desc)

// testing early EOF reading
js = &jsonScanner{r: iotest.DataErrReader(strings.NewReader(tc.input))}

for _, token := range tc.tokens {
c, err := js.nextToken()
expectNoError(t, err, tc.desc)
jttDiff(t, token.t, c.t, tc.desc)
jtvDiff(t, token.v, c.v, tc.desc)
expectNoError(t, err, tc.desc)
}

c, err = js.nextToken()
jttDiff(t, jttEOF, c.t, tc.desc)
noerr(t, err)
jttDiff(t, jttEOF, c.t, tc.desc)
})
}
}
Expand Down
111 changes: 99 additions & 12 deletions bson/unmarshal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,105 @@ func TestUnmarshalExtJSONWithRegistry(t *testing.T) {
}

func TestUnmarshalExtJSONWithContext(t *testing.T) {
t.Run("UnmarshalExtJSONWithContext", func(t *testing.T) {
type teststruct struct{ Foo int }
var got teststruct
data := []byte("{\"foo\":1}")
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
err := UnmarshalExtJSONWithContext(dc, data, true, &got)
noerr(t, err)
want := teststruct{1}
if !cmp.Equal(got, want) {
t.Errorf("Did not unmarshal as expected. got %v; want %v", got, want)
}
})
var cases = []struct {
name string
sType reflect.Type
want interface{}
data []byte
}{
{
name: "Small struct",
sType: reflect.TypeOf(struct {
Foo int
}{}),
data: []byte(`{"foo":1}`),
want: &struct {
Foo int
}{Foo: 1},
},
{
name: "Valid surrogate pair",
sType: reflect.TypeOf(struct {
matthewdale marked this conversation as resolved.
Show resolved Hide resolved
Foo string
}{}),
data: []byte(`{"foo":"\uD834\uDd1e"}`),
want: &struct {
Foo string
}{Foo: "𝄞"},
},
{
name: "Valid surrogate pair with other values",
sType: reflect.TypeOf(struct {
Foo string
}{}),
data: []byte(`{"foo":"abc \uD834\uDd1e 123"}`),
want: &struct {
Foo string
}{Foo: "abc 𝄞 123"},
},
{
name: "High surrogate value with no following low surrogate value",
sType: reflect.TypeOf(struct {
Foo string
}{}),
data: []byte(`{"foo":"abc \uD834 123"}`),
want: &struct {
Foo string
}{Foo: "abc � 123"},
},
{
name: "High surrogate value at end of string",
sType: reflect.TypeOf(struct {
Foo string
}{}),
data: []byte(`{"foo":"\uD834"}`),
want: &struct {
Foo string
}{Foo: "�"},
},
{
name: "Low surrogate value with no preceeding high surrogate value",
sType: reflect.TypeOf(struct {
Foo string
}{}),
data: []byte(`{"foo":"abc \uDd1e 123"}`),
want: &struct {
Foo string
}{Foo: "abc � 123"},
},
{
name: "Low surrogate value at end of string",
sType: reflect.TypeOf(struct {
Foo string
}{}),
data: []byte(`{"foo":"\uDd1e"}`),
want: &struct {
Foo string
}{Foo: "�"},
},
{
name: "High surrogate value with non-surrogate unicode value",
sType: reflect.TypeOf(struct {
Foo string
}{}),
data: []byte(`{"foo":"\uD834\u00BF"}`),
want: &struct {
Foo string
}{Foo: "�¿"},
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
got := reflect.New(tc.sType).Interface()
dc := bsoncodec.DecodeContext{Registry: DefaultRegistry}
err := UnmarshalExtJSONWithContext(dc, tc.data, true, got)
noerr(t, err)
if !cmp.Equal(got, tc.want) {
t.Errorf("Did not unmarshal as expected. got %+v; want %+v", got, tc.want)
}
})
}
}

func TestCachingDecodersNotSharedAcrossRegistries(t *testing.T) {
Expand Down