From 2ab584e0fa5a8ae227ce23228599f888930531ef Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Mon, 2 Dec 2024 11:30:11 +0900 Subject: [PATCH] support utf-16 surrogate pair (#564) --- decode_test.go | 12 ++++++++ lexer/lexer_test.go | 20 +++++++++++++ scanner/scanner.go | 70 ++++++++++++++++++++++++++++++++++++--------- 3 files changed, 88 insertions(+), 14 deletions(-) diff --git a/decode_test.go b/decode_test.go index b95cc0d..3b39819 100644 --- a/decode_test.go +++ b/decode_test.go @@ -1117,6 +1117,18 @@ c: `, []string{"Fun with \\", "\" \u0007 \b \u001b \f", "\n \r \t \u000b \u0000", "\u0020 \u00a0 \u0085 \u2028 \u2029 A A A"}, }, + { + `"\ud83e\udd23"`, + "🤣", + }, + { + `"\uD83D\uDE00\uD83D\uDE01"`, + "😀😁", + }, + { + `"\uD83D\uDE00a\uD83D\uDE01"`, + "😀a😁", + }, } for _, test := range tests { t.Run(test.source, func(t *testing.T) { diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index 052c00a..8e94c2f 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -3233,6 +3233,26 @@ a: | c `, }, + { + name: "invalid UTF-16 character", + src: `"\u00"`, + }, + { + name: "invalid UTF-16 surrogate pair length", + src: `"\ud800"`, + }, + { + name: "invalid UTF-16 low surrogate prefix", + src: `"\ud800\v"`, + }, + { + name: "invalid UTF-16 low surrogate", + src: `"\ud800\u0000"`, + }, + { + name: "invalid UTF-32 character", + src: `"\U0000"`, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { diff --git a/scanner/scanner.go b/scanner/scanner.go index a379a8c..a82ee6b 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -439,25 +439,67 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (*token.Token, error) { value = append(value, rune(codeNum)) } case 'u': + // \u0000 style must have 5 characters at least. if idx+5 >= size { - progress = 1 - ctx.addOriginBuf(nextChar) - value = append(value, nextChar) - } else { - progress = 5 - codeNum := hexRunesToInt(src[idx+2 : idx+progress+1]) - value = append(value, rune(codeNum)) + return nil, ErrInvalidToken( + token.Invalid( + "not enough length for escaped UTF-16 character", + string(ctx.obuf), s.pos(), + ), + ) + } + progress = 5 + codeNum := hexRunesToInt(src[idx+2 : idx+6]) + + // handle surrogate pairs. + if codeNum >= 0xD800 && codeNum <= 0xDBFF { + high := codeNum + + // \u0000\u0000 style must have 11 characters at least. + if idx+11 >= size { + return nil, ErrInvalidToken( + token.Invalid( + "not enough length for escaped UTF-16 surrogate pair", + string(ctx.obuf), s.pos(), + ), + ) + } + + if src[idx+6] != '\\' || src[idx+7] != 'u' { + return nil, ErrInvalidToken( + token.Invalid( + "found unexpected character after high surrogate for UTF-16 surrogate pair", + string(ctx.obuf), s.pos(), + ), + ) + } + + low := hexRunesToInt(src[idx+8 : idx+12]) + if low < 0xDC00 || low > 0xDFFF { + return nil, ErrInvalidToken( + token.Invalid( + "found unexpected low surrogate after high surrogate", + string(ctx.obuf), s.pos(), + ), + ) + } + codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000 + progress += 6 } + value = append(value, rune(codeNum)) case 'U': + // \U00000000 style must have 9 characters at least. if idx+9 >= size { - progress = 1 - ctx.addOriginBuf(nextChar) - value = append(value, nextChar) - } else { - progress = 9 - codeNum := hexRunesToInt(src[idx+2 : idx+progress+1]) - value = append(value, rune(codeNum)) + return nil, ErrInvalidToken( + token.Invalid( + "not enough length for escaped UTF-32 character", + string(ctx.obuf), s.pos(), + ), + ) } + progress = 9 + codeNum := hexRunesToInt(src[idx+2 : idx+10]) + value = append(value, rune(codeNum)) case '\n': isFirstLineChar = true isNewLine = true