From 2ab584e0fa5a8ae227ce23228599f888930531ef Mon Sep 17 00:00:00 2001
From: Masaaki Goshima <goccy54@gmail.com>
Date: Mon, 2 Dec 2024 11:30:11 +0900
Subject: [PATCH] support utf-16 surrogate pair (#564)

---
 decode_test.go      | 12 ++++++++
 lexer/lexer_test.go | 20 +++++++++++++
 scanner/scanner.go  | 70 ++++++++++++++++++++++++++++++++++++---------
 3 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/decode_test.go b/decode_test.go
index b95cc0d..3b39819 100644
--- a/decode_test.go
+++ b/decode_test.go
@@ -1117,6 +1117,18 @@ c:
 `,
 			[]string{"Fun with \\", "\" \u0007 \b \u001b \f", "\n \r \t \u000b \u0000", "\u0020 \u00a0 \u0085 \u2028 \u2029 A A A"},
 		},
+		{
+			`"\ud83e\udd23"`,
+			"🤣",
+		},
+		{
+			`"\uD83D\uDE00\uD83D\uDE01"`,
+			"😀😁",
+		},
+		{
+			`"\uD83D\uDE00a\uD83D\uDE01"`,
+			"😀a😁",
+		},
 	}
 	for _, test := range tests {
 		t.Run(test.source, func(t *testing.T) {
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
index 052c00a..8e94c2f 100644
--- a/lexer/lexer_test.go
+++ b/lexer/lexer_test.go
@@ -3233,6 +3233,26 @@ a: |
 	c
 `,
 		},
+		{
+			name: "invalid UTF-16 character",
+			src:  `"\u00"`,
+		},
+		{
+			name: "invalid UTF-16 surrogate pair length",
+			src:  `"\ud800"`,
+		},
+		{
+			name: "invalid UTF-16 low surrogate prefix",
+			src:  `"\ud800\v"`,
+		},
+		{
+			name: "invalid UTF-16 low surrogate",
+			src:  `"\ud800\u0000"`,
+		},
+		{
+			name: "invalid UTF-32 character",
+			src:  `"\U0000"`,
+		},
 	}
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
diff --git a/scanner/scanner.go b/scanner/scanner.go
index a379a8c..a82ee6b 100644
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -439,25 +439,67 @@ func (s *Scanner) scanDoubleQuote(ctx *Context) (*token.Token, error) {
 					value = append(value, rune(codeNum))
 				}
 			case 'u':
+				// A \uXXXX escape spans 6 characters from the backslash, so index idx+5 must exist.
 				if idx+5 >= size {
-					progress = 1
-					ctx.addOriginBuf(nextChar)
-					value = append(value, nextChar)
-				} else {
-					progress = 5
-					codeNum := hexRunesToInt(src[idx+2 : idx+progress+1])
-					value = append(value, rune(codeNum))
+					return nil, ErrInvalidToken(
+						token.Invalid(
+							"not enough length for escaped UTF-16 character",
+							string(ctx.obuf), s.pos(),
+						),
+					)
+				}
+				progress = 5
+				codeNum := hexRunesToInt(src[idx+2 : idx+6])
+
+				// handle surrogate pairs.
+				if codeNum >= 0xD800 && codeNum <= 0xDBFF {
+					high := codeNum
+
+					// A \uXXXX\uXXXX surrogate pair spans 12 characters from the backslash, so index idx+11 must exist.
+					if idx+11 >= size {
+						return nil, ErrInvalidToken(
+							token.Invalid(
+								"not enough length for escaped UTF-16 surrogate pair",
+								string(ctx.obuf), s.pos(),
+							),
+						)
+					}
+
+					if src[idx+6] != '\\' || src[idx+7] != 'u' {
+						return nil, ErrInvalidToken(
+							token.Invalid(
+								"found unexpected character after high surrogate for UTF-16 surrogate pair",
+								string(ctx.obuf), s.pos(),
+							),
+						)
+					}
+
+					low := hexRunesToInt(src[idx+8 : idx+12])
+					if low < 0xDC00 || low > 0xDFFF {
+						return nil, ErrInvalidToken(
+							token.Invalid(
+								"found unexpected low surrogate after high surrogate",
+								string(ctx.obuf), s.pos(),
+							),
+						)
+					}
+					codeNum = ((high - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000
+					progress += 6
 				}
+				value = append(value, rune(codeNum))
 			case 'U':
+				// A \UXXXXXXXX escape spans 10 characters from the backslash, so index idx+9 must exist.
 				if idx+9 >= size {
-					progress = 1
-					ctx.addOriginBuf(nextChar)
-					value = append(value, nextChar)
-				} else {
-					progress = 9
-					codeNum := hexRunesToInt(src[idx+2 : idx+progress+1])
-					value = append(value, rune(codeNum))
+					return nil, ErrInvalidToken(
+						token.Invalid(
+							"not enough length for escaped UTF-32 character",
+							string(ctx.obuf), s.pos(),
+						),
+					)
 				}
+				progress = 9
+				codeNum := hexRunesToInt(src[idx+2 : idx+10])
+				value = append(value, rune(codeNum))
 			case '\n':
 				isFirstLineChar = true
 				isNewLine = true