From 4de1898f96e1ce3839e360bb332134896534da6d Mon Sep 17 00:00:00 2001
From: Zhengda Lu <zhengda.lu@datadoghq.com>
Date: Thu, 19 Oct 2023 17:58:07 -0400
Subject: [PATCH] scan unicode identifier

---
 sqllexer.go      | 20 +++++++++-------
 sqllexer_test.go | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/sqllexer.go b/sqllexer.go
index 2374f80..622a8d4 100644
--- a/sqllexer.go
+++ b/sqllexer.go
@@ -1,5 +1,7 @@
 package sqllexer
 
+import "unicode/utf8"
+
 type TokenType int
 
 const (
@@ -98,7 +100,7 @@ func (s *Lexer) Scan() Token {
 	case isWhitespace(ch):
 		return s.scanWhitespace()
 	case isLetter(ch):
-		return s.scanIdentifier()
+		return s.scanIdentifier(ch)
 	case isDoubleQuote(ch):
 		return s.scanDoubleQuotedIdentifier('"')
 	case isSingleQuote(ch):
@@ -125,7 +127,7 @@ func (s *Lexer) Scan() Token {
 			return s.scanPositionalParameter()
 		}
 		if s.config.DBMS == DBMSSQLServer && isLetter(s.lookAhead(1)) {
-			return s.scanIdentifier()
+			return s.scanIdentifier(ch)
 		}
 		return s.scanDollarQuotedString()
 	case ch == ':':
@@ -157,7 +159,8 @@ func (s *Lexer) lookAhead(n int) rune {
 	if s.cursor+n >= len(s.src) || s.cursor+n < 0 {
 		return 0
 	}
-	return rune(s.src[s.cursor+n])
+	r, _ := utf8.DecodeRuneInString(s.src[s.cursor+n:])
+	return r
 }
 
 // peek returns the rune at the cursor position.
@@ -172,10 +175,11 @@ func (s *Lexer) nextBy(n int) rune {
 		return 0
 	}
 	s.cursor += n
-	if s.cursor == len(s.src) {
+	if s.cursor >= len(s.src) {
 		return 0
 	}
-	return rune(s.src[s.cursor])
+	r, _ := utf8.DecodeRuneInString(s.src[s.cursor:])
+	return r
 }
 
 // next advances the cursor by 1 position and returns the rune at the cursor position.
@@ -288,12 +292,12 @@ func (s *Lexer) scanString() Token {
 	}
 }
 
-func (s *Lexer) scanIdentifier() Token {
+func (s *Lexer) scanIdentifier(ch rune) Token {
 	// NOTE: this func does not distinguish between SQL keywords and identifiers
 	s.start = s.cursor
-	ch := s.next()
+	ch = s.nextBy(utf8.RuneLen(ch))
 	for isLetter(ch) || isDigit(ch) || ch == '.' || ch == '?' || ch == '$' {
-		ch = s.next()
+		ch = s.nextBy(utf8.RuneLen(ch))
 	}
 	// return the token as uppercase so that we can do case insensitive matching
 	return Token{IDENT, s.src[s.start:s.cursor]}
diff --git a/sqllexer_test.go b/sqllexer_test.go
index 886aa1d..1982179 100644
--- a/sqllexer_test.go
+++ b/sqllexer_test.go
@@ -547,6 +547,65 @@ func TestLexer(t *testing.T) {
 	}
 }
 
+// TestLexerUnicode checks that each input containing multi-byte UTF-8 characters
+// is scanned into the expected tokens; subtests are named after their input.
+func TestLexerUnicode(t *testing.T) {
+	tests := []struct {
+		input     string
+		expected  []Token
+		lexerOpts []lexerOption
+	}{
+		{
+			input: `Descripció_CAT`,
+			expected: []Token{
+				{IDENT, `Descripció_CAT`},
+			},
+		},
+		{
+			input: `世界`,
+			expected: []Token{
+				{IDENT, `世界`},
+			},
+		},
+		{
+			input: `こんにちは`,
+			expected: []Token{
+				{IDENT, `こんにちは`},
+			},
+		},
+		{
+			input: `안녕하세요`,
+			expected: []Token{
+				{IDENT, `안녕하세요`},
+			},
+		},
+		{
+			input: `über`,
+			expected: []Token{
+				{IDENT, `über`},
+			},
+		},
+		{
+			input: `résumé`,
+			expected: []Token{
+				{IDENT, `résumé`},
+			},
+		},
+		{
+			input: `"über"`,
+			expected: []Token{
+				{IDENT, `"über"`},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.input, func(t *testing.T) {
+			assert.Equal(t, tt.expected, New(tt.input, tt.lexerOpts...).ScanAll())
+		})
+	}
+}
+
 func ExampleLexer() {
 	query := "SELECT * FROM users WHERE id = 1"
 	lexer := New(query)