From 4de1898f96e1ce3839e360bb332134896534da6d Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Thu, 19 Oct 2023 17:58:07 -0400 Subject: [PATCH] scan unicode identifier --- sqllexer.go | 20 +++++++++------- sqllexer_test.go | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/sqllexer.go b/sqllexer.go index 2374f80..622a8d4 100644 --- a/sqllexer.go +++ b/sqllexer.go @@ -1,5 +1,7 @@ package sqllexer +import "unicode/utf8" + type TokenType int const ( @@ -98,7 +100,7 @@ func (s *Lexer) Scan() Token { case isWhitespace(ch): return s.scanWhitespace() case isLetter(ch): - return s.scanIdentifier() + return s.scanIdentifier(ch) case isDoubleQuote(ch): return s.scanDoubleQuotedIdentifier('"') case isSingleQuote(ch): @@ -125,7 +127,7 @@ func (s *Lexer) Scan() Token { return s.scanPositionalParameter() } if s.config.DBMS == DBMSSQLServer && isLetter(s.lookAhead(1)) { - return s.scanIdentifier() + return s.scanIdentifier(ch) } return s.scanDollarQuotedString() case ch == ':': @@ -157,7 +159,8 @@ func (s *Lexer) lookAhead(n int) rune { if s.cursor+n >= len(s.src) || s.cursor+n < 0 { return 0 } - return rune(s.src[s.cursor+n]) + r, _ := utf8.DecodeRuneInString(s.src[s.cursor+n:]) + return r } // peek returns the rune at the cursor position. @@ -172,10 +175,11 @@ func (s *Lexer) nextBy(n int) rune { return 0 } s.cursor += n - if s.cursor == len(s.src) { + if s.cursor >= len(s.src) { return 0 } - return rune(s.src[s.cursor]) + r, _ := utf8.DecodeRuneInString(s.src[s.cursor:]) + return r } // next advances the cursor by 1 position and returns the rune at the cursor position. @@ -288,12 +292,12 @@ func (s *Lexer) scanString() Token { } } -func (s *Lexer) scanIdentifier() Token { +func (s *Lexer) scanIdentifier(ch rune) Token { // NOTE: this func does not distinguish between SQL keywords and identifiers s.start = s.cursor - ch := s.next() + ch = s.nextBy(utf8.RuneLen(ch)) for isLetter(ch) || isDigit(ch) || ch == '.' || ch == '?' || ch == '$' { - ch = s.next() + ch = s.nextBy(utf8.RuneLen(ch)) } // return the token as uppercase so that we can do case insensitive matching return Token{IDENT, s.src[s.start:s.cursor]} diff --git a/sqllexer_test.go b/sqllexer_test.go index 886aa1d..1982179 100644 --- a/sqllexer_test.go +++ b/sqllexer_test.go @@ -547,6 +547,65 @@ func TestLexer(t *testing.T) { } } +func TestLexerUnicode(t *testing.T) { + tests := []struct { + input string + expected []Token + lexerOpts []lexerOption + }{ + { + input: `Descripció_CAT`, + expected: []Token{ + {IDENT, `Descripció_CAT`}, + }, + }, + { + input: `世界`, + expected: []Token{ + {IDENT, `世界`}, + }, + }, + { + input: `こんにちは`, + expected: []Token{ + {IDENT, `こんにちは`}, + }, + }, + { + input: `안녕하세요`, + expected: []Token{ + {IDENT, `안녕하세요`}, + }, + }, + { + input: `über`, + expected: []Token{ + {IDENT, `über`}, + }, + }, + { + input: `résumé`, + expected: []Token{ + {IDENT, `résumé`}, + }, + }, + { + input: `"über"`, + expected: []Token{ + {IDENT, `"über"`}, + }, + }, + } + + for _, tt := range tests { + t.Run("", func(t *testing.T) { + lexer := New(tt.input, tt.lexerOpts...) + tokens := lexer.ScanAll() + assert.Equal(t, tt.expected, tokens) + }) + } +} + func ExampleLexer() { query := "SELECT * FROM users WHERE id = 1" lexer := New(query)