Skip to content

Commit

Permalink
scan unicode identifier
Browse files Browse the repository at this point in the history
  • Loading branch information
lu-zhengda committed Oct 19, 2023
1 parent 7a55cf7 commit 4de1898
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 8 deletions.
20 changes: 12 additions & 8 deletions sqllexer.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package sqllexer

import "unicode/utf8"

type TokenType int

const (
Expand Down Expand Up @@ -98,7 +100,7 @@ func (s *Lexer) Scan() Token {
case isWhitespace(ch):
return s.scanWhitespace()
case isLetter(ch):
return s.scanIdentifier()
return s.scanIdentifier(ch)
case isDoubleQuote(ch):
return s.scanDoubleQuotedIdentifier('"')
case isSingleQuote(ch):
Expand All @@ -125,7 +127,7 @@ func (s *Lexer) Scan() Token {
return s.scanPositionalParameter()
}
if s.config.DBMS == DBMSSQLServer && isLetter(s.lookAhead(1)) {
return s.scanIdentifier()
return s.scanIdentifier(ch)
}
return s.scanDollarQuotedString()
case ch == ':':
Expand Down Expand Up @@ -157,7 +159,8 @@ func (s *Lexer) lookAhead(n int) rune {
if s.cursor+n >= len(s.src) || s.cursor+n < 0 {
return 0
}
return rune(s.src[s.cursor+n])
r, _ := utf8.DecodeRuneInString(s.src[s.cursor+n:])
return r
}

// peek returns the rune at the cursor position.
Expand All @@ -172,10 +175,11 @@ func (s *Lexer) nextBy(n int) rune {
return 0
}
s.cursor += n
if s.cursor == len(s.src) {
if s.cursor >= len(s.src) {
return 0
}
return rune(s.src[s.cursor])
r, _ := utf8.DecodeRuneInString(s.src[s.cursor:])
return r
}

// next advances the cursor by 1 position and returns the rune at the cursor position.
Expand Down Expand Up @@ -288,12 +292,12 @@ func (s *Lexer) scanString() Token {
}
}

func (s *Lexer) scanIdentifier() Token {
func (s *Lexer) scanIdentifier(ch rune) Token {
// NOTE: this func does not distinguish between SQL keywords and identifiers
s.start = s.cursor
ch := s.next()
ch = s.nextBy(utf8.RuneLen(ch))
for isLetter(ch) || isDigit(ch) || ch == '.' || ch == '?' || ch == '$' {
ch = s.next()
ch = s.nextBy(utf8.RuneLen(ch))
}
// return the identifier exactly as written in the source; its casing is not changed here
return Token{IDENT, s.src[s.start:s.cursor]}
Expand Down
59 changes: 59 additions & 0 deletions sqllexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,65 @@ func TestLexer(t *testing.T) {
}
}

// TestLexerUnicode checks that identifiers containing non-ASCII letters are
// scanned as a single IDENT token whose value is the exact input text.
//
// Each case builds a lexer with New, collects every token with ScanAll, and
// compares the result against the expected token slice. Subtests are named
// after their input so a failure identifies the offending identifier and can
// be re-run on its own with -run.
func TestLexerUnicode(t *testing.T) {
	tests := []struct {
		input     string
		expected  []Token
		lexerOpts []lexerOption // left unset by every case below, so New receives no options
	}{
		{
			// Accented Latin letter mixed with ASCII letters and an underscore.
			input: `Descripció_CAT`,
			expected: []Token{
				{IDENT, `Descripció_CAT`},
			},
		},
		{
			// CJK ideographs (three bytes per rune in UTF-8).
			input: `世界`,
			expected: []Token{
				{IDENT, `世界`},
			},
		},
		{
			// Hiragana.
			input: `こんにちは`,
			expected: []Token{
				{IDENT, `こんにちは`},
			},
		},
		{
			// Hangul.
			input: `안녕하세요`,
			expected: []Token{
				{IDENT, `안녕하세요`},
			},
		},
		{
			// Identifier that starts with a multi-byte rune.
			input: `über`,
			expected: []Token{
				{IDENT, `über`},
			},
		},
		{
			// Multi-byte runes in the middle and at the end.
			input: `résumé`,
			expected: []Token{
				{IDENT, `résumé`},
			},
		},
		{
			// Double-quoted identifier: the expected token value keeps the quotes.
			input: `"über"`,
			expected: []Token{
				{IDENT, `"über"`},
			},
		},
	}

	for _, tt := range tests {
		// Name the subtest after its input; an empty name would be reported
		// only as an anonymous "#NN" index.
		t.Run(tt.input, func(t *testing.T) {
			lexer := New(tt.input, tt.lexerOpts...)
			tokens := lexer.ScanAll()
			assert.Equal(t, tt.expected, tokens)
		})
	}
}

func ExampleLexer() {
query := "SELECT * FROM users WHERE id = 1"
lexer := New(query)
Expand Down

0 comments on commit 4de1898

Please sign in to comment.