From 4859a296324e8324a965330521a1c7f463ca972f Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Tue, 24 Oct 2023 10:19:51 -0400
Subject: [PATCH 1/6] handle ZWJ and emoji sequences

---
 src/tokenize.jl  | 14 ++++++++++++--
 test/tokenize.jl |  4 ++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 739a24c6..61cc7de5 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1287,10 +1287,20 @@ const MAX_KW_LENGTH = 10
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1
+    graphemestate = Ref(zero(Int32))
+    graphemestate_peek = Ref(zero(Int32))
     while true
         pc, ppc = dpeekchar(l)
-        if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
-            break
+        if Unicode.isgraphemebreak!(graphemestate, c, pc)
+            if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
+                break
+            end
+        elseif pc == '\u200d' # ZWJ control character
+            # ZWJ only allowed within emoji sequences, not at end
+            graphemestate_peek[] = graphemestate[]
+            if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
+                break
+            end
         end
         c = readchar(l)
         h = simple_hash(c, h)
diff --git a/test/tokenize.jl b/test/tokenize.jl
index 07972c98..ffd57ac7 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,12 +44,12 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    str = "𝘋 =2β"
+    str = "𝘋 =2🏳️‍🌈"
     for s in [str, IOBuffer(str)]
         l = tokenize(s)
         kinds = [K"Identifier", K"Whitespace", K"=", K"Integer",
                  K"Identifier", K"EndMarker"]
-        token_strs = ["𝘋", " ", "=", "2", "β", ""]
+        token_strs = ["𝘋", " ", "=", "2", "🏳️‍🌈", ""]
         for (i, n) in enumerate(l)
            @test kind(n) == kinds[i]
            @test untokenize(n, str) == token_strs[i]

From bfcac857e8afbf4208648e8bf7558a813a3baec2 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Thu, 26 Oct 2023 08:49:28 -0400
Subject: [PATCH 2/6] forbid ZWNJ at end

---
 src/tokenize.jl     | 4 ++--
 test/diagnostics.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 61cc7de5..99d9b90c 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1295,8 +1295,8 @@ function lex_identifier(l::Lexer, c)
             if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
                 break
             end
-        elseif pc == '\u200d' # ZWJ control character
-            # ZWJ only allowed within emoji sequences, not at end
+        elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
+            # ZWJ/ZWNJ only within grapheme sequences, not at end
             graphemestate_peek[] = graphemestate[]
             if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
                 break
diff --git a/test/diagnostics.jl b/test/diagnostics.jl
index ea2feb37..d7fd0b30 100644
--- a/test/diagnostics.jl
+++ b/test/diagnostics.jl
@@ -7,7 +7,7 @@ function diagnostic(str; only_first=false, allow_multiple=false, rule=:all, vers
         if !only_first
             @test length(stream.diagnostics) == 1
         end
-        return stream.diagnostics[1]
+        return isempty(stream.diagnostics) ? nothing : stream.diagnostics[1]
     end
 end
 

From d22e2c7a94418bb3cdc597eac85d14835b9225e5 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Thu, 26 Oct 2023 09:15:17 -0400
Subject: [PATCH 3/6] fix tests on Julia < 1.5

---
 test/tokenize.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/tokenize.jl b/test/tokenize.jl
index ffd57ac7..b8168c03 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,12 +44,13 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    str = "𝘋 =2🏳️‍🌈"
+    emoji = VERSION < v"1.5" ? "😄" : "🏳️‍🌈" # requires newer Unicode
+    str = "𝘋 =2"*emoji
     for s in [str, IOBuffer(str)]
         l = tokenize(s)
         kinds = [K"Identifier", K"Whitespace", K"=", K"Integer",
                  K"Identifier", K"EndMarker"]
-        token_strs = ["𝘋", " ", "=", "2", "🏳️‍🌈", ""]
+        token_strs = ["𝘋", " ", "=", "2", emoji, ""]
         for (i, n) in enumerate(l)
            @test kind(n) == kinds[i]
            @test untokenize(n, str) == token_strs[i]

From 584d363cf2b2959765555b5fbfbad822bedcd36c Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Fri, 27 Oct 2023 16:48:38 -0400
Subject: [PATCH 4/6] ascii fast path

---
 src/tokenize.jl | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 99d9b90c..4ebc8060 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1284,14 +1284,22 @@ function lex_backtick(l::Lexer)
 end
 
 const MAX_KW_LENGTH = 10
+const ascii_is_identifier_char::Vector{Bool} = map(is_identifier_char ∘ Char, 0x00:0x7f)
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1
-    graphemestate = Ref(zero(Int32))
+    ascii = isascii(c)
+    graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
     graphemestate_peek = Ref(zero(Int32))
     while true
         pc, ppc = dpeekchar(l)
-        if Unicode.isgraphemebreak!(graphemestate, c, pc)
+        ascii = ascii && isascii(pc)
+        if ascii # fast path
+            pc_byte = pc % UInt8
+            @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
+                break
+            end
+        elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
             if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
                 break
             end

From 8f6dc09c86f3116c75570dcdf145d28f10728a76 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Fri, 27 Oct 2023 17:01:00 -0400
Subject: [PATCH 5/6] fix for earlier Julia versions

---
 src/tokenize.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 4ebc8060..9c19c040 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1284,7 +1284,7 @@ function lex_backtick(l::Lexer)
 end
 
 const MAX_KW_LENGTH = 10
-const ascii_is_identifier_char::Vector{Bool} = map(is_identifier_char ∘ Char, 0x00:0x7f)
+const ascii_is_identifier_char = Bool[is_identifier_char(Char(b)) for b=0x00:0x7f]
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1

From 75a1fc20beb603e46e47a378141bde5f148eb9b0 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Tue, 31 Oct 2023 14:11:08 -0400
Subject: [PATCH 6/6] Update test/tokenize.jl

---
 test/tokenize.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/tokenize.jl b/test/tokenize.jl
index b8168c03..26ab044a 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,7 +44,8 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    emoji = VERSION < v"1.5" ? "😄" : "🏳️‍🌈" # requires newer Unicode
+    # FIXME: rm VERSION check once we implement our own is_identifier_char
+    emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
     str = "𝘋 =2"*emoji
     for s in [str, IOBuffer(str)]
         l = tokenize(s)