From 4859a296324e8324a965330521a1c7f463ca972f Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Tue, 24 Oct 2023 10:19:51 -0400
Subject: [PATCH 1/6] handle ZWJ and emoji sequences

---
 src/tokenize.jl  | 14 ++++++++++++--
 test/tokenize.jl |  4 ++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 739a24c6..61cc7de5 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1287,10 +1287,20 @@ const MAX_KW_LENGTH = 10
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1
+    graphemestate = Ref(zero(Int32))
+    graphemestate_peek = Ref(zero(Int32))
     while true
         pc, ppc = dpeekchar(l)
-        if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
-            break
+        if Unicode.isgraphemebreak!(graphemestate, c, pc)
+            if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
+                break
+            end
+        elseif pc == '\u200d' # ZWJ control character
+            # ZWJ only allowed within emoji sequences, not at end
+            graphemestate_peek[] = graphemestate[]
+            if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
+                break
+            end
         end
         c = readchar(l)
         h = simple_hash(c, h)
diff --git a/test/tokenize.jl b/test/tokenize.jl
index 07972c98..ffd57ac7 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,12 +44,12 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    str = "𝘋 =2β"
+    str = "𝘋 =2🏳️‍🌈"
     for s in [str, IOBuffer(str)]
         l = tokenize(s)
         kinds = [K"Identifier", K"Whitespace", K"=", K"Integer",
                  K"Identifier", K"EndMarker"]
-        token_strs = ["𝘋", " ", "=", "2", "β", ""]
+        token_strs = ["𝘋", " ", "=", "2", "🏳️‍🌈", ""]
         for (i, n) in enumerate(l)
            @test kind(n) == kinds[i]
            @test untokenize(n, str) == token_strs[i]

From bfcac857e8afbf4208648e8bf7558a813a3baec2 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Thu, 26 Oct 2023 08:49:28 -0400
Subject: [PATCH 2/6] forbid ZWNJ at end

---
 src/tokenize.jl     | 4 ++--
 test/diagnostics.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 61cc7de5..99d9b90c 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1295,8 +1295,8 @@ function lex_identifier(l::Lexer, c)
             if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
                 break
             end
-        elseif pc == '\u200d' # ZWJ control character
-            # ZWJ only allowed within emoji sequences, not at end
+        elseif pc in ('\u200c','\u200d') # ZWNJ/ZWJ control characters
+            # ZWJ/ZWNJ only within grapheme sequences, not at end
             graphemestate_peek[] = graphemestate[]
             if Unicode.isgraphemebreak!(graphemestate_peek, pc, ppc)
                 break
diff --git a/test/diagnostics.jl b/test/diagnostics.jl
index ea2feb37..d7fd0b30 100644
--- a/test/diagnostics.jl
+++ b/test/diagnostics.jl
@@ -7,7 +7,7 @@ function diagnostic(str; only_first=false, allow_multiple=false, rule=:all, vers
         if !only_first
             @test length(stream.diagnostics) == 1
         end
-        return stream.diagnostics[1]
+        return isempty(stream.diagnostics) ? nothing : stream.diagnostics[1]
     end
 end
 

From d22e2c7a94418bb3cdc597eac85d14835b9225e5 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Thu, 26 Oct 2023 09:15:17 -0400
Subject: [PATCH 3/6] fix tests on Julia < 1.5

---
 test/tokenize.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/tokenize.jl b/test/tokenize.jl
index ffd57ac7..b8168c03 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,12 +44,13 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    str = "𝘋 =2🏳️‍🌈"
+    emoji = VERSION < v"1.5" ? "😄" : "🏳️‍🌈" # requires newer Unicode
+    str = "𝘋 =2"*emoji
     for s in [str, IOBuffer(str)]
         l = tokenize(s)
         kinds = [K"Identifier", K"Whitespace", K"=", K"Integer",
                  K"Identifier", K"EndMarker"]
-        token_strs = ["𝘋", " ", "=", "2", "🏳️‍🌈", ""]
+        token_strs = ["𝘋", " ", "=", "2", emoji, ""]
         for (i, n) in enumerate(l)
            @test kind(n) == kinds[i]
            @test untokenize(n, str) == token_strs[i]

From 584d363cf2b2959765555b5fbfbad822bedcd36c Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Fri, 27 Oct 2023 16:48:38 -0400
Subject: [PATCH 4/6] ascii fast path

---
 src/tokenize.jl | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 99d9b90c..4ebc8060 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1284,14 +1284,22 @@ function lex_backtick(l::Lexer)
 end
 
 const MAX_KW_LENGTH = 10
+const ascii_is_identifier_char::Vector{Bool} = map(is_identifier_char ∘ Char, 0x00:0x7f)
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1
-    graphemestate = Ref(zero(Int32))
+    ascii = isascii(c)
+    graphemestate = Ref(Int32(ascii)) # all ASCII id chars are UTF8PROC_BOUNDCLASS_OTHER
     graphemestate_peek = Ref(zero(Int32))
     while true
         pc, ppc = dpeekchar(l)
-        if Unicode.isgraphemebreak!(graphemestate, c, pc)
+        ascii = ascii && isascii(pc)
+        if ascii # fast path
+            pc_byte = pc % UInt8
+            @inbounds if (pc_byte == UInt8('!') && ppc == '=') || !ascii_is_identifier_char[pc_byte+1]
+                break
+            end
+        elseif Unicode.isgraphemebreak!(graphemestate, c, pc)
             if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
                 break
             end

From 8f6dc09c86f3116c75570dcdf145d28f10728a76 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Fri, 27 Oct 2023 17:01:00 -0400
Subject: [PATCH 5/6] fix for earlier Julia versions

---
 src/tokenize.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index 4ebc8060..9c19c040 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -1284,7 +1284,7 @@ function lex_backtick(l::Lexer)
 end
 
 const MAX_KW_LENGTH = 10
-const ascii_is_identifier_char::Vector{Bool} = map(is_identifier_char ∘ Char, 0x00:0x7f)
+const ascii_is_identifier_char = Bool[is_identifier_char(Char(b)) for b=0x00:0x7f]
 function lex_identifier(l::Lexer, c)
     h = simple_hash(c, UInt64(0))
     n = 1

From 75a1fc20beb603e46e47a378141bde5f148eb9b0 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Tue, 31 Oct 2023 14:11:08 -0400
Subject: [PATCH 6/6] Update test/tokenize.jl

---
 test/tokenize.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/tokenize.jl b/test/tokenize.jl
index b8168c03..26ab044a 100644
--- a/test/tokenize.jl
+++ b/test/tokenize.jl
@@ -44,7 +44,8 @@ end
 end # testset
 
 @testset "tokenize unicode" begin
-    emoji = VERSION < v"1.5" ? "😄" : "🏳️‍🌈" # requires newer Unicode
+    # FIXME: rm VERSION check once we implement our own is_identifier_char
+    emoji = VERSION < v"1.5" ? "😄" : "\U1F3F3\UFE0F\U200D\U1F308" # 🏳️‍🌈 requires newer Unicode
     str = "𝘋 =2"*emoji
     for s in [str, IOBuffer(str)]
         l = tokenize(s)