biomejs · Conaclos · Jan 18, 2024 · Jan 18, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -43,6 +43,20 @@ Read our [guidelines for writing a good changelog entry](https://github.com/biom
 
 - Fix [#1575](https://github.com/biomejs/biome/issues/1575). [noArrayIndexKey](https://biomejs.dev/linter/rules/no-array-index-key/) now captures array index value inside template literals and with string concatination. Contributed by @vasucp1207
 
+- Linter rules that inspect regexes now handle multibyte characters correctly ([#1522](https://github.com/biomejs/biome/issues/1522)).
+
+  Previously, [noMisleadingCharacterClass](https://biomejs.dev/linter/no-misleading-character-class), [noMultipleSpacesInRegularExpressionLiterals](https://biomejs.dev/linter/no-multiple-spaces-in-regular-expression-literals), and [noEmptyCharacterClassInRegex](https://biomejs.dev/linter/no-empty-character-class-in-regex) made Biome errors on multi-bytes characters.
+  Multibyte characters are now handled correctly.
+
+  The following code no longer raises an internal error:
+
+  ```js
+  // Cyrillic characters
+  /[\u200E\u2066-\u2069]/gu;
+  ```
+
+  Contributed by @Conaclos
+
 ### Parser
 
 

diff --git a/..._js_analyze/src/analyzers/complexity/no_multiple_spaces_in_regular_expression_literals.rs b/..._js_analyze/src/analyzers/complexity/no_multiple_spaces_in_regular_expression_literals.rs
@@ -67,7 +67,8 @@ impl Rule for NoMultipleSpacesInRegularExpressionLiterals {
         let mut range_list = vec![];
         let mut previous_is_space = false;
         let mut first_consecutive_space_index = 0;
-        for (i, ch) in trimmed_text.chars().enumerate() {
+        // We use `char_indices` to get the byte index of every character
+        for (i, ch) in trimmed_text.char_indices() {
             if ch == ' ' {
                 if !previous_is_space {
                     previous_is_space = true;

diff --git a/crates/biome_js_analyze/src/analyzers/correctness/no_empty_character_class_in_regex.rs b/crates/biome_js_analyze/src/analyzers/correctness/no_empty_character_class_in_regex.rs
@@ -62,7 +62,8 @@ impl Rule for NoEmptyCharacterClassInRegex {
         let trimmed_text = pattern.text();
         let mut class_start_index = None;
         let mut is_negated_class = false;
-        let mut enumerated_char_iter = trimmed_text.chars().enumerate();
+        // We use `char_indices` to get the byte index of every character
+        let mut enumerated_char_iter = trimmed_text.char_indices();
         while let Some((i, ch)) = enumerated_char_iter.next() {
             match ch {
                 '\\' => {

diff --git a/crates/biome_js_analyze/src/semantic_analyzers/nursery/no_misleading_character_class.rs b/crates/biome_js_analyze/src/semantic_analyzers/nursery/no_misleading_character_class.rs
@@ -293,10 +293,11 @@ fn diagnostic_regex_pattern(
     has_u_flag: bool,
     range: TextRange,
 ) -> Option<RuleState> {
+    let regex_bytes_len = regex_pattern.as_bytes().len();
     let mut is_in_character_class = false;
     let mut escape_next = false;
-    let char_iter = regex_pattern.chars().peekable();
-    for (i, ch) in char_iter.enumerate() {
+    // We use `char_indices` to get the byte index of every character
+    for (i, ch) in regex_pattern.char_indices() {
         if escape_next {
             escape_next = false;
             continue;
@@ -305,7 +306,7 @@ fn diagnostic_regex_pattern(
             '\\' => escape_next = true,
             '[' => is_in_character_class = true,
             ']' => is_in_character_class = false,
-            _ if is_in_character_class && i < regex_pattern.len() => {
+            _ if is_in_character_class && i < regex_bytes_len => {
                 if !has_u_flag && has_surrogate_pair(&regex_pattern[i..]) {
                     return Some(RuleState {
                         range,

diff --git a/..._analyze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/invalid.jsonc b/..._analyze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/invalid.jsonc
@@ -23,5 +23,6 @@
 	"/foo  {,2}/;",
 	"/foo  {1 2}/;",
 	"/foo  {1/;",
-	"/foo  {1,2/;"
+	"/foo  {1,2/;",
+	"/\u200E\u2066\u2069   /gu;"
 ]
diff --git a/...yze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/invalid.jsonc.snap b/...yze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/invalid.jsonc.snap
@@ -578,4 +578,28 @@ invalid.jsonc:1:5 lint/complexity/noMultipleSpacesInRegularExpressionLiterals  F
 
 ```
 
+# Input
+```cjs
+/‎⁦⁩   /gu;
+```
+
+# Diagnostics
+```
+invalid.jsonc:1:5 lint/complexity/noMultipleSpacesInRegularExpressionLiterals  FIXABLE  ━━━━━━━━━━━━
+
+  ! This regular expression contains unclear uses of consecutive spaces.
+
+  > 1 │ /���   /gu;
+      │  ^^^
+
+  i It's hard to visually count the amount of spaces.
+
+  i Safe fix: Use a quantifier instead.
+
+  - /���···/gu;
+  + /���·{3}/gu;
+
+
+```
+
 
diff --git a/...js_analyze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/valid.jsonc b/...js_analyze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/valid.jsonc
@@ -2,5 +2,6 @@
     "/foo {2}bar/;",
     "/foo bar baz/;",
     "/foo bar\tbaz/;",
-    "/foo /;"
+    "/foo /;",
+	"/\u200E\u2066\u2069/gu;"
 ]
diff --git a/...alyze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/valid.jsonc.snap b/...alyze/tests/specs/complexity/noMultipleSpacesInRegularExpressionLiterals/valid.jsonc.snap
@@ -22,4 +22,9 @@ expression: valid.jsonc
 /foo /;
 ```
 
+# Input
+```cjs
+/‎⁦⁩/gu;
+```
+
 
diff --git a/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/invalid.js b/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/invalid.js
@@ -17,4 +17,5 @@ if (/^abc[]/.test(foo)) {}
 /[[]&&b]/v;
 // negated empty class
 /[^]/;
-/\[][^]/;
+/\[][^]/;
+/[\u200E\u2066-\u2069][]/gu;
diff --git a/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/invalid.js.snap b/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/invalid.js.snap
@@ -24,6 +24,7 @@ if (/^abc[]/.test(foo)) {}
 // negated empty class
 /[^]/;
 /\[][^]/;
+/[\u200E\u2066-\u2069][]/gu;
 ```
 
 # Diagnostics
@@ -357,6 +358,7 @@ invalid.js:19:2 lint/correctness/noEmptyCharacterClassInRegex ━━━━━━
   > 19 │ /[^]/;
        │  ^^^
     20 │ /\[][^]/;
+    21 │ /[\u200E\u2066-\u2069][]/gu;
 
   i Negated empty character classes match anything.
     If you want to match against [, escape it \[.
@@ -374,6 +376,7 @@ invalid.js:20:5 lint/correctness/noEmptyCharacterClassInRegex ━━━━━━
     19 │ /[^]/;
   > 20 │ /\[][^]/;
        │     ^^^
+    21 │ /[\u200E\u2066-\u2069][]/gu;
 
   i Negated empty character classes match anything.
     If you want to match against [, escape it \[.
@@ -382,4 +385,21 @@ invalid.js:20:5 lint/correctness/noEmptyCharacterClassInRegex ━━━━━━
 
 ```
 
+```
+invalid.js:21:23 lint/correctness/noEmptyCharacterClassInRegex ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+  ! The regular expression includes this empty character class.
+
+    19 │ /[^]/;
+    20 │ /\[][^]/;
+  > 21 │ /[\u200E\u2066-\u2069][]/gu;
+       │                       ^^
+
+  i Empty character classes don't match anything.
+    If you want to match against [, escape it \[.
+    Otherwise, remove the character class or fill it.
+
+
+```
+
 
diff --git a/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/valid.js b/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/valid.js
@@ -19,4 +19,5 @@
 /[a&&b]/v;
 /[[a][b]]/v;
 /[\q{}]/v;
-/[[^]--\p{ASCII}]/v;
+/[[^]--\p{ASCII}]/v;
+/[\u200E\u2066-\u2069]/gu;
diff --git a/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/valid.js.snap b/crates/biome_js_analyze/tests/specs/correctness/noEmptyCharacterClassInRegex/valid.js.snap
@@ -26,6 +26,7 @@ expression: valid.js
 /[[a][b]]/v;
 /[\q{}]/v;
 /[[^]--\p{ASCII}]/v;
+/[\u200E\u2066-\u2069]/gu;
 ```
 
 # Diagnostics
@@ -57,6 +58,7 @@ valid.js:22:3 lint/correctness/noEmptyCharacterClassInRegex ━━━━━━
     21 │ /[\q{}]/v;
   > 22 │ /[[^]--\p{ASCII}]/v;
        │   ^^^
+    23 │ /[\u200E\u2066-\u2069]/gu;
 
   i Negated empty character classes match anything.
     If you want to match against [, escape it \[.

diff --git a/crates/biome_js_analyze/tests/specs/nursery/noMisleadingCharacterClass/valid.js b/crates/biome_js_analyze/tests/specs/nursery/noMisleadingCharacterClass/valid.js
@@ -52,3 +52,6 @@ var r = new window.RegExp(/[👍]/u);
 var r = new global.RegExp(/[👍]/u);
 var r = new globalThis.RegExp(/[👍]/u);
 var r = new globalThis.globalThis.globalThis.RegExp(/[👍]/u);
+
+// Issue: https://github.com/biomejs/biome/issues/1522
+var cyrillicChars = /[\u200E\u2066-\u2069]/gu;
diff --git a/crates/biome_js_analyze/tests/specs/nursery/noMisleadingCharacterClass/valid.js.snap b/crates/biome_js_analyze/tests/specs/nursery/noMisleadingCharacterClass/valid.js.snap
@@ -59,6 +59,9 @@ var r = new global.RegExp(/[👍]/u);
 var r = new globalThis.RegExp(/[👍]/u);
 var r = new globalThis.globalThis.globalThis.RegExp(/[👍]/u);
 
+// Issue: https://github.com/biomejs/biome/issues/1522
+var cyrillicChars = /[\u200E\u2066-\u2069]/gu;
+
 ```
 
 
diff --git a/crates/biome_js_analyze/tests/specs/suspicious/noControlCharactersInRegex/valid.js b/crates/biome_js_analyze/tests/specs/suspicious/noControlCharactersInRegex/valid.js
@@ -18,3 +18,4 @@ var regex = /\u{1F}/g;
 var regex = /\t/;
 var regex = /\n/;
 new (function foo() {})("\\x1f");
+/[\u200E\u2066-\u2069]/gu;
diff --git a/crates/biome_js_analyze/tests/specs/suspicious/noControlCharactersInRegex/valid.js.snap b/crates/biome_js_analyze/tests/specs/suspicious/noControlCharactersInRegex/valid.js.snap
@@ -24,7 +24,7 @@ var regex = /\u{1F}/g;
 var regex = /\t/;
 var regex = /\n/;
 new (function foo() {})("\\x1f");
-
+/[\u200E\u2066-\u2069]/gu;
 ```
 
 
diff --git a/website/src/content/docs/internals/changelog.mdx b/website/src/content/docs/internals/changelog.mdx
@@ -49,6 +49,20 @@ Read our [guidelines for writing a good changelog entry](https://github.com/biom
 
 - Fix [#1575](https://github.com/biomejs/biome/issues/1575). [noArrayIndexKey](https://biomejs.dev/linter/rules/no-array-index-key/) now captures array index value inside template literals and with string concatination. Contributed by @vasucp1207
 
+- Linter rules that inspect regexes now handle multibyte characters correctly ([#1522](https://github.com/biomejs/biome/issues/1522)).
+
+  Previously, [noMisleadingCharacterClass](https://biomejs.dev/linter/no-misleading-character-class), [noMultipleSpacesInRegularExpressionLiterals](https://biomejs.dev/linter/no-multiple-spaces-in-regular-expression-literals), and [noEmptyCharacterClassInRegex](https://biomejs.dev/linter/no-empty-character-class-in-regex) made Biome errors on multi-bytes characters.
+  Multibyte characters are now handled correctly.
+
+  The following code no longer raises an internal error:
+
+  ```js
+  // Cyrillic characters
+  /[\u200E\u2066-\u2069]/gu;
+  ```
+
+  Contributed by @Conaclos
+
 ### Parser
-Original file line number
+Diff line change
@@ Expand Up / @@ -22,4 +22,9 @@ expression: valid.jsonc @@
     /foo /;
     ```
+    # Input
+    ```cjs
+    /‎⁦⁩/gu;
+    ```