From f9868cf6c650f5c3a3f468b2ab62656b76eac7d7 Mon Sep 17 00:00:00 2001 From: Victorien Elvinger Date: Mon, 18 Nov 2024 16:26:12 +0100 Subject: [PATCH] fix(js_parser): unicode escape seq with multi-byte chars (#4570) --- CHANGELOG.md | 1 + crates/biome_js_parser/src/lexer/errors.rs | 6 - crates/biome_js_parser/src/lexer/mod.rs | 29 +++-- .../error/unicode_escape_unterminated.js | 1 + .../error/unicode_escape_unterminated.js.snap | 57 +++++++++ .../error/unicode_escape_with_unicode_char.js | 1 + .../unicode_escape_with_unicode_char.js.snap | 118 ++++++++++++++++++ .../ok/unicode_escape_in_tagged_template.js | 2 + .../unicode_escape_in_tagged_template.js.snap | 97 ++++++++++++++ 9 files changed, 294 insertions(+), 18 deletions(-) delete mode 100644 crates/biome_js_parser/src/lexer/errors.rs create mode 100644 crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js create mode 100644 crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js.snap create mode 100644 crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js create mode 100644 crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js.snap create mode 100644 crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js create mode 100644 crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js.snap diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fa203aed334..82945704be05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -307,6 +307,7 @@ our [guidelines for writing a good changelog entry](https://github.com/biomejs/b Contributed by @denbezrukov +- Don't panic when a multi-byte character is found in a unicode escape sequence ([#4564](https://github.com/biomejs/biome/issues/4564)). Contributed by @Conaclos ## v1.9.4 (2024-10-17) diff --git a/crates/biome_js_parser/src/lexer/errors.rs b/crates/biome_js_parser/src/lexer/errors.rs deleted file mode 100644 index a2f86818c35e..000000000000 --- a/crates/biome_js_parser/src/lexer/errors.rs +++ /dev/null @@ -1,6 +0,0 @@ -use crate::prelude::*; - -pub fn invalid_digits_after_unicode_escape_sequence(start: usize, end: usize) -> ParseDiagnostic { - ParseDiagnostic::new("invalid digits after unicode escape sequence", start..end) - .with_hint("expected valid unicode escape sequence") -} diff --git a/crates/biome_js_parser/src/lexer/mod.rs b/crates/biome_js_parser/src/lexer/mod.rs index 07b843f0c942..fa8e7ee115f1 100644 --- a/crates/biome_js_parser/src/lexer/mod.rs +++ b/crates/biome_js_parser/src/lexer/mod.rs @@ -15,8 +15,6 @@ #![allow(clippy::or_fun_call)] -#[rustfmt::skip] -mod errors; mod tests; use std::ops::{BitOr, BitOrAssign}; @@ -37,8 +35,6 @@ use enumflags2::{bitflags, make_bitflags, BitFlags}; use crate::JsParserOptions; -use self::errors::invalid_digits_after_unicode_escape_sequence; - // The first utf8 byte of every valid unicode whitespace char, used for short circuiting whitespace checks const UNICODE_WHITESPACE_STARTS: [u8; 5] = [ // NBSP @@ -633,23 +629,32 @@ impl<'src> JsLexer<'src> { /// This returns a `u32` since not all escape sequences produce valid /// Unicode characters. fn read_unicode_escape(&mut self) -> Result { + let start = self.position - 1; self.assert_byte(b'u'); for _ in 0..4 { match self.next_byte_bounded() { None => { - let err = invalid_digits_after_unicode_escape_sequence( - self.position - 1, - self.position + 1, - ); + let err = ParseDiagnostic::new( + "Unterminated unicode escape sequence.", + start..(self.position + 1), + ) + .with_hint("Expected a valid unicode escape sequence."); self.push_diagnostic(err); return Err(()); } Some(b) if !b.is_ascii_hexdigit() => { - let err = invalid_digits_after_unicode_escape_sequence( - self.position - 1, - self.position + 1, - ); + let start = self.position; + // `b` can be a unicode character. + // To have a correct range, we have to eat the whole character. + if !b.is_ascii() { + self.advance_char_unchecked(); + } + let err = ParseDiagnostic::new( + "Invalid digit in unicode escape sequence.", + start..self.position, + ) + .with_hint("Expected a valid unicode escape sequence."); self.push_diagnostic(err); return Err(()); } diff --git a/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js new file mode 100644 index 000000000000..6dce84ad5ea3 --- /dev/null +++ b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js @@ -0,0 +1 @@ +\u1 \ No newline at end of file diff --git a/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js.snap b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js.snap new file mode 100644 index 000000000000..20f340c852ea --- /dev/null +++ b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_unterminated.js.snap @@ -0,0 +1,57 @@ +--- +source: crates/biome_js_parser/tests/spec_test.rs +expression: snapshot +snapshot_kind: text +--- +## Input + +```jsx +\u1 +``` + + +## AST + +``` +JsModule { + bom_token: missing (optional), + interpreter_token: missing (optional), + directives: JsDirectiveList [], + items: JsModuleItemList [ + JsBogusStatement { + items: [ + ERROR_TOKEN@0..3 "\\u1" [] [], + ], + }, + ], + eof_token: EOF@3..3 "" [] [], +} +``` + +## CST + +``` +0: JS_MODULE@0..3 + 0: (empty) + 1: (empty) + 2: JS_DIRECTIVE_LIST@0..0 + 3: JS_MODULE_ITEM_LIST@0..3 + 0: JS_BOGUS_STATEMENT@0..3 + 0: ERROR_TOKEN@0..3 "\\u1" [] [] + 4: EOF@3..3 "" [] [] + +``` + +## Diagnostics + +``` +unicode_escape_unterminated.js:1:1 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + × Unterminated unicode escape sequence. + + > 1 │ \u1 + │ ^^^ + + i Expected a valid unicode escape sequence. + +``` diff --git a/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js new file mode 100644 index 000000000000..5bd98199c6e9 --- /dev/null +++ b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js @@ -0,0 +1 @@ +const v = \u1¡1 \ No newline at end of file diff --git a/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js.snap b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js.snap new file mode 100644 index 000000000000..9259aa1ffeda --- /dev/null +++ b/crates/biome_js_parser/tests/js_test_suite/error/unicode_escape_with_unicode_char.js.snap @@ -0,0 +1,118 @@ +--- +source: crates/biome_js_parser/tests/spec_test.rs +expression: snapshot +snapshot_kind: text +--- +## Input + +```jsx +const v = \u1¡1 +``` + + +## AST + +``` +JsModule { + bom_token: missing (optional), + interpreter_token: missing (optional), + directives: JsDirectiveList [], + items: JsModuleItemList [ + JsBogusStatement { + items: [ + JsBogus { + items: [ + CONST_KW@0..6 "const" [] [Whitespace(" ")], + JsBogus { + items: [ + JsBogus { + items: [ + JsIdentifierBinding { + name_token: IDENT@6..8 "v" [] [Whitespace(" ")], + }, + JsBogus { + items: [ + EQ@8..10 "=" [] [Whitespace(" ")], + JsBogus { + items: [ + ERROR_TOKEN@10..15 "\\u1¡" [] [], + ], + }, + ], + }, + ], + }, + ], + }, + ], + }, + ], + }, + JsExpressionStatement { + expression: JsNumberLiteralExpression { + value_token: JS_NUMBER_LITERAL@15..16 "1" [] [], + }, + semicolon_token: missing (optional), + }, + ], + eof_token: EOF@16..16 "" [] [], +} +``` + +## CST + +``` +0: JS_MODULE@0..16 + 0: (empty) + 1: (empty) + 2: JS_DIRECTIVE_LIST@0..0 + 3: JS_MODULE_ITEM_LIST@0..16 + 0: JS_BOGUS_STATEMENT@0..15 + 0: JS_BOGUS@0..15 + 0: CONST_KW@0..6 "const" [] [Whitespace(" ")] + 1: JS_BOGUS@6..15 + 0: JS_BOGUS@6..15 + 0: JS_IDENTIFIER_BINDING@6..8 + 0: IDENT@6..8 "v" [] [Whitespace(" ")] + 1: JS_BOGUS@8..15 + 0: EQ@8..10 "=" [] [Whitespace(" ")] + 1: JS_BOGUS@10..15 + 0: ERROR_TOKEN@10..15 "\\u1¡" [] [] + 1: JS_EXPRESSION_STATEMENT@15..16 + 0: JS_NUMBER_LITERAL_EXPRESSION@15..16 + 0: JS_NUMBER_LITERAL@15..16 "1" [] [] + 1: (empty) + 4: EOF@16..16 "" [] [] + +``` + +## Diagnostics + +``` +unicode_escape_with_unicode_char.js:1:14 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + × Invalid digit in unicode escape sequence. + + > 1 │ const v = \u1¡1 + │ ^ + + i Expected a valid unicode escape sequence. + +unicode_escape_with_unicode_char.js:1:15 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + + × Expected a semicolon or an implicit semicolon after a statement, but found none + + > 1 │ const v = \u1¡1 + │ ^ + + i An explicit or implicit semicolon is expected here... + + > 1 │ const v = \u1¡1 + │ ^ + + i ...Which is required to end this statement + + > 1 │ const v = \u1¡1 + │ ^^^^^^^^^^^^^^ + +``` diff --git a/crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js b/crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js new file mode 100644 index 000000000000..5e7c2557c485 --- /dev/null +++ b/crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js @@ -0,0 +1,2 @@ +tagged`\u0`; +tagged`\u1¡1`; \ No newline at end of file diff --git a/crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js.snap b/crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js.snap new file mode 100644 index 000000000000..d61afcda10a4 --- /dev/null +++ b/crates/biome_js_parser/tests/js_test_suite/ok/unicode_escape_in_tagged_template.js.snap @@ -0,0 +1,97 @@ +--- +source: crates/biome_js_parser/tests/spec_test.rs +expression: snapshot +snapshot_kind: text +--- +## Input + +```jsx +tagged`\u0`; +tagged`\u1¡1`; +``` + + +## AST + +``` +JsModule { + bom_token: missing (optional), + interpreter_token: missing (optional), + directives: JsDirectiveList [], + items: JsModuleItemList [ + JsExpressionStatement { + expression: JsTemplateExpression { + tag: JsIdentifierExpression { + name: JsReferenceIdentifier { + value_token: IDENT@0..6 "tagged" [] [], + }, + }, + type_arguments: missing (optional), + l_tick_token: BACKTICK@6..7 "`" [] [], + elements: JsTemplateElementList [ + JsTemplateChunkElement { + template_chunk_token: TEMPLATE_CHUNK@7..10 "\\u0" [] [], + }, + ], + r_tick_token: BACKTICK@10..11 "`" [] [], + }, + semicolon_token: SEMICOLON@11..12 ";" [] [], + }, + JsExpressionStatement { + expression: JsTemplateExpression { + tag: JsIdentifierExpression { + name: JsReferenceIdentifier { + value_token: IDENT@12..19 "tagged" [Newline("\n")] [], + }, + }, + type_arguments: missing (optional), + l_tick_token: BACKTICK@19..20 "`" [] [], + elements: JsTemplateElementList [ + JsTemplateChunkElement { + template_chunk_token: TEMPLATE_CHUNK@20..26 "\\u1¡1" [] [], + }, + ], + r_tick_token: BACKTICK@26..27 "`" [] [], + }, + semicolon_token: SEMICOLON@27..28 ";" [] [], + }, + ], + eof_token: EOF@28..28 "" [] [], +} +``` + +## CST + +``` +0: JS_MODULE@0..28 + 0: (empty) + 1: (empty) + 2: JS_DIRECTIVE_LIST@0..0 + 3: JS_MODULE_ITEM_LIST@0..28 + 0: JS_EXPRESSION_STATEMENT@0..12 + 0: JS_TEMPLATE_EXPRESSION@0..11 + 0: JS_IDENTIFIER_EXPRESSION@0..6 + 0: JS_REFERENCE_IDENTIFIER@0..6 + 0: IDENT@0..6 "tagged" [] [] + 1: (empty) + 2: BACKTICK@6..7 "`" [] [] + 3: JS_TEMPLATE_ELEMENT_LIST@7..10 + 0: JS_TEMPLATE_CHUNK_ELEMENT@7..10 + 0: TEMPLATE_CHUNK@7..10 "\\u0" [] [] + 4: BACKTICK@10..11 "`" [] [] + 1: SEMICOLON@11..12 ";" [] [] + 1: JS_EXPRESSION_STATEMENT@12..28 + 0: JS_TEMPLATE_EXPRESSION@12..27 + 0: JS_IDENTIFIER_EXPRESSION@12..19 + 0: JS_REFERENCE_IDENTIFIER@12..19 + 0: IDENT@12..19 "tagged" [Newline("\n")] [] + 1: (empty) + 2: BACKTICK@19..20 "`" [] [] + 3: JS_TEMPLATE_ELEMENT_LIST@20..26 + 0: JS_TEMPLATE_CHUNK_ELEMENT@20..26 + 0: TEMPLATE_CHUNK@20..26 "\\u1¡1" [] [] + 4: BACKTICK@26..27 "`" [] [] + 1: SEMICOLON@27..28 ";" [] [] + 4: EOF@28..28 "" [] [] + +```