Skip to content

Commit

Permalink
fix(parser/html): fix incorrect lexing of <html> elements (#3840)
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 authored Sep 9, 2024
1 parent bdf8547 commit 4bc409d
Show file tree
Hide file tree
Showing 6 changed files with 409 additions and 6 deletions.
21 changes: 15 additions & 6 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ pub(crate) struct HtmlLexer<'src> {
after_newline: bool,

unicode_bom_length: usize,
+
+ after_doctype: bool,
}

impl<'src> HtmlLexer<'src> {
Expand All @@ -47,13 +49,17 @@ impl<'src> HtmlLexer<'src> {
after_newline: false,
current_flags: TokenFlags::empty(),
unicode_bom_length: 0,
+ after_doctype: false,
}
}
fn consume_token(&mut self, current: u8) -> HtmlSyntaxKind {
match current {
b'\n' | b'\r' | b'\t' | b' ' => self.consume_newline_or_whitespaces(),
b'<' => self.consume_l_angle(),
- b'>' => self.consume_byte(T![>]),
+ b'>' => {
+ self.after_doctype = false;
+ self.consume_byte(T![>])
+ }
b'/' => self.consume_byte(T![/]),
b'!' => self.consume_byte(T![!]),
b'=' => self.consume_byte(T![=]),
Expand Down Expand Up @@ -148,8 +154,11 @@ impl<'src> HtmlLexer<'src> {
}

match &buffer[..len] {
- b"doctype" | b"DOCTYPE" => DOCTYPE_KW,
- b"html" | b"HTML" => HTML_KW,
+ b"doctype" | b"DOCTYPE" => {
+ self.after_doctype = true;
+ DOCTYPE_KW
+ }
+ b"html" | b"HTML" if self.after_doctype => HTML_KW,
_ => HTML_LITERAL,
}
}
Expand Down Expand Up @@ -242,10 +251,10 @@ impl<'src> HtmlLexer<'src> {
fn consume_l_angle(&mut self) -> HtmlSyntaxKind {
self.assert_byte(b'<');

- if !self.at_start_comment() {
- self.consume_byte(T![<])
- } else {
+ if self.at_start_comment() {
self.consume_comment()
+ } else {
+ self.consume_byte(T![<])
}
}

Expand Down
14 changes: 14 additions & 0 deletions crates/biome_html_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,17 @@ fn element_with_attributes() {
R_ANGLE: 1,
}
}

// Lexer check for a bare `<html>` element: the input contains no doctype
// directive, and both occurrences of `html` are expected to come out as plain
// HTML_LITERAL tokens rather than the HTML_KW keyword token.
// NOTE(review): the number after each kind is presumably the token's length
// in bytes — confirm against the `assert_lex!` macro definition.
#[test]
fn html_element() {
assert_lex! {
"<html></html>",
// Opening tag: `<html>`
L_ANGLE: 1,
HTML_LITERAL: 4,
R_ANGLE: 1,
// Closing tag: `</html>`
L_ANGLE: 1,
SLASH: 1,
HTML_LITERAL: 4,
R_ANGLE: 1,
}
}
10 changes: 10 additions & 0 deletions crates/biome_html_parser/tests/html_specs/ok/hello-world.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!DOCTYPE html>
<html>
<head>
<title>Hello, world!</title>
</head>
<body>
<h1>Hello, world!</h1>
<p>This is a test HTML file.</p>
</body>
</html>
291 changes: 291 additions & 0 deletions crates/biome_html_parser/tests/html_specs/ok/hello-world.html.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<!DOCTYPE html>
<html>
<head>
<title>Hello, world!</title>
</head>
<body>
<h1>Hello, world!</h1>
<p>This is a test HTML file.</p>
</body>
</html>
```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: HtmlDirective {
l_angle_token: L_ANGLE@0..1 "<" [] [],
excl_token: BANG@1..2 "!" [] [],
doctype_token: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")],
html_token: HTML_KW@10..14 "html" [] [],
quirk_token: missing (optional),
public_id_token: missing (optional),
system_id_token: missing (optional),
r_angle_token: R_ANGLE@14..15 ">" [] [],
},
html: HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@15..17 "<" [Newline("\n")] [],
name: HtmlName {
value_token: HTML_LITERAL@17..21 "html" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@21..22 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@22..24 "\n\t" [] [],
},
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@24..25 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@25..29 "head" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@29..30 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@30..33 "\n\t\t" [] [],
},
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@33..34 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@34..39 "title" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@39..40 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@40..53 "Hello, world!" [] [],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@53..54 "<" [] [],
slash_token: SLASH@54..55 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@55..60 "title" [] [],
},
r_angle_token: R_ANGLE@60..61 ">" [] [],
},
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@61..64 "<" [Newline("\n"), Whitespace("\t")] [],
slash_token: SLASH@64..65 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@65..69 "head" [] [],
},
r_angle_token: R_ANGLE@69..70 ">" [] [],
},
},
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@70..73 "<" [Newline("\n"), Whitespace("\t")] [],
name: HtmlName {
value_token: HTML_LITERAL@73..77 "body" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@77..78 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@78..81 "\n\t\t" [] [],
},
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@81..82 "<" [] [],
name: HtmlName {
value_token: HTML_LITERAL@82..84 "h1" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@84..85 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@85..98 "Hello, world!" [] [],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@98..99 "<" [] [],
slash_token: SLASH@99..100 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@100..102 "h1" [] [],
},
r_angle_token: R_ANGLE@102..103 ">" [] [],
},
},
HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: L_ANGLE@103..107 "<" [Newline("\n"), Whitespace("\t\t")] [],
name: HtmlName {
value_token: HTML_LITERAL@107..108 "p" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: R_ANGLE@108..109 ">" [] [],
},
children: HtmlElementList [
HtmlContent {
value_token: HTML_LITERAL@109..134 "This is a test HTML file." [] [],
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@134..135 "<" [] [],
slash_token: SLASH@135..136 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@136..137 "p" [] [],
},
r_angle_token: R_ANGLE@137..138 ">" [] [],
},
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@138..141 "<" [Newline("\n"), Whitespace("\t")] [],
slash_token: SLASH@141..142 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@142..146 "body" [] [],
},
r_angle_token: R_ANGLE@146..147 ">" [] [],
},
},
],
closing_element: HtmlClosingElement {
l_angle_token: L_ANGLE@147..149 "<" [Newline("\n")] [],
slash_token: SLASH@149..150 "/" [] [],
name: HtmlName {
value_token: HTML_LITERAL@150..154 "html" [] [],
},
r_angle_token: R_ANGLE@154..155 ">" [] [],
},
},
eof_token: EOF@155..156 "" [Newline("\n")] [],
}
```

## CST

```
0: HTML_ROOT@0..156
0: (empty)
1: HTML_DIRECTIVE@0..15
0: L_ANGLE@0..1 "<" [] []
1: BANG@1..2 "!" [] []
2: DOCTYPE_KW@2..10 "DOCTYPE" [] [Whitespace(" ")]
3: HTML_KW@10..14 "html" [] []
4: (empty)
5: (empty)
6: (empty)
7: R_ANGLE@14..15 ">" [] []
2: HTML_ELEMENT@15..155
0: HTML_OPENING_ELEMENT@15..22
0: L_ANGLE@15..17 "<" [Newline("\n")] []
1: HTML_NAME@17..21
0: HTML_LITERAL@17..21 "html" [] []
2: HTML_ATTRIBUTE_LIST@21..21
3: R_ANGLE@21..22 ">" [] []
1: HTML_ELEMENT_LIST@22..147
0: HTML_CONTENT@22..24
0: HTML_LITERAL@22..24 "\n\t" [] []
1: HTML_ELEMENT@24..70
0: HTML_OPENING_ELEMENT@24..30
0: L_ANGLE@24..25 "<" [] []
1: HTML_NAME@25..29
0: HTML_LITERAL@25..29 "head" [] []
2: HTML_ATTRIBUTE_LIST@29..29
3: R_ANGLE@29..30 ">" [] []
1: HTML_ELEMENT_LIST@30..61
0: HTML_CONTENT@30..33
0: HTML_LITERAL@30..33 "\n\t\t" [] []
1: HTML_ELEMENT@33..61
0: HTML_OPENING_ELEMENT@33..40
0: L_ANGLE@33..34 "<" [] []
1: HTML_NAME@34..39
0: HTML_LITERAL@34..39 "title" [] []
2: HTML_ATTRIBUTE_LIST@39..39
3: R_ANGLE@39..40 ">" [] []
1: HTML_ELEMENT_LIST@40..53
0: HTML_CONTENT@40..53
0: HTML_LITERAL@40..53 "Hello, world!" [] []
2: HTML_CLOSING_ELEMENT@53..61
0: L_ANGLE@53..54 "<" [] []
1: SLASH@54..55 "/" [] []
2: HTML_NAME@55..60
0: HTML_LITERAL@55..60 "title" [] []
3: R_ANGLE@60..61 ">" [] []
2: HTML_CLOSING_ELEMENT@61..70
0: L_ANGLE@61..64 "<" [Newline("\n"), Whitespace("\t")] []
1: SLASH@64..65 "/" [] []
2: HTML_NAME@65..69
0: HTML_LITERAL@65..69 "head" [] []
3: R_ANGLE@69..70 ">" [] []
2: HTML_ELEMENT@70..147
0: HTML_OPENING_ELEMENT@70..78
0: L_ANGLE@70..73 "<" [Newline("\n"), Whitespace("\t")] []
1: HTML_NAME@73..77
0: HTML_LITERAL@73..77 "body" [] []
2: HTML_ATTRIBUTE_LIST@77..77
3: R_ANGLE@77..78 ">" [] []
1: HTML_ELEMENT_LIST@78..138
0: HTML_CONTENT@78..81
0: HTML_LITERAL@78..81 "\n\t\t" [] []
1: HTML_ELEMENT@81..103
0: HTML_OPENING_ELEMENT@81..85
0: L_ANGLE@81..82 "<" [] []
1: HTML_NAME@82..84
0: HTML_LITERAL@82..84 "h1" [] []
2: HTML_ATTRIBUTE_LIST@84..84
3: R_ANGLE@84..85 ">" [] []
1: HTML_ELEMENT_LIST@85..98
0: HTML_CONTENT@85..98
0: HTML_LITERAL@85..98 "Hello, world!" [] []
2: HTML_CLOSING_ELEMENT@98..103
0: L_ANGLE@98..99 "<" [] []
1: SLASH@99..100 "/" [] []
2: HTML_NAME@100..102
0: HTML_LITERAL@100..102 "h1" [] []
3: R_ANGLE@102..103 ">" [] []
2: HTML_ELEMENT@103..138
0: HTML_OPENING_ELEMENT@103..109
0: L_ANGLE@103..107 "<" [Newline("\n"), Whitespace("\t\t")] []
1: HTML_NAME@107..108
0: HTML_LITERAL@107..108 "p" [] []
2: HTML_ATTRIBUTE_LIST@108..108
3: R_ANGLE@108..109 ">" [] []
1: HTML_ELEMENT_LIST@109..134
0: HTML_CONTENT@109..134
0: HTML_LITERAL@109..134 "This is a test HTML file." [] []
2: HTML_CLOSING_ELEMENT@134..138
0: L_ANGLE@134..135 "<" [] []
1: SLASH@135..136 "/" [] []
2: HTML_NAME@136..137
0: HTML_LITERAL@136..137 "p" [] []
3: R_ANGLE@137..138 ">" [] []
2: HTML_CLOSING_ELEMENT@138..147
0: L_ANGLE@138..141 "<" [Newline("\n"), Whitespace("\t")] []
1: SLASH@141..142 "/" [] []
2: HTML_NAME@142..146
0: HTML_LITERAL@142..146 "body" [] []
3: R_ANGLE@146..147 ">" [] []
2: HTML_CLOSING_ELEMENT@147..155
0: L_ANGLE@147..149 "<" [Newline("\n")] []
1: SLASH@149..150 "/" [] []
2: HTML_NAME@150..154
0: HTML_LITERAL@150..154 "html" [] []
3: R_ANGLE@154..155 ">" [] []
3: EOF@155..156 "" [Newline("\n")] []
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<div>
4 / 2 == 2
"foo"
html is cool
</div>
Loading

0 comments on commit 4bc409d

Please sign in to comment.