Skip to content

Commit

Permalink
feat(parser/html): lex and parse unquoted attribute values
Browse files Browse the repository at this point in the history
  • Loading branch information
dyc3 committed Sep 17, 2024
1 parent 7e6de58 commit 5e73499
Show file tree
Hide file tree
Showing 10 changed files with 1,082 additions and 3 deletions.
46 changes: 46 additions & 0 deletions crates/biome_html_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lexes the next token while the lexer is in the attribute value context.
///
/// The first byte selects how the token is lexed: a quoted string opened by
/// `'` or `"`, one of the angle brackets `<` / `>`, a run of whitespace or
/// line breaks, or — for any other byte — an unquoted attribute value.
fn consume_token_attribute_value(&mut self, current: u8) -> HtmlSyntaxKind {
    match current {
        b'"' | b'\'' => self.consume_string_literal(current),
        b'>' => self.consume_byte(T![>]),
        b'<' => self.consume_byte(T![<]),
        b' ' | b'\t' | b'\r' | b'\n' => self.consume_newline_or_whitespaces(),
        _ => self.consume_unquoted_string_literal(),
    }
}

/// Bumps the current byte and creates a lexed token of the passed in kind.
#[inline]
fn consume_byte(&mut self, tok: HtmlSyntaxKind) -> HtmlSyntaxKind {
Expand Down Expand Up @@ -233,6 +243,41 @@ impl<'src> HtmlLexer<'src> {
}
}

/// Lexes an attribute value that is written without surrounding quotes.
///
/// Bytes are consumed one at a time until the scan reaches whitespace, `>`,
/// a character this lexer rejects inside an unquoted value, a non-ASCII byte,
/// or the end of the input.
///
/// Returns `HTML_STRING_LITERAL` when at least one byte was consumed and the
/// scan did not stop on a rejected character. Otherwise a diagnostic covering
/// the current character is recorded and the token kind is whatever
/// `consume_unexpected_character` yields.
///
/// See: https://html.spec.whatwg.org/#attributes-2 under "Unquoted attribute value syntax"
fn consume_unquoted_string_literal(&mut self) -> HtmlSyntaxKind {
    let mut consumed_any = false;
    let mut hit_rejected = false;

    while let Some(byte) = self.current_byte() {
        // Whitespace and `>` end the value cleanly.
        if matches!(byte, b' ' | b'\t' | b'\r' | b'\n' | b'>') {
            break;
        }
        // These characters are never accepted inside an unquoted value.
        if matches!(byte, b'`' | b'<' | b'=' | b'"' | b'\'' | b'?') {
            hit_rejected = true;
            break;
        }
        // Only ASCII bytes are consumed here; anything else stops the scan.
        if !byte.is_ascii() {
            break;
        }
        self.advance(1);
        consumed_any = true;
    }

    if hit_rejected || !consumed_any {
        let offending = self.current_char_unchecked();
        let start = self.text_position();
        self.push_diagnostic(ParseDiagnostic::new(
            "Unexpected character in unquoted attribute value",
            start..start + offending.text_len(),
        ));
        return self.consume_unexpected_character();
    }

    HTML_STRING_LITERAL
}

fn consume_l_angle(&mut self) -> HtmlSyntaxKind {
self.assert_byte(b'<');

Expand Down Expand Up @@ -385,6 +430,7 @@ impl<'src> Lexer<'src> for HtmlLexer<'src> {
Some(current) => match context {
HtmlLexContext::Regular => self.consume_token(current),
HtmlLexContext::OutsideTag => self.consume_token_outside_tag(current),
HtmlLexContext::AttributeValue => self.consume_token_attribute_value(current),
},
None => EOF,
}
Expand Down
35 changes: 35 additions & 0 deletions crates/biome_html_parser/src/lexer/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,38 @@ fn html_text_spaces_with_lines() {
HTML_LITERAL: 18,
}
}

/// A single bare word lexed in the attribute value context must come out as
/// one `HTML_STRING_LITERAL` token spanning all 5 bytes of the input.
#[test]
fn unquoted_attribute_value_1() {
assert_lex! {
HtmlLexContext::AttributeValue,
"value",
HTML_STRING_LITERAL: 5,
}
}

/// Whitespace and line breaks end an unquoted value and are lexed as their
/// own tokens: each `value` word is a 5-byte `HTML_STRING_LITERAL`, the space
/// and the tab are 1-byte `WHITESPACE` tokens, and the trailing `\n` is a
/// 1-byte `NEWLINE`.
#[test]
fn unquoted_attribute_value_2() {
assert_lex! {
HtmlLexContext::AttributeValue,
"value value\tvalue\n",
HTML_STRING_LITERAL: 5,
WHITESPACE: 1,
HTML_STRING_LITERAL: 5,
WHITESPACE: 1,
HTML_STRING_LITERAL: 5,
NEWLINE: 1,
}
}

/// Characters that cannot start an unquoted value must not produce an
/// `HTML_STRING_LITERAL`.
///
/// Expected tokens for the input `?<='"` followed by a backtick:
/// - `?` is a 1-byte `ERROR_TOKEN`,
/// - `<` is still lexed as `L_ANGLE`,
/// - `=` is a 1-byte `ERROR_TOKEN`,
/// - the remaining three bytes form a single 3-byte `ERROR_TOKEN`
///   (presumably because `'` opens a quoted string literal that is never
///   closed — confirm against `consume_string_literal`).
#[test]
fn unquoted_attribute_value_invalid_chars() {
assert_lex! {
HtmlLexContext::AttributeValue,
"?<='\"`",
ERROR_TOKEN: 1,
L_ANGLE: 1,
ERROR_TOKEN: 1,
ERROR_TOKEN: 3,
}
}
6 changes: 3 additions & 3 deletions crates/biome_html_parser/src/syntax/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ fn parse_literal(p: &mut HtmlParser) -> ParsedSyntax {
Present(m.complete(p, HTML_NAME))
}

fn parse_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
fn parse_attribute_string_literal(p: &mut HtmlParser) -> ParsedSyntax {
if !p.at(HTML_STRING_LITERAL) {
return Absent;
}
Expand All @@ -226,7 +226,7 @@ fn parse_attribute_initializer(p: &mut HtmlParser) -> ParsedSyntax {
return Absent;
}
let m = p.start();
p.bump(T![=]);
parse_string_literal(p).or_add_diagnostic(p, expected_initializer);
p.bump_with_context(T![=], HtmlLexContext::AttributeValue);
parse_attribute_string_literal(p).or_add_diagnostic(p, expected_initializer);
Present(m.complete(p, HTML_ATTRIBUTE_INITIALIZER_CLAUSE))
}
4 changes: 4 additions & 0 deletions crates/biome_html_parser/src/token_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ pub(crate) enum HtmlLexContext {
///
/// The exceptions being `<` which indicates the start of a tag, and `>` which is invalid syntax if not preceded with a `<`.
OutsideTag,
/// When the parser encounters a `=` token (the beginning of the attribute initializer clause), it switches to this context.
///
/// This is because attribute values can start and end with a `"` or `'` character, or be unquoted, and the lexer needs to know to start lexing a string literal.
AttributeValue,
}

impl LexContext for HtmlLexContext {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<div>
<div class== >foo</div>
<div class=? >foo</div>
</div>
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
---
source: crates/biome_html_parser/tests/spec_test.rs
expression: snapshot
---
## Input

```html
<div>
<div class== >foo</div>
<div class=? >foo</div>
</div>

```


## AST

```
HtmlRoot {
bom_token: missing (optional),
directive: missing (optional),
html: HtmlElement {
opening_element: HtmlOpeningElement {
l_angle_token: [email protected] "<" [] [],
name: HtmlName {
value_token: [email protected] "div" [] [],
},
attributes: HtmlAttributeList [],
r_angle_token: [email protected] ">" [] [],
},
children: HtmlElementList [
HtmlBogusElement {
items: [
HtmlBogus {
items: [
[email protected] "<" [Newline("\n"), Whitespace("\t")] [],
HtmlName {
value_token: [email protected] "div" [] [Whitespace(" ")],
},
HtmlBogus {
items: [
HtmlAttribute {
name: HtmlName {
value_token: [email protected] "class" [] [],
},
initializer: HtmlAttributeInitializerClause {
eq_token: [email protected] "=" [] [],
value: missing (required),
},
},
HtmlBogusElement {
items: [
[email protected] "=" [] [Whitespace(" ")],
],
},
],
},
[email protected] ">" [] [],
],
},
HtmlElementList [
HtmlContent {
value_token: [email protected] "foo" [] [],
},
],
HtmlClosingElement {
l_angle_token: [email protected] "<" [] [],
slash_token: [email protected] "/" [] [],
name: HtmlName {
value_token: [email protected] "div" [] [],
},
r_angle_token: [email protected] ">" [] [],
},
],
},
HtmlBogusElement {
items: [
HtmlBogus {
items: [
[email protected] "<" [Newline("\n"), Whitespace("\t")] [],
HtmlName {
value_token: [email protected] "div" [] [Whitespace(" ")],
},
HtmlBogus {
items: [
HtmlAttribute {
name: HtmlName {
value_token: [email protected] "class" [] [],
},
initializer: HtmlAttributeInitializerClause {
eq_token: [email protected] "=" [] [],
value: missing (required),
},
},
HtmlBogusElement {
items: [
[email protected] "?" [] [Whitespace(" ")],
],
},
],
},
[email protected] ">" [] [],
],
},
HtmlElementList [
HtmlContent {
value_token: [email protected] "foo" [] [],
},
],
HtmlClosingElement {
l_angle_token: [email protected] "<" [] [],
slash_token: [email protected] "/" [] [],
name: HtmlName {
value_token: [email protected] "div" [] [],
},
r_angle_token: [email protected] ">" [] [],
},
],
},
],
closing_element: HtmlClosingElement {
l_angle_token: [email protected] "<" [Newline("\n")] [],
slash_token: [email protected] "/" [] [],
name: HtmlName {
value_token: [email protected] "div" [] [],
},
r_angle_token: [email protected] ">" [] [],
},
},
eof_token: [email protected] "" [Newline("\n")] [],
}
```

## CST

```
0: [email protected]
0: (empty)
1: (empty)
2: [email protected]
0: [email protected]
0: [email protected] "<" [] []
1: [email protected]
0: [email protected] "div" [] []
2: [email protected]
3: [email protected] ">" [] []
1: [email protected]
0: [email protected]
0: [email protected]
0: [email protected] "<" [Newline("\n"), Whitespace("\t")] []
1: [email protected]
0: [email protected] "div" [] [Whitespace(" ")]
2: [email protected]
0: [email protected]
0: [email protected]
0: [email protected] "class" [] []
1: [email protected]
0: [email protected] "=" [] []
1: (empty)
1: [email protected]
0: [email protected] "=" [] [Whitespace(" ")]
3: [email protected] ">" [] []
1: [email protected]
0: [email protected]
0: [email protected] "foo" [] []
2: [email protected]
0: [email protected] "<" [] []
1: [email protected] "/" [] []
2: [email protected]
0: [email protected] "div" [] []
3: [email protected] ">" [] []
1: [email protected]
0: [email protected]
0: [email protected] "<" [Newline("\n"), Whitespace("\t")] []
1: [email protected]
0: [email protected] "div" [] [Whitespace(" ")]
2: [email protected]
0: [email protected]
0: [email protected]
0: [email protected] "class" [] []
1: [email protected]
0: [email protected] "=" [] []
1: (empty)
1: [email protected]
0: [email protected] "?" [] [Whitespace(" ")]
3: [email protected] ">" [] []
1: [email protected]
0: [email protected]
0: [email protected] "foo" [] []
2: [email protected]
0: [email protected] "<" [] []
1: [email protected] "/" [] []
2: [email protected]
0: [email protected] "div" [] []
3: [email protected] ">" [] []
2: [email protected]
0: [email protected] "<" [Newline("\n")] []
1: [email protected] "/" [] []
2: [email protected]
0: [email protected] "div" [] []
3: [email protected] ">" [] []
3: [email protected] "" [Newline("\n")] []

```

## Diagnostics

```
invalid-unqouted-value1.html:2:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

× Unexpected character in unquoted attribute value

1 │ <div>
> 2 │ <div class== >foo</div>
│ ^
3 │ <div class=? >foo</div>
4 │ </div>

invalid-unqouted-value1.html:2:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

× Unexpected character `=`

1 │ <div>
> 2 │ <div class== >foo</div>
│ ^
3 │ <div class=? >foo</div>
4 │ </div>

invalid-unqouted-value1.html:3:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

× Unexpected character in unquoted attribute value

1 │ <div>
2 │ <div class== >foo</div>
> 3 │ <div class=? >foo</div>
│ ^
4 │ </div>
5 │

invalid-unqouted-value1.html:3:13 parse ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

× Unexpected character `?`

1 │ <div>
2 │ <div class== >foo</div>
> 3 │ <div class=? >foo</div>
│ ^
4 │ </div>
5 │

```
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<div>
<div class=foo"bar >foo</div>
<div class=foo'bar >foo</div>
</div>
Loading

0 comments on commit 5e73499

Please sign in to comment.