Skip to content

Commit

Permalink
fix(html): Improve triangle bracket handling in text tokenizer
Browse files Browse the repository at this point in the history
Script tags require special handling of triangle brackets to allow
them to be used as comparison operators in JS.

Closes #9
  • Loading branch information
James-LG committed Jul 21, 2022
1 parent 40a90c1 commit d1e5827
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 21 deletions.
40 changes: 40 additions & 0 deletions src/html/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,46 @@ mod tests {
assert_tag(&result, key, "script", Some(attributes));
}

#[test]
fn parse_should_handle_text_with_triangle_brackets() {
// arrange
let html = r###"<div>foo > bar < baz</div>"###;

// act
let result = parse(html).unwrap();

// assert
// <div>
let key = result.root_node;
let children = assert_tag(&result, key, "div", None);

// <div> -> -> text()
{
let key = children[0];
assert_text(&result, key, "foo > bar < baz");
}
}

#[test]
fn parse_should_include_tag_like_text_in_script_tags() {
// arrange
let html = r###"<script>foo<bar></baz></script>"###;

// act
let result = parse(html).unwrap();

// assert
// <script>
let key = result.root_node;
let children = assert_tag(&result, key, "script", None);

// <script> -> -> text()
{
let key = children[0];
assert_text(&result, key, "foo<bar></baz>");
}
}

#[test]
fn parse_should_handle_single_tags() {
// arrange
Expand Down
123 changes: 106 additions & 17 deletions src/html/tokenizer/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use super::Token;
/// Has additional checks to make sure it is not an end tag.
pub fn is_start_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
if let (Some('<'), Some(c2)) = (pointer.current(), pointer.peek()) {
if *c2 != '/' {
let c2 = *c2;
if c2 != '/' && !c2.is_whitespace() {
let mut name: Vec<char> = Vec::new();
loop {
match pointer.next() {
Expand Down Expand Up @@ -213,29 +214,57 @@ pub fn is_identifier(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> O
None
}

lazy_static! {
/// List of characters that end a Text [Symbol](Symbol).
static ref INAVLID_TEXT_CHARS: Vec<char> = vec!['<', '>'];
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Text [Symbol](Symbol).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Text is defined as any text outside a tag definition.
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<Token> {
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_tag: bool) -> Option<Token> {
if has_open_tag {
return None;
}

if let Some(c) = pointer.current() {
if !INAVLID_TEXT_CHARS.contains(c) {
let start_index = pointer.index;
let c = *c;
let start_index = pointer.index;
// If character is not '<', or if it is, make sure its not a start or end tag.
if c != '<' || (is_end_tag(pointer).is_none() && is_start_tag(pointer).is_none()) {
let mut has_non_whitespace = !c.is_whitespace();

let mut buffer: Vec<char> = vec![*c];
let mut buffer: Vec<char> = vec![c];
loop {
match pointer.next() {
Some(c) if INAVLID_TEXT_CHARS.contains(c) => break,
Some('<') => {
let pointer_index = pointer.index;

// In a script tag the *only* thing that can end a text is an end script tag.
if in_script_tag {

if let Some(end_tag) = is_end_tag(pointer) {
match end_tag {
Token::EndTag(end_tag) => {
if end_tag == "script" {
// We can finally close the text
pointer.index = pointer_index;
break;
}
},
token => panic!("is_end_tag returned {:?} instead of Token::EndTag", token)
}
}
} else {
// The current tag can end or a new tag can be started mid-text.
if is_end_tag(pointer).is_some() || is_start_tag(pointer).is_some() {
// Start or end tag was matched meaning we've moved the pointer up;
// reset it now so it can be matched in the main tokenizer loop.
pointer.index = pointer_index;
break;
}
}

// If the loop hasn't been broken at this point, add the '<' and move on.
pointer.index = pointer_index;
buffer.push('<');
},
Some(c) => {
if !c.is_whitespace() {
has_non_whitespace = true;
Expand All @@ -255,8 +284,12 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<
pointer.index = start_index;
return None;
}
} else {
// Start or end tag was matched meaning we've moved the pointer up;
// reset it now so it can be matched in the main tokenizer loop.
pointer.index = start_index;
return None;
}
return None;
}
None
}
Expand Down Expand Up @@ -555,24 +588,80 @@ mod tests {
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false).unwrap();
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo bar")), result);
assert_eq!(7, pointer.index);
}

#[test]
fn is_text_not_move_pointer_if_not_found() {
fn is_text_not_move_pointer_if_end_tag() {
// arrange
let chars: Vec<char> = "<".chars().collect();
let chars: Vec<char> = "</foo>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false);
let result = is_text(&mut pointer, false, false);

// assert
assert!(matches!(result, None));
assert_eq!(None, result);
assert_eq!(0, pointer.index);
}

#[test]
fn is_text_not_move_pointer_if_start_tag() {
// arrange
let chars: Vec<char> = "<foo>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false);

// assert
assert_eq!(None, result);
assert_eq!(0, pointer.index);
}

#[test]
fn is_text_should_not_end_on_floating_triangle_bracket() {
// arrange
let chars: Vec<char> = "foo > bar < baz".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo > bar < baz")), result);
assert_eq!(15, pointer.index);
}

#[test]
fn is_text_should_end_on_tag_end() {
// arrange
let chars: Vec<char> = "foo > bar </baz>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo > bar ")), result);
assert_eq!(10, pointer.index);
}

#[test]
fn is_text_should_allow_tag_like_strings_in_script_tags() {
// arrange
let chars: Vec<char> = "foo<bar></baz>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, true).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo<bar></baz>")), result);
assert_eq!(14, pointer.index);
}
}
16 changes: 15 additions & 1 deletion src/html/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,32 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
let mut pointer = VecPointerRef::new(&chars);

let mut has_open_tag = false;
let mut in_script_tag = false;

while pointer.has_next() {
if let Some(s) = helpers::is_comment(&mut pointer) {
symbols.push(s);
} else if let Some(s) = helpers::is_start_tag(&mut pointer) {
has_open_tag = true;

// Check if this is a "script" tag so the flag can be set if required
match &s {
Token::StartTag(start_tag) => {
if start_tag == "script" {
in_script_tag = true;
}
},
token => panic!("is_start_tag returned {:?} instead of Token::StartTag", token)
}

symbols.push(s);
} else if let Some(s) = helpers::is_end_tag(&mut pointer) {
has_open_tag = true;
in_script_tag = false;
symbols.push(s);
} else if let Some(s) = helpers::is_tag_close_and_end(&mut pointer) {
has_open_tag = false;
in_script_tag = false;
symbols.push(s);
} else if let Some(s) = helpers::is_tag_close(&mut pointer) {
has_open_tag = false;
Expand All @@ -38,7 +52,7 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
symbols.push(s);
} else if let Some(s) = helpers::is_identifier(&mut pointer, has_open_tag) {
symbols.push(s);
} else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag) {
} else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag, in_script_tag) {
symbols.push(s);
} else {
if let Some(c) = pointer.current() {
Expand Down
2 changes: 1 addition & 1 deletion src/xpath/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ pub enum XpathAxes {
impl XpathAxes {
/// Returns the [XpathAxes] corresponding to the given `text`, or [None] if the string
/// is not a known axis.
pub fn from_str(text: &str) -> Option<Self> {
pub fn try_from_str(text: &str) -> Option<Self> {
match text {
"parent" => Some(XpathAxes::Parent),
"child" => Some(XpathAxes::Child),
Expand Down
4 changes: 2 additions & 2 deletions src/xpath/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,8 @@ fn inner_parse(text: &str) -> Result<Vec<XpathElement>, ParseError> {
fn parse_axis_selector(elements: &mut Vec<XpathElement>) -> Result<(), ParseError> {
let last_item = elements.pop().ok_or(ParseError::MissingAxis)?;
let axis = match last_item {
XpathElement::Tag(last_tag) => XpathAxes::from_str(last_tag.as_str())
.ok_or_else(|| ParseError::UnknownAxisType(last_tag))?,
XpathElement::Tag(last_tag) => XpathAxes::try_from_str(last_tag.as_str())
.ok_or(ParseError::UnknownAxisType(last_tag))?,
_ => return Err(ParseError::MissingAxis),
};
elements.push(XpathElement::Axis(axis));
Expand Down

0 comments on commit d1e5827

Please sign in to comment.