From 4cffb1674648b0703fba40720bf49f01a5864c6d Mon Sep 17 00:00:00 2001 From: James La Novara-Gsell Date: Thu, 21 Jul 2022 01:35:29 +0000 Subject: [PATCH] fix(html): Improve triangle bracket handling in text tokenizer Script tags require special handling of triangle brackets to allow them to be used as comparison operators in JS. Closes #9 --- Cargo.toml | 2 +- src/html/parse.rs | 40 +++++++++++ src/html/tokenizer/helpers.rs | 123 +++++++++++++++++++++++++++++----- src/html/tokenizer/mod.rs | 16 ++++- src/xpath/mod.rs | 2 +- src/xpath/parse.rs | 4 +- 6 files changed, 165 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 71ee0fd..9f7c7bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "skyscraper" -version = "0.3.0" +version = "0.3.1" authors = ["James La Novara-Gsell "] edition = "2018" description = "XPath for HTML web scraping" diff --git a/src/html/parse.rs b/src/html/parse.rs index 7bc1469..b418d9c 100644 --- a/src/html/parse.rs +++ b/src/html/parse.rs @@ -465,6 +465,46 @@ mod tests { assert_tag(&result, key, "script", Some(attributes)); } + #[test] + fn parse_should_handle_text_with_triangle_brackets() { + // arrange + let html = r###"
<div>foo > bar < baz</div>"###; + + // act + let result = parse(html).unwrap(); + + // assert + // <div>
+ let key = result.root_node; + let children = assert_tag(&result, key, "div", None); + + // <div>
-> -> text() + { + let key = children[0]; + assert_text(&result, key, "foo > bar < baz"); + } + } + + #[test] + fn parse_should_include_tag_like_text_in_script_tags() { + // arrange + let html = r###""###; + + // act + let result = parse(html).unwrap(); + + // assert + //