Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(html): Improve triangle bracket handling in text tokenizer #10

Merged
merged 1 commit into from
Jul 21, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
fix(html): Improve triangle bracket handling in text tokenizer
Script tags require special handling of triangle brackets to allow
them to be used as comparison operators in JS.

Closes #9
James-LG committed Jul 21, 2022
commit 4cffb1674648b0703fba40720bf49f01a5864c6d
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "skyscraper"
version = "0.3.0"
version = "0.3.1"
authors = ["James La Novara-Gsell <james.lanovara.gsell@gmail.com>"]
edition = "2018"
description = "XPath for HTML web scraping"
40 changes: 40 additions & 0 deletions src/html/parse.rs
Original file line number Diff line number Diff line change
@@ -465,6 +465,46 @@ mod tests {
assert_tag(&result, key, "script", Some(attributes));
}

#[test]
fn parse_should_handle_text_with_triangle_brackets() {
// arrange
let html = r###"<div>foo > bar < baz</div>"###;

// act
let result = parse(html).unwrap();

// assert
// <div>
let key = result.root_node;
let children = assert_tag(&result, key, "div", None);

// <div> -> -> text()
{
let key = children[0];
assert_text(&result, key, "foo > bar < baz");
}
}

#[test]
fn parse_should_include_tag_like_text_in_script_tags() {
// arrange
let html = r###"<script>foo<bar></baz></script>"###;

// act
let result = parse(html).unwrap();

// assert
// <script>
let key = result.root_node;
let children = assert_tag(&result, key, "script", None);

// <script> -> -> text()
{
let key = children[0];
assert_text(&result, key, "foo<bar></baz>");
}
}

#[test]
fn parse_should_handle_single_tags() {
// arrange
123 changes: 106 additions & 17 deletions src/html/tokenizer/helpers.rs
Original file line number Diff line number Diff line change
@@ -10,7 +10,8 @@ use super::Token;
/// Has additional checks to make sure it is not an end tag.
pub fn is_start_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
if let (Some('<'), Some(c2)) = (pointer.current(), pointer.peek()) {
if *c2 != '/' {
let c2 = *c2;
if c2 != '/' && !c2.is_whitespace() {
let mut name: Vec<char> = Vec::new();
loop {
match pointer.next() {
@@ -213,29 +214,57 @@ pub fn is_identifier(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> O
None
}

lazy_static! {
/// List of characters that end a Text [Symbol](Symbol).
static ref INAVLID_TEXT_CHARS: Vec<char> = vec!['<', '>'];
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Text [Symbol](Symbol).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Text is defined as any text outside a tag definition.
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<Token> {
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_tag: bool) -> Option<Token> {
if has_open_tag {
return None;
}

if let Some(c) = pointer.current() {
if !INAVLID_TEXT_CHARS.contains(c) {
let start_index = pointer.index;
let c = *c;
let start_index = pointer.index;
// If character is not '<', or if it is, make sure its not a start or end tag.
if c != '<' || (is_end_tag(pointer).is_none() && is_start_tag(pointer).is_none()) {
let mut has_non_whitespace = !c.is_whitespace();

let mut buffer: Vec<char> = vec![*c];
let mut buffer: Vec<char> = vec![c];
loop {
match pointer.next() {
Some(c) if INAVLID_TEXT_CHARS.contains(c) => break,
Some('<') => {
let pointer_index = pointer.index;

// In a script tag the *only* thing that can end a text is an end script tag.
if in_script_tag {

if let Some(end_tag) = is_end_tag(pointer) {
match end_tag {
Token::EndTag(end_tag) => {
if end_tag == "script" {
// We can finally close the text
pointer.index = pointer_index;
break;
}
},
token => panic!("is_end_tag returned {:?} instead of Token::EndTag", token)
}
}
} else {
// The current tag can end or a new tag can be started mid-text.
if is_end_tag(pointer).is_some() || is_start_tag(pointer).is_some() {
// Start or end tag was matched meaning we've moved the pointer up;
// reset it now so it can be matched in the main tokenizer loop.
pointer.index = pointer_index;
break;
}
}

// If the loop hasn't been broken at this point, add the '<' and move on.
pointer.index = pointer_index;
buffer.push('<');
},
Some(c) => {
if !c.is_whitespace() {
has_non_whitespace = true;
@@ -255,8 +284,12 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<
pointer.index = start_index;
return None;
}
} else {
// Start or end tag was matched meaning we've moved the pointer up;
// reset it now so it can be matched in the main tokenizer loop.
pointer.index = start_index;
return None;
}
return None;
}
None
}
@@ -555,24 +588,80 @@ mod tests {
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false).unwrap();
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo bar")), result);
assert_eq!(7, pointer.index);
}

#[test]
fn is_text_not_move_pointer_if_not_found() {
fn is_text_not_move_pointer_if_end_tag() {
// arrange
let chars: Vec<char> = "<".chars().collect();
let chars: Vec<char> = "</foo>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false);
let result = is_text(&mut pointer, false, false);

// assert
assert!(matches!(result, None));
assert_eq!(None, result);
assert_eq!(0, pointer.index);
}

#[test]
fn is_text_not_move_pointer_if_start_tag() {
// arrange
let chars: Vec<char> = "<foo>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false);

// assert
assert_eq!(None, result);
assert_eq!(0, pointer.index);
}

#[test]
fn is_text_should_not_end_on_floating_triangle_bracket() {
// arrange
let chars: Vec<char> = "foo > bar < baz".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo > bar < baz")), result);
assert_eq!(15, pointer.index);
}

#[test]
fn is_text_should_end_on_tag_end() {
// arrange
let chars: Vec<char> = "foo > bar </baz>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo > bar ")), result);
assert_eq!(10, pointer.index);
}

#[test]
fn is_text_should_allow_tag_like_strings_in_script_tags() {
// arrange
let chars: Vec<char> = "foo<bar></baz>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, true).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo<bar></baz>")), result);
assert_eq!(14, pointer.index);
}
}
16 changes: 15 additions & 1 deletion src/html/tokenizer/mod.rs
Original file line number Diff line number Diff line change
@@ -16,18 +16,32 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
let mut pointer = VecPointerRef::new(&chars);

let mut has_open_tag = false;
let mut in_script_tag = false;

while pointer.has_next() {
if let Some(s) = helpers::is_comment(&mut pointer) {
symbols.push(s);
} else if let Some(s) = helpers::is_start_tag(&mut pointer) {
has_open_tag = true;

// Check if this is a "script" tag so the flag can be set if required
match &s {
Token::StartTag(start_tag) => {
if start_tag == "script" {
in_script_tag = true;
}
},
token => panic!("is_start_tag returned {:?} instead of Token::StartTag", token)
}

symbols.push(s);
} else if let Some(s) = helpers::is_end_tag(&mut pointer) {
has_open_tag = true;
in_script_tag = false;
symbols.push(s);
} else if let Some(s) = helpers::is_tag_close_and_end(&mut pointer) {
has_open_tag = false;
in_script_tag = false;
symbols.push(s);
} else if let Some(s) = helpers::is_tag_close(&mut pointer) {
has_open_tag = false;
@@ -38,7 +52,7 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
symbols.push(s);
} else if let Some(s) = helpers::is_identifier(&mut pointer, has_open_tag) {
symbols.push(s);
} else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag) {
} else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag, in_script_tag) {
symbols.push(s);
} else {
if let Some(c) = pointer.current() {
2 changes: 1 addition & 1 deletion src/xpath/mod.rs
Original file line number Diff line number Diff line change
@@ -144,7 +144,7 @@ pub enum XpathAxes {
impl XpathAxes {
/// Returns the [XpathAxes] corresponding to the given `text`, or [None] if the string
/// is not a known axis.
pub fn from_str(text: &str) -> Option<Self> {
pub fn try_from_str(text: &str) -> Option<Self> {
match text {
"parent" => Some(XpathAxes::Parent),
"child" => Some(XpathAxes::Child),
4 changes: 2 additions & 2 deletions src/xpath/parse.rs
Original file line number Diff line number Diff line change
@@ -228,8 +228,8 @@ fn inner_parse(text: &str) -> Result<Vec<XpathElement>, ParseError> {
fn parse_axis_selector(elements: &mut Vec<XpathElement>) -> Result<(), ParseError> {
let last_item = elements.pop().ok_or(ParseError::MissingAxis)?;
let axis = match last_item {
XpathElement::Tag(last_tag) => XpathAxes::from_str(last_tag.as_str())
.ok_or_else(|| ParseError::UnknownAxisType(last_tag))?,
XpathElement::Tag(last_tag) => XpathAxes::try_from_str(last_tag.as_str())
.ok_or(ParseError::UnknownAxisType(last_tag))?,
_ => return Err(ParseError::MissingAxis),
};
elements.push(XpathElement::Axis(axis));