From bd6a11609210906fca19f542cef064fe6d747f85 Mon Sep 17 00:00:00 2001 From: James La Novara-Gsell Date: Sat, 15 Jun 2024 01:43:07 +0000 Subject: [PATCH] fix(html): Allow verbose doctype declaration Older documents use a more verbose doctype declaration that must also be handled. e.g. `<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">` --- Cargo.toml | 2 +- src/html/parse/mod.rs | 10 +++--- .../expressions/comparison_expressions.rs | 5 ++- tests/html_tests.rs | 32 +++++++++++++++++++ 4 files changed, 40 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f28e2e6..ded80f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "skyscraper" -version = "0.6.3" +version = "0.6.4" authors = ["James La Novara-Gsell "] edition = "2021" description = "XPath for HTML web scraping" diff --git a/src/html/parse/mod.rs b/src/html/parse/mod.rs index fb985b5..eb2da3b 100644 --- a/src/html/parse/mod.rs +++ b/src/html/parse/mod.rs @@ -380,16 +380,16 @@ fn is_doctype( if iden != "html" { return Err(ParseError::MissingHtmlAfterDoctype); } - let token = tokens.next().ok_or(ParseError::UnexpectedEndOfTokens)?; - if !matches!(token, Token::TagClose) { - return Err(ParseError::MissingTagCloseAfterDoctype); + for token in tokens { + if let Token::TagClose = token { + return Ok(true); + } } + return Err(ParseError::MissingTagCloseAfterDoctype); } else { return Err(ParseError::MissingHtmlAfterDoctype); } - - return Ok(true); } Ok(false) diff --git a/src/xpath/grammar/expressions/comparison_expressions.rs b/src/xpath/grammar/expressions/comparison_expressions.rs index 60601ee..96f67bf 100644 --- a/src/xpath/grammar/expressions/comparison_expressions.rs +++ b/src/xpath/grammar/expressions/comparison_expressions.rs @@ -10,14 +10,13 @@ use nom::{ use crate::{ xpath::{ grammar::{ - data_model::{AnyAtomicType, Node, XpathItem}, + data_model::{AnyAtomicType, XpathItem}, expressions::string_concat_expressions::string_concat_expr, recipes::Res, terminal_symbols::symbol_separator, - NonTreeXpathNode, 
XpathItemTreeNodeData, }, xpath_item_set::XpathItemSet, - ExpressionApplyError, XpathExpressionContext, XpathItemTree, + ExpressionApplyError, XpathExpressionContext, }, xpath_item_set, }; diff --git a/tests/html_tests.rs b/tests/html_tests.rs index f34a465..243fe4b 100644 --- a/tests/html_tests.rs +++ b/tests/html_tests.rs @@ -49,3 +49,35 @@ fn text_should_unescape_characters() { let html_text = document.get_html_node(&child).unwrap().extract_as_text(); assert_eq!(html_text.value, r##"&"'<>"##); } + +#[test] +fn doctype_should_skip_regular_doctype() { + // arrange + let text = r##" + <!DOCTYPE html> +
<div>hi</div>
"##; + + // act + let document = html::parse(text).unwrap(); + + // assert + let root_node = document.root_node; + let html_tag = document.get_html_node(&root_node).unwrap().extract_as_tag(); + assert_eq!(html_tag.name, "div"); +} + +#[test] +fn doctype_should_skip_verbose_doctype() { + // arrange + let text = r##" + <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +
<div>hi</div>
"##; + + // act + let document = html::parse(text).unwrap(); + + // assert + let root_node = document.root_node; + let html_tag = document.get_html_node(&root_node).unwrap().extract_as_tag(); + assert_eq!(html_tag.name, "div"); +}