Skip to content

Commit

Permalink
Merge pull request #33 from James-LG/james/doctype
Browse files Browse the repository at this point in the history
fix(html): Allow verbose doctype declaration
  • Loading branch information
James-LG authored Jun 15, 2024
2 parents 3a5a3be + bd6a116 commit 1c9bbc8
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "skyscraper"
version = "0.6.3"
version = "0.6.4"
authors = ["James La Novara-Gsell <[email protected]>"]
edition = "2021"
description = "XPath for HTML web scraping"
Expand Down
10 changes: 5 additions & 5 deletions src/html/parse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,16 +380,16 @@ fn is_doctype(
if iden != "html" {
return Err(ParseError::MissingHtmlAfterDoctype);
}
let token = tokens.next().ok_or(ParseError::UnexpectedEndOfTokens)?;

if !matches!(token, Token::TagClose) {
return Err(ParseError::MissingTagCloseAfterDoctype);
for token in tokens {
if let Token::TagClose = token {
return Ok(true);
}
}
return Err(ParseError::MissingTagCloseAfterDoctype);
} else {
return Err(ParseError::MissingHtmlAfterDoctype);
}

return Ok(true);
}

Ok(false)
Expand Down
5 changes: 2 additions & 3 deletions src/xpath/grammar/expressions/comparison_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@ use nom::{
use crate::{
xpath::{
grammar::{
data_model::{AnyAtomicType, Node, XpathItem},
data_model::{AnyAtomicType, XpathItem},
expressions::string_concat_expressions::string_concat_expr,
recipes::Res,
terminal_symbols::symbol_separator,
NonTreeXpathNode, XpathItemTreeNodeData,
},
xpath_item_set::XpathItemSet,
ExpressionApplyError, XpathExpressionContext, XpathItemTree,
ExpressionApplyError, XpathExpressionContext,
},
xpath_item_set,
};
Expand Down
32 changes: 32 additions & 0 deletions tests/html_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,35 @@ fn text_should_unescape_characters() {
let html_text = document.get_html_node(&child).unwrap().extract_as_text();
assert_eq!(html_text.value, r##"&"'<>"##);
}

#[test]
fn doctype_should_skip_regular_doctype() {
    // Input: a short-form `<!DOCTYPE html>` declaration followed by a single `div`.
    // The raw string lines are kept flush-left so the parsed text is unchanged.
    let source = r##"
<!DOCTYPE html>
<div>hi</div>"##;

    // Parsing must succeed; a doctype-related parse error would panic here.
    let parsed = html::parse(source).unwrap();

    // The root of the parsed document is expected to be the `div` tag,
    // i.e. the doctype declaration did not become the root node.
    let root_tag = parsed
        .get_html_node(&parsed.root_node)
        .unwrap()
        .extract_as_tag();
    assert_eq!(root_tag.name, "div");
}

#[test]
fn doctype_should_skip_verbose_doctype() {
    // Input: a long-form doctype carrying a PUBLIC identifier and a DTD URL,
    // followed by a single `div`. The raw string lines are kept flush-left so
    // the parsed text is unchanged.
    let source = r##"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<div>hi</div>"##;

    // Parsing must succeed even though extra tokens sit between `html` and `>`.
    let parsed = html::parse(source).unwrap();

    // The root of the parsed document is expected to be the `div` tag,
    // i.e. the verbose doctype declaration did not become the root node.
    let root_tag = parsed
        .get_html_node(&parsed.root_node)
        .unwrap()
        .extract_as_tag();
    assert_eq!(root_tag.name, "div");
}

0 comments on commit 1c9bbc8

Please sign in to comment.