Skip to content

Commit

Permalink
fix(walk): ignore empty and non-js script tags (#67)
Browse files Browse the repository at this point in the history
  • Loading branch information
DonIsaac authored Aug 27, 2024
1 parent e86d2a5 commit e7bd96e
Showing 1 changed file with 72 additions and 5 deletions.
77 changes: 72 additions & 5 deletions src/walk/website/url_extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ impl<'html> UrlExtractor<'html> {
}

/// Appends an inline script body to `self.scripts` as a `Script::Embedded`,
/// paired with a clone of `self.page_url`.
///
/// An empty `script` is ignored and nothing is pushed.
fn record_embedded_script(&mut self, script: &str) {
    if !script.is_empty() {
        let embedded = Script::Embedded(script.to_owned(), self.page_url.clone());
        self.scripts.push(embedded);
    }
}
Expand Down Expand Up @@ -108,12 +111,19 @@ impl<'html> UrlExtractor<'html> {
impl<'dom> DomVisitor<'dom> for UrlExtractor<'dom> {
fn visit_element(&mut self, node: dom_walker::ElementRef<'dom>) {
match node.name() {
"script" => match node.attr("src") {
Some(script_url) => self.record_remote_script(script_url),
None => {
self.record_embedded_script(node.text().collect::<String>().trim());
"script" => {
let r#type = node.attr("type");
if r#type.is_some_and(|t| !t.contains("javascript")) {
return;
}
},

match node.attr("src") {
Some(script_url) => self.record_remote_script(script_url),
None => {
self.record_embedded_script(node.text().collect::<String>().trim());
}
}
}
"a" => {
let Some(page_url) = node.attr("href") else {
return;
Expand Down Expand Up @@ -225,4 +235,61 @@ mod test {
]
);
}
/// `<script>` tags whose body is empty or whitespace-only must yield no
/// recorded scripts (and no pages).
#[test]
fn test_embedded_script_empty() {
    let page_url = Url::parse("https://example.com").unwrap();
    // Three variants: truly empty, a single space, and newline/tab only.
    let markup = "
<html>
<head>
<script></script>
<script> </script>
<script>
\t
</script>
</head>
<body></body>
</html>
";

    let mut visitor = UrlExtractor::new(&page_url, &page_url);
    let walker = DomWalker::new(markup).unwrap();
    walker.walk(&mut visitor);

    let (pages, found_scripts) = visitor.into_inner();
    assert!(pages.is_empty(), "found pages: {pages:#?}");
    assert!(found_scripts.is_empty());
}

/// A `<script>` whose `type` is not JavaScript (here `application/json`) is
/// skipped, while a `text/javascript` script in the same document is still
/// collected with its trimmed body.
#[test]
fn test_non_js_embedded_script() {
    let page_url = Url::parse("https://example.com").unwrap();
    let markup = r#"
<html>
<head>
<script type="application/json">
{ "foo": "bar" }
</script>
<script type="text/javascript">
console.log("hello, world");
</script>
</head>
<body></body>
</html>
"#;

    let mut visitor = UrlExtractor::new(&page_url, &page_url);
    let walker = DomWalker::new(markup).unwrap();
    walker.walk(&mut visitor);
    let (pages, found_scripts) = visitor.into_inner();

    assert!(pages.is_empty(), "found pages: {pages:#?}");

    // Only the JavaScript block should survive, attributed to the page URL.
    let expected = vec![Script::Embedded(
        String::from("console.log(\"hello, world\");"),
        page_url.clone(),
    )];
    assert_eq!(found_scripts, expected);
}
}

0 comments on commit e7bd96e

Please sign in to comment.