From db7444a36b7f0b19c180658f4423588dd37a4f8f Mon Sep 17 00:00:00 2001 From: Zoran Regvart Date: Sat, 7 Nov 2020 20:55:36 +0100 Subject: [PATCH] fix: correctly treat absolute paths against `dir` Absolute paths should be rooted from the `--dir` path, not against the base directory of the examined file. For example, given a file in `$DIR/sub/a.html` with a link to `/other/b.html` and deadlinks invoked with `--dir $DIR`, that link should be evaluated as `$DIR/other/b.html`, not as `$DIR/sub/other/b.html`. --- CHANGELOG.md | 6 +++++ src/lib.rs | 2 +- src/parse.rs | 70 ++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76ff35b..a291521 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,12 @@ #### Fixes +* Proper root path computation for absolute paths in subdirectories [PR#93] + +[PR#93]: https://github.com/deadlinks/cargo-deadlinks/pull/93 + +#### Fixes + * No longer try to document examples that are dynamic libraries This was a regression introduced by [PR#68]. That looked at all targets to diff --git a/src/lib.rs b/src/lib.rs index 5d29d20..1690dc1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -72,7 +72,7 @@ pub fn unavailable_urls<'a>( .filter_map(|e| e.ok()) .filter(|entry| entry.file_type().is_file() && is_html_file(&entry)) .flat_map(move |entry| { - let urls = parse_html_file(entry.path()); + let urls = parse_html_file(dir_path, entry.path()); let errors = urls .into_iter() .filter_map(|url| match is_available(&url, &ctx) { diff --git a/src/parse.rs b/src/parse.rs index d7782b4..dc90bdd 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -8,21 +8,22 @@ use log::{debug, info}; use url::Url; /// Parse the html file at the provided path and check the availablility of all links in it. 
-pub fn parse_html_file(path: &Path) -> HashSet { +pub fn parse_html_file(root_dir: &Path, path: &Path) -> HashSet { info!("Checking doc page at {}", path.display()); let dom = parse_document(RcDom::default(), Default::default()) .from_utf8() .from_file(path) .unwrap(); + let root_url = Url::from_directory_path(root_dir).unwrap(); let base_url = Url::from_file_path(path).unwrap(); let mut urls = HashSet::new(); - parse_a_hrefs(&dom.document, &base_url, &mut urls); + parse_a_hrefs(&dom.document, &root_url, &base_url, &mut urls); urls } /// Traverse the DOM of a parsed HTML element, extracting all URLs from links. -fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet) { +fn parse_a_hrefs(handle: &Handle, root_url: &Url, base_url: &Url, urls: &mut HashSet) { let node = handle; if let NodeData::Element { ref name, @@ -37,12 +38,15 @@ fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet) { .find(|attr| &attr.name.local == "href") { let mut val = attr.value.clone(); - // Treat absolute paths as absolute with respect to the `base_url`, not with respect to the file system. + // base is the file path, unless path is absolute (starts with /) + let mut base = base_url; if attr.value.starts_with('/') { - val.pop_front_char(); + // Treat absolute paths as absolute with respect to the `root_url`, not with respect to the file system. 
+ val.pop_front_char(); // remove the leading `/` and join on `root_url` + base = root_url; } - if let Ok(link) = base_url.join(&val) { + if let Ok(link) = base.join(&val) { debug!("link is {:?}", link); urls.insert(link); } else { @@ -53,7 +57,7 @@ fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet) { } for child in node.children.borrow().iter() { - parse_a_hrefs(&child, base_url, urls); + parse_a_hrefs(&child, root_url, base_url, urls); } } @@ -66,6 +70,18 @@ mod test { use super::parse_a_hrefs; + fn gather_urls(html: &str, root: &Url, url: &Url) -> HashSet { + let dom = parse_document(RcDom::default(), Default::default()) + .from_utf8() + .read_from(&mut html.as_bytes()) + .unwrap(); + + let mut urls = HashSet::new(); + parse_a_hrefs(&dom.document, &root, &url, &mut urls); + + return urls; + } + #[test] fn test_parse_a_hrefs() { let html = r#" @@ -75,20 +91,38 @@ mod test { a a - - "#; - - let dom = parse_document(RcDom::default(), Default::default()) - .from_utf8() - .read_from(&mut html.as_bytes()) - .unwrap(); + "#; - let base_url = Url::from_directory_path("/base").unwrap(); - - let mut urls = HashSet::new(); - parse_a_hrefs(&dom.document, &base_url, &mut urls); + let urls = gather_urls( + html, + &Url::from_directory_path("/base").unwrap(), + &Url::from_file_path("/base/test.html").unwrap(), + ); assert!(urls.contains(&Url::from_file_path("/base/a.html").unwrap())); assert!(urls.contains(&Url::from_file_path("/base/b/c.html").unwrap())); } + + #[test] + fn test_parse_a_hrefs_in_subdirectory() { + let html = r#" + + + + a + a + d + + "#; + + let urls = gather_urls( + html, + &Url::from_directory_path("/root").unwrap(), + &Url::from_file_path("/root/base/test.html").unwrap(), + ); + + assert!(urls.contains(&Url::from_file_path("/root/base/a.html").unwrap())); + assert!(urls.contains(&Url::from_file_path("/root/b/c.html").unwrap())); + assert!(urls.contains(&Url::from_file_path("/root/d.html").unwrap())); + } }