Skip to content

Commit

Permalink
fix: correctly treat absolute paths against dir
Browse files Browse the repository at this point in the history
Absolute paths should be resolved against the `--dir` path, not against the
base directory of the examined file. For example, given a file at
`$DIR/sub/a.html` with a link to `/other/b.html`, and deadlinks invoked
with `--dir $DIR`, that link should be evaluated as `$DIR/other/b.html`,
not as `$DIR/sub/other/b.html`.
  • Loading branch information
zregvart committed Nov 8, 2020
1 parent 8c8da9f commit 622a1d5
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 23 deletions.
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ pub fn unavailable_urls<'a>(
.filter_map(|e| e.ok())
.filter(|entry| entry.file_type().is_file() && is_html_file(&entry))
.flat_map(move |entry| {
let urls = parse_html_file(entry.path());
let urls = parse_html_file(dir_path, entry.path());
let errors = urls
.into_iter()
.filter_map(|url| match is_available(&url, &ctx) {
Expand Down
79 changes: 57 additions & 22 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,22 @@ use log::{debug, info};
use url::Url;

/// Parse the HTML file at the provided path and collect the URLs of all links found in it.
pub fn parse_html_file(path: &Path) -> HashSet<Url> {
pub fn parse_html_file(root_dir: &Path, path: &Path) -> HashSet<Url> {
info!("Checking doc page at {}", path.display());
let dom = parse_document(RcDom::default(), Default::default())
.from_utf8()
.from_file(path)
.unwrap();

let root_url = Url::from_directory_path(root_dir).unwrap();
let base_url = Url::from_file_path(path).unwrap();
let mut urls = HashSet::new();
parse_a_hrefs(&dom.document, &base_url, &mut urls);
parse_a_hrefs(&dom.document, &root_url, &base_url, &mut urls);
urls
}

/// Traverse the DOM of a parsed HTML element, extracting all URLs from <a href="xxx"> links.
fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet<Url>) {
fn parse_a_hrefs(handle: &Handle, root_url: &Url, base_url: &Url, urls: &mut HashSet<Url>) {
let node = handle;
if let NodeData::Element {
ref name,
Expand All @@ -36,24 +37,28 @@ fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet<Url>) {
.iter()
.find(|attr| &attr.name.local == "href")
{
let mut val = attr.value.clone();
// Treat absolute paths as absolute with respect to the `base_url`, not with respect to the file system.
if attr.value.starts_with('/') {
val.pop_front_char();
}
// base is the file path, unless path is absolute (starts with /)
let (base, href) = if attr.value.starts_with('/') {
// Treat absolute paths as absolute with respect to the `root_url`, not with respect to the file system.
let mut val = attr.value.clone();
val.pop_front_char(); // remove the leading `/` and join on `root_url`
(root_url, val)
} else {
(base_url, attr.value.clone())
};

if let Ok(link) = base_url.join(&val) {
if let Ok(link) = base.join(&href) {
debug!("link is {:?}", link);
urls.insert(link);
} else {
debug!("unparsable link {:?}", val);
debug!("unparsable link {:?}", href);
}
}
}
}

for child in node.children.borrow().iter() {
parse_a_hrefs(&child, base_url, urls);
parse_a_hrefs(&child, root_url, base_url, urls);
}
}

Expand All @@ -66,6 +71,18 @@ mod test {

use super::parse_a_hrefs;

/// Parses `html` in memory and returns every URL that `parse_a_hrefs` extracts from it.
///
/// `root` is forwarded as the root URL (used for links starting with `/`) and
/// `url` as the base URL of the document (used for all other links).
///
/// Panics if reading the in-memory HTML fails.
fn gather_urls(html: &str, root: &Url, url: &Url) -> HashSet<Url> {
    let dom = parse_document(RcDom::default(), Default::default())
        .from_utf8()
        .read_from(&mut html.as_bytes())
        .unwrap();

    let mut urls = HashSet::new();
    // `root` and `url` are already references, so pass them through as-is.
    parse_a_hrefs(&dom.document, root, url, &mut urls);

    urls
}

#[test]
fn test_parse_a_hrefs() {
let html = r#"
Expand All @@ -75,20 +92,38 @@ mod test {
<a href="a.html">a</a>
<a href="/b/c.html">a</a>
</body>
</html>
"#;

let dom = parse_document(RcDom::default(), Default::default())
.from_utf8()
.read_from(&mut html.as_bytes())
.unwrap();

let base_url = Url::from_directory_path("/base").unwrap();
</html>"#;

let mut urls = HashSet::new();
parse_a_hrefs(&dom.document, &base_url, &mut urls);
let urls = gather_urls(
html,
&Url::from_directory_path("/base").unwrap(),
&Url::from_file_path("/base/test.html").unwrap(),
);

assert!(urls.contains(&Url::from_file_path("/base/a.html").unwrap()));
assert!(urls.contains(&Url::from_file_path("/base/b/c.html").unwrap()));
}

/// Links in a page located in a subdirectory of the root: relative links
/// resolve against the page, links starting with `/` resolve against the root.
#[test]
fn test_parse_a_hrefs_in_subdirectory() {
    let html = r#"
<!DOCTYPE html>
<html>
<body>
<a href="a.html">a</a>
<a href="/b/c.html">a</a>
<a href="../d.html">d</a>
</body>
</html>"#;

    // The page under test sits one level below the root directory.
    let root = Url::from_directory_path("/root").unwrap();
    let page = Url::from_file_path("/root/base/test.html").unwrap();
    let found = gather_urls(html, &root, &page);

    let file_url = |path: &str| Url::from_file_path(path).unwrap();

    // `a.html`: relative, resolved next to the page.
    assert!(found.contains(&file_url("/root/base/a.html")));
    // `/b/c.html`: leading slash, resolved from the root directory.
    assert!(found.contains(&file_url("/root/b/c.html")));
    // `../d.html`: relative, climbs out of the page's directory.
    assert!(found.contains(&file_url("/root/d.html")));
}
}

0 comments on commit 622a1d5

Please sign in to comment.