From db7444a36b7f0b19c180658f4423588dd37a4f8f Mon Sep 17 00:00:00 2001
From: Zoran Regvart <zregvart@apache.org>
Date: Sat, 7 Nov 2020 20:55:36 +0100
Subject: [PATCH] fix: correctly treat absolute paths against `dir`

Absolute paths should be rooted from the `--dir` path, not against the
base directory of the examined file. For example, given a file at
`$DIR/sub/a.html` with a link to `/other/b.html` and deadlinks invoked
with `--dir $DIR`, that link should be evaluated as `$DIR/other/b.html`,
not as `$DIR/sub/other/b.html`.
---
 CHANGELOG.md |  6 +++++
 src/lib.rs   |  2 +-
 src/parse.rs | 70 ++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 59 insertions(+), 19 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76ff35b..a291521 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,12 @@
 
 #### Fixes
 
+* Proper root path computation for absolute paths in subdirectories [PR#93]
+
+[PR#93]: https://github.com/deadlinks/cargo-deadlinks/pull/93
+
+#### Fixes
+
 * No longer try to document examples that are dynamic libraries
 
   This was a regression introduced by [PR#68]. That looked at all targets to
diff --git a/src/lib.rs b/src/lib.rs
index 5d29d20..1690dc1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -72,7 +72,7 @@ pub fn unavailable_urls<'a>(
         .filter_map(|e| e.ok())
         .filter(|entry| entry.file_type().is_file() && is_html_file(&entry))
         .flat_map(move |entry| {
-            let urls = parse_html_file(entry.path());
+            let urls = parse_html_file(dir_path, entry.path());
             let errors = urls
                 .into_iter()
                 .filter_map(|url| match is_available(&url, &ctx) {
diff --git a/src/parse.rs b/src/parse.rs
index d7782b4..dc90bdd 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -8,21 +8,22 @@ use log::{debug, info};
 use url::Url;
 
 /// Parse the html file at the provided path and check the availablility of all links in it.
-pub fn parse_html_file(path: &Path) -> HashSet<Url> {
+pub fn parse_html_file(root_dir: &Path, path: &Path) -> HashSet<Url> {
     info!("Checking doc page at {}", path.display());
     let dom = parse_document(RcDom::default(), Default::default())
         .from_utf8()
         .from_file(path)
         .unwrap();
 
+    let root_url = Url::from_directory_path(root_dir).unwrap();
     let base_url = Url::from_file_path(path).unwrap();
     let mut urls = HashSet::new();
-    parse_a_hrefs(&dom.document, &base_url, &mut urls);
+    parse_a_hrefs(&dom.document, &root_url, &base_url, &mut urls);
     urls
 }
 
 /// Traverse the DOM of a parsed HTML element, extracting all URLs from <a href="xxx"> links.
-fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet<Url>) {
+fn parse_a_hrefs(handle: &Handle, root_url: &Url, base_url: &Url, urls: &mut HashSet<Url>) {
     let node = handle;
     if let NodeData::Element {
         ref name,
@@ -37,12 +38,15 @@ fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet<Url>) {
                 .find(|attr| &attr.name.local == "href")
             {
                 let mut val = attr.value.clone();
-                // Treat absolute paths as absolute with respect to the `base_url`, not with respect to the file system.
+                // base is the file path, unless path is absolute (starts with /)
+                let mut base = base_url;
                 if attr.value.starts_with('/') {
-                    val.pop_front_char();
+                    // Treat absolute paths as absolute with respect to the `root_url`, not with respect to the file system.
+                    val.pop_front_char(); // remove the leading `/` and join on `root_url`
+                    base = root_url;
                 }
 
-                if let Ok(link) = base_url.join(&val) {
+                if let Ok(link) = base.join(&val) {
                     debug!("link is {:?}", link);
                     urls.insert(link);
                 } else {
@@ -53,7 +57,7 @@ fn parse_a_hrefs(handle: &Handle, base_url: &Url, urls: &mut HashSet<Url>) {
     }
 
     for child in node.children.borrow().iter() {
-        parse_a_hrefs(&child, base_url, urls);
+        parse_a_hrefs(&child, root_url, base_url, urls);
     }
 }
 
@@ -66,6 +70,18 @@ mod test {
 
     use super::parse_a_hrefs;
 
+    fn gather_urls(html: &str, root: &Url, url: &Url) -> HashSet<Url> {
+        let dom = parse_document(RcDom::default(), Default::default())
+            .from_utf8()
+            .read_from(&mut html.as_bytes())
+            .unwrap();
+
+        let mut urls = HashSet::new();
+        parse_a_hrefs(&dom.document, &root, &url, &mut urls);
+
+        return urls;
+    }
+
     #[test]
     fn test_parse_a_hrefs() {
         let html = r#"
@@ -75,20 +91,38 @@ mod test {
                 <a href="a.html">a</a>
                 <a href="/b/c.html">a</a>
             </body>
-        </html> 
-        "#;
-
-        let dom = parse_document(RcDom::default(), Default::default())
-            .from_utf8()
-            .read_from(&mut html.as_bytes())
-            .unwrap();
+        </html>"#;
 
-        let base_url = Url::from_directory_path("/base").unwrap();
-
-        let mut urls = HashSet::new();
-        parse_a_hrefs(&dom.document, &base_url, &mut urls);
+        let urls = gather_urls(
+            html,
+            &Url::from_directory_path("/base").unwrap(),
+            &Url::from_file_path("/base/test.html").unwrap(),
+        );
 
         assert!(urls.contains(&Url::from_file_path("/base/a.html").unwrap()));
         assert!(urls.contains(&Url::from_file_path("/base/b/c.html").unwrap()));
     }
+
+    #[test]
+    fn test_parse_a_hrefs_in_subdirectory() {
+        let html = r#"
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <a href="a.html">a</a>
+                <a href="/b/c.html">a</a>
+                <a href="../d.html">d</a>
+            </body>
+        </html>"#;
+
+        let urls = gather_urls(
+            html,
+            &Url::from_directory_path("/root").unwrap(),
+            &Url::from_file_path("/root/base/test.html").unwrap(),
+        );
+
+        assert!(urls.contains(&Url::from_file_path("/root/base/a.html").unwrap()));
+        assert!(urls.contains(&Url::from_file_path("/root/b/c.html").unwrap()));
+        assert!(urls.contains(&Url::from_file_path("/root/d.html").unwrap()));
+    }
 }