From 01423519a1bb62e35882cdc837a02b96c24a68ca Mon Sep 17 00:00:00 2001 From: Joshua Nelson Date: Fri, 30 Oct 2020 16:26:30 -0400 Subject: [PATCH] Switch from rcdom to lol_html This is both more efficient and easier to read. It does introduce a ton of dependencies, but fortunately compile times are about the same as before. --- CHANGELOG.md | 2 + Cargo.lock | 257 ++++++++++++++++++++++++++++++++++----------------- Cargo.toml | 2 +- src/parse.rs | 91 +++++++----------- 4 files changed, 206 insertions(+), 146 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a749ea9..184c144 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,9 @@ #### Changes * Switch from `reqwest` to `ureq` for HTTP-checking, cutting down the number of dependencies by almost a third. [PR#95] +* Switch from `html5ever` to `lol_html`, making the code much easier to modify. [PR#86] +[PR#86]: https://github.com/deadlinks/cargo-deadlinks/pull/86 [PR#95]: https://github.com/deadlinks/cargo-deadlinks/pull/95 diff --git a/Cargo.lock b/Cargo.lock index 7bfdcf0..27d283b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -69,6 +69,12 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + [[package]] name = "cargo-deadlinks" version = "0.4.2" @@ -77,8 +83,8 @@ dependencies = [ "cargo_metadata", "docopt", "env_logger", - "html5ever", "log", + "lol_html", "num_cpus", "predicates", "rayon", @@ -188,6 +194,38 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "cssparser" +version = "0.25.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbe18ca4efb9ba3716c6da66cc3d7e673bf59fa576353011f48c4cfddbdd740e" +dependencies = [ + "autocfg 0.1.7", + "cssparser-macros", + "dtoa-short", + "itoa", + "matches", + "phf", + "proc-macro2", + "procedural-masquerade", + "quote", + "smallvec", + "syn", +] + +[[package]] +name = "cssparser-macros" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb1c84e87c717666564ec056105052331431803d606bd45529b28547b611eef" +dependencies = [ + "phf_codegen", + "proc-macro2", + "procedural-masquerade", + "quote", + "syn", +] + [[package]] name = "difference" version = "2.0.0" @@ -212,12 +250,36 @@ dependencies = [ "strsim", ] +[[package]] +name = "dtoa" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "134951f4028bdadb9b84baf4232681efbf277da25144b9b0ad65df75946c422b" + +[[package]] +name = "dtoa-short" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2" +dependencies = [ + "dtoa", +] + [[package]] name = "either" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding_rs" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "801bbab217d7f79c0062f4f7205b5d4427c6d1a7bd7aafdd1475f7c59d62b283" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "env_logger" version = "0.8.1" @@ -247,13 +309,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" [[package]] -name = "futf" -version = "0.1.4" +name = "fxhash" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" dependencies = [ - "mac", - "new_debug_unreachable", + "byteorder", ] [[package]] @@ -265,20 +326,6 @@ dependencies = [ "libc", ] -[[package]] -name = "html5ever" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025483b0a1e4577bb28578318c886ee5f817dda6eb62473269349044406644cb" -dependencies = [ - "log", - "mac", - "markup5ever", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "humantime" version = "2.0.1" @@ -317,6 +364,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.79" @@ -333,26 +386,21 @@ dependencies = [ ] [[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - -[[package]] -name = "markup5ever" -version = "0.9.0" +name = "lol_html" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65381d9d47506b8592b97c4efd936afcf673b09b059f2bef39c7211ee78b9d03" +checksum = "169299b3b58aa5cd8ad25fd8fe984e93748046d24c80f05aaadd9022f95423ec" dependencies = [ - "log", - "phf", - "phf_codegen", - "serde", - "serde_derive", - "serde_json", - "string_cache", - "string_cache_codegen", - "tendril", + "bitflags", + "cfg-if 0.1.10", + "cssparser", + "encoding_rs", + "lazy_static", + "lazycell", + "memchr", + "safemem", + "selectors", + "thiserror", ] [[package]] @@ -361,6 +409,12 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + [[package]] name = "memchr" version = "2.3.3" @@ -377,10 +431,10 @@ dependencies = [ ] [[package]] -name = "new_debug_unreachable" -version = "1.0.4" +name = "nodrop" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" [[package]] name = "normalize-line-endings" @@ -501,6 +555,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "procedural-masquerade" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1383dff4092fe903ac180e391a8d4121cc48f08ccf850614b0290c6673b69d" + [[package]] name = "qstring" version = "0.7.2" @@ -711,6 +771,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + [[package]] name = "same-file" version = "1.0.6" @@ -736,6 +802,25 @@ dependencies = [ "untrusted", ] +[[package]] +name = "selectors" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b86b100bede4f651059740afc3b6cb83458d7401cb7c1ad96d8a11e91742c86" +dependencies = [ + "bitflags", + "cssparser", + "fxhash", + "log", + "matches", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", + "thin-slice", +] + [[package]] name = "semver" version = "0.9.0" @@ -784,50 +869,41 @@ dependencies = [ ] [[package]] -name = "siphasher" -version = "0.2.3" +name = "servo_arc" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + "stable_deref_trait", +] [[package]] -name = "spin" -version = "0.5.2" +name = "siphasher" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" [[package]] -name = "string_cache" -version = "0.7.5" +name = "smallvec" +version = "0.6.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89c058a82f9fd69b1becf8c274f412281038877c553182f1d02eb027045a2d67" +checksum = "f7b0758c52e15a8b5e3691eae6cc559f08eee9406e548a4477ba4e67770a82b6" dependencies = [ - "lazy_static", - "new_debug_unreachable", - "phf_shared", - "precomputed-hash", - "serde", - "string_cache_codegen", - "string_cache_shared", + "maybe-uninit", ] [[package]] -name = "string_cache_codegen" -version = "0.4.4" +name = "spin" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6" -dependencies = [ - "phf_generator", - "phf_shared", - "proc-macro2", - "quote", - "string_cache_shared", -] +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] -name = "string_cache_shared" -version = "0.3.0" +name = "stable_deref_trait" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "strsim" @@ -847,23 +923,38 @@ dependencies = [ ] [[package]] -name = "tendril" -version = "0.4.1" +name = "termcolor" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" +checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" dependencies = [ - "futf", - "mac", - "utf-8", + "winapi-util", ] [[package]] -name = "termcolor" -version = "1.1.0" +name = "thin-slice" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + +[[package]] +name = "thiserror" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e" dependencies = [ - "winapi-util", + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -945,12 +1036,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "utf-8" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" - [[package]] name = "wait-timeout" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 0e49992..6137418 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ license = "MIT OR Apache-2.0" cargo_metadata = "0.9" docopt = "1" env_logger = "0.8" -html5ever = "0.24" +lol_html = "0.2" log = "0.4" num_cpus = "1.8" rayon = "1.0" diff --git a/src/parse.rs b/src/parse.rs index 7dd1294..d40474d 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,50 +1,38 @@ use std::collections::HashSet; use std::path::Path; -use html5ever::parse_document; -use html5ever::rcdom::{Handle, NodeData, RcDom}; -use html5ever::tendril::TendrilSink; use log::{debug, info}; +use lol_html::{element, RewriteStrSettings}; use url::Url; /// Parse the html file at the provided path and check the availablility of all links in it. pub fn parse_html_file(root_dir: &Path, path: &Path) -> HashSet { info!("Checking doc page at {}", path.display()); - let dom = parse_document(RcDom::default(), Default::default()) - .from_utf8() - .from_file(path) - .unwrap(); + let html = std::fs::read_to_string(path) + .unwrap_or_else(|e| panic!("{} did not contain valid UTF8: {}", path.display(), e)); + // root_url is absolute *relative to* the documentation directory. For `target/dir/crate_x/y`, it's `crate_x`. let root_url = Url::from_directory_path(root_dir).unwrap(); + // base_url is the relative file path. For `target/dir/crate_x/y`, it's `crate_x/y`. let base_url = Url::from_file_path(path).unwrap(); - let mut urls = HashSet::new(); - parse_a_hrefs(&dom.document, &root_url, &base_url, &mut urls); - urls + + parse_a_hrefs(&html, root_url, base_url) } -/// Traverse the DOM of a parsed HTML element, extracting all URLs from links. -fn parse_a_hrefs(handle: &Handle, root_url: &Url, base_url: &Url, urls: &mut HashSet) { - let node = handle; - if let NodeData::Element { - ref name, - ref attrs, - .. - } = node.data - { - if &name.local == "a" { - if let Some(attr) = attrs - .borrow() - .iter() - .find(|attr| &attr.name.local == "href") - { +/// This is a pure function, unlike `parse_html_file`, allowing it to be easily tested. +fn parse_a_hrefs(html: &str, root_url: Url, base_url: Url) -> HashSet { + let mut urls = HashSet::new(); + lol_html::rewrite_str( + html, + RewriteStrSettings { + element_content_handlers: vec![element!("a[href]", |el| { + let href = el.get_attribute("href").unwrap(); // base is the file path, unless path is absolute (starts with /) - let (base, href) = if attr.value.starts_with('/') { + let (base, href) = if href.starts_with('/') { // Treat absolute paths as absolute with respect to the `root_url`, not with respect to the file system. - let mut val = attr.value.clone(); - val.pop_front_char(); // remove the leading `/` and join on `root_url` - (root_url, val) + (&root_url, &href[1..]) } else { - (base_url, attr.value.clone()) + (&base_url, href.as_str()) }; if let Ok(link) = base.join(&href) { @@ -53,35 +41,20 @@ fn parse_a_hrefs(handle: &Handle, root_url: &Url, base_url: &Url, urls: &mut Has } else { debug!("unparsable link {:?}", href); } - } - } - } + Ok(()) + })], + ..RewriteStrSettings::default() + }, + ) + .expect("html rewriting failed"); - for child in node.children.borrow().iter() { - parse_a_hrefs(&child, root_url, base_url, urls); - } + urls } #[cfg(test)] mod test { - use html5ever::parse_document; - use html5ever::{rcdom::RcDom, tendril::TendrilSink}; - use std::collections::HashSet; - use url::Url; - use super::parse_a_hrefs; - - fn gather_urls(html: &str, root: &Url, url: &Url) -> HashSet { - let dom = parse_document(RcDom::default(), Default::default()) - .from_utf8() - .read_from(&mut html.as_bytes()) - .unwrap(); - - let mut urls = HashSet::new(); - parse_a_hrefs(&dom.document, &root, &url, &mut urls); - - return urls; - } + use url::Url; #[test] fn test_parse_a_hrefs() { @@ -94,10 +67,10 @@ mod test { "#; - let urls = gather_urls( + let urls = parse_a_hrefs( html, - &Url::from_directory_path("/base").unwrap(), - &Url::from_file_path("/base/test.html").unwrap(), + Url::from_directory_path("/base").unwrap(), + Url::from_file_path("/base/test.html").unwrap(), ); assert!(urls.contains(&Url::from_file_path("/base/a.html").unwrap())); @@ -116,10 +89,10 @@ mod test { "#; - let urls = gather_urls( + let urls = parse_a_hrefs( html, - &Url::from_directory_path("/root").unwrap(), - &Url::from_file_path("/root/base/test.html").unwrap(), + Url::from_directory_path("/root").unwrap(), + Url::from_file_path("/root/base/test.html").unwrap(), ); assert!(urls.contains(&Url::from_file_path("/root/base/a.html").unwrap()));