Skip to content

Commit

Permalink
perf(walk): use Arc to reduce URL clones
Browse files Browse the repository at this point in the history
  • Loading branch information
DonIsaac committed Aug 27, 2024
1 parent a470205 commit 5b994b8
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 95 deletions.
2 changes: 1 addition & 1 deletion src/extract/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ impl ApiKeyCollector {
Ok(js)
}

fn parse_and_send(&self, url: Url, script: &str) {
fn parse_and_send(&self, url: Arc<Url>, script: &str) {
trace!("({url}) Parsing script");
let alloc = Allocator::default();
let extract_result = self
Expand Down
35 changes: 20 additions & 15 deletions src/walk/website/url_extractor.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use core::fmt;
use std::sync::Arc;

use url::{ParseError, ParseOptions, Url};

Expand Down Expand Up @@ -28,7 +29,7 @@ const BANNED_EXTENSIONS: [&str; 3] = [".pdf", ".png", ".jpg"];
/// Extracts URLs to webpages and scripts from HTML.
pub(crate) struct UrlExtractor<'html> {
/// URL of the page being parsed.
page_url: &'html Url,
page_url: Arc<Url>,
opts: ParseOptions<'html>,
pages: Vec<Url>,
scripts: Vec<Script>,
Expand All @@ -44,7 +45,7 @@ impl fmt::Debug for UrlExtractor<'_> {
}

impl<'html> UrlExtractor<'html> {
pub fn new(base_url: &'html Url, page_url: &'html Url) -> Self {
pub fn new(base_url: &'html Url, page_url: Arc<Url>) -> Self {
const CAP: usize = 10;
debug_assert!(!base_url.cannot_be_a_base());

Expand All @@ -70,7 +71,7 @@ impl<'html> UrlExtractor<'html> {
let Ok(script_url) = self.resolve(script_url) else {
return;
};
self.scripts.push(Script::Url(script_url));
self.scripts.push(Script::from(script_url));
}

fn record_embedded_script(&mut self, script: &str) {
Expand Down Expand Up @@ -138,13 +139,17 @@ impl<'dom> DomVisitor<'dom> for UrlExtractor<'dom> {
#[cfg(test)]
mod test {
use crate::walk::website::dom_walker::DomWalker;
use std::sync::Arc;

/// Shared base URL used by every test case in this module.
fn example() -> Arc<Url> {
    let url = Url::parse("https://example.com").expect("static test URL is valid");
    Arc::new(url)
}

use super::*;
use url::Url;
#[test]

fn test_basic() {
let url = Url::parse("https://example.com").unwrap();
let url = example();
let html = r#"
<html>
<head>
Expand All @@ -158,14 +163,14 @@ mod test {
</html>
"#;

let mut extractor = UrlExtractor::new(&url, &url);
let mut extractor = UrlExtractor::new(&url, url.clone());
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();

assert_eq!(
scripts,
vec![Script::Url(
vec![Script::from(
Url::parse("https://example.com/main.js").unwrap()
)]
);
Expand All @@ -183,7 +188,7 @@ mod test {

#[test]
fn test_ignored() {
let url = Url::parse("https://example.com").unwrap();
let url = example();
let html = r"
<html>
<body>
Expand All @@ -195,7 +200,7 @@ mod test {
</html>
";

let mut extractor = UrlExtractor::new(&url, &url);
let mut extractor = UrlExtractor::new(&url, url.clone());
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
Expand All @@ -206,7 +211,7 @@ mod test {

#[test]
fn test_embedded_script() {
let url = Url::parse("https://example.com").unwrap();
let url = example();
let html = r#"
<html>
<head>
Expand All @@ -221,7 +226,7 @@ mod test {
</html>
"#;

let mut extractor = UrlExtractor::new(&url, &url);
let mut extractor = UrlExtractor::new(&url, url.clone());
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
Expand All @@ -237,7 +242,7 @@ mod test {
}
#[test]
fn test_embedded_script_empty() {
let url = Url::parse("https://example.com").unwrap();
let url = example();
let html = "
<html>
<head>
Expand All @@ -253,7 +258,7 @@ mod test {
<body></body>
</html>
";
let mut extractor = UrlExtractor::new(&url, &url);
let mut extractor = UrlExtractor::new(&url, url.clone());
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
Expand All @@ -263,7 +268,7 @@ mod test {

#[test]
fn test_non_js_embedded_script() {
let url = Url::parse("https://example.com").unwrap();
let url = example();
let html = r#"
<html>
<head>
Expand All @@ -278,7 +283,7 @@ mod test {
</html>
"#;

let mut extractor = UrlExtractor::new(&url, &url);
let mut extractor = UrlExtractor::new(&url, url.clone());
let dom = DomWalker::new(html).unwrap();
dom.walk(&mut extractor);
let (pages, scripts) = extractor.into_inner();
Expand Down
77 changes: 17 additions & 60 deletions src/walk/website/walk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,10 @@ use log::{debug, trace, warn};
use miette::{Context as _, Error, IntoDiagnostic as _, Result};

use std::{
borrow::{Borrow, Cow},
num::NonZeroUsize,
sync::{
atomic::{AtomicBool, AtomicUsize, Ordering},
mpsc, Mutex, OnceLock,
mpsc, Arc, Mutex, OnceLock,
},
};

Expand All @@ -46,9 +45,14 @@ pub type ScriptReceiver = mpsc::Receiver<ScriptMessage>;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Script {
/// A JS script fetchable at some URL
Url(Url),
Url(Arc<Url>),
/// JS embedded in a `<script>` tag within HTML
Embedded(/* source code */ String, /* page url */ Url),
Embedded(/* source code */ String, /* page url */ Arc<Url>),
}
impl From<Url> for Script {
fn from(url: Url) -> Self {
Script::Url(Arc::new(url))
}
}

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -155,7 +159,7 @@ impl WebsiteWalker {
self.visit_many(vec![parsed])
}

fn visit_many(&mut self, urls: Vec<Url>) -> Result<(), Error> {
fn visit_many(&mut self, mut urls: Vec<Url>) -> Result<(), Error> {
let pages_to_visit = self.reserve_walk_count(urls.len());
if pages_to_visit == 0 && *self.in_progress.get_mut().unwrap() == 0 {
debug!("Finishing walk, No more pages to visit");
Expand All @@ -166,19 +170,21 @@ impl WebsiteWalker {
match urls.len() {
0 => Ok(()),
1 => {
let url = &urls[0];
if self.has_visited_url(url) {
// let url = &urls[0];
let url = Arc::new(urls.pop().unwrap());
if self.cache.has_seen_url(&url) {
return Ok(());
}
let webpage = self.get_webpage(url.as_str())?;
let result = self.walk_rec(url, &webpage);
let result = self.walk_rec(&url, &webpage);
self.on_visit_end(1);
result
}
num_urls => {
let urls_and_webpages = urls
.into_iter()
.filter(|url| self.is_whitelisted_link(url) && !self.has_visited_url(url))
.map(Arc::new)
.filter(|url| self.is_whitelisted_link(url) && !self.cache.has_seen_url(url))
.take(pages_to_visit)
.par_bridge()
.map(|url| {
Expand Down Expand Up @@ -251,12 +257,12 @@ impl WebsiteWalker {
}
}

fn walk_rec(&mut self, url: &Url, webpage: &str) -> Result<(), Error> {
fn walk_rec(&mut self, url: &Arc<Url>, webpage: &str) -> Result<(), Error> {
trace!("Building DOM walker for '{url}'");
let dom_walker = DomWalker::new(webpage).context("Failed to parse HTML")?;

trace!("Extracting links and scripts for '{url}'");
let mut url_visitor = UrlExtractor::new(self.base_url.get().unwrap(), url);
let mut url_visitor = UrlExtractor::new(self.base_url.get().unwrap(), Arc::clone(url));
dom_walker.walk(&mut url_visitor);
let (pages, scripts) = url_visitor.into_inner();

Expand Down Expand Up @@ -342,55 +348,6 @@ impl WebsiteWalker {
self.domain_whitelist.iter().any(|d| d.as_str() == domain)
}

/// Reports whether `url` has already been visited, recording it as seen when
/// it has not.
///
/// The `#fragment` and most query parameters are stripped before consulting
/// the cache, since they normally do not change which page the URL points to.
/// A small whitelist of query keys that commonly *do* select a page (`tab`,
/// `id`, `page`, …) is preserved in the normalized cache key.
fn has_visited_url(&self, url: &Url) -> bool {
    debug_assert!(
        !url.cannot_be_a_base(),
        "skip_if_visited got a relative url"
    ); // should be absolute

    // Fast path: nothing to normalize away.
    if url.query().is_none() && url.fragment().is_none() {
        return self.has_visited_url_clean(url);
    }

    // remove #section hash and (most) query parameters from URL since they
    // don't affect what page the URL points to. Note that some applications
    // use query parameters to identify what page to go to, thus the below
    // query_pairs() check. We may need to update this list as new cases are
    // brought to light.
    let mut without_query_params = url.clone();
    without_query_params.set_query(None);
    without_query_params.set_fragment(None);
    let mut new_params: Vec<(Cow<'_, str>, Cow<'_, str>)> = vec![];
    for (key, value) in url.query_pairs() {
        // TODO: use phf?
        if matches!(
            key.borrow(),
            "tab" | "tabid" | "tab_id" | "tab-id" | "id" | "page" | "page_id" | "page-id"
        ) {
            new_params.push((key, value))
        }
    }

    if new_params.is_empty() {
        self.has_visited_url_clean(&without_query_params)
    } else {
        // Re-encode the retained pairs, joined with '&'. The previous fold
        // concatenated pairs with no separator, producing an invalid query
        // string (e.g. "id=1page=2") whenever more than one whitelisted
        // parameter survived the filter.
        let query = new_params
            .into_iter()
            .map(|(key, value)| format!("{key}={value}"))
            .collect::<Vec<_>>()
            .join("&");
        without_query_params.set_query(Some(query.as_str()));
        self.has_visited_url_clean(&without_query_params)
    }
}
/// Check-then-record against the URL cache: returns `true` when `url` was
/// already seen; otherwise marks it as seen and returns `false`.
fn has_visited_url_clean(&self, url: &Url) -> bool {
    if self.cache.has_seen_url(url) {
        return true;
    }
    // First sighting: remember it so subsequent lookups report "visited".
    self.cache.see_url(url.clone());
    false
}
fn finish(&self) {
debug!("({}) finishing walk", self.base_url.get().unwrap());

Expand Down
Loading

0 comments on commit 5b994b8

Please sign in to comment.