From 488235f74989b72bb000186b55bcfa51e2d692a4 Mon Sep 17 00:00:00 2001 From: Kevin Kuehler Date: Thu, 23 Sep 2021 15:47:44 -0700 Subject: [PATCH] speedreader: Throw out srcset if they don't look like images Resolves https://github.com/brave/brave-browser/issues/18280 --- .../lib/src/readability/examples/slideshow.rs | 1 + .../rust/lib/src/readability/src/scorer.rs | 143 +++++++++++++++--- 2 files changed, 119 insertions(+), 25 deletions(-) diff --git a/components/speedreader/rust/lib/src/readability/examples/slideshow.rs b/components/speedreader/rust/lib/src/readability/examples/slideshow.rs index 72e06fb19e57..20d5d3470047 100644 --- a/components/speedreader/rust/lib/src/readability/examples/slideshow.rs +++ b/components/speedreader/rust/lib/src/readability/examples/slideshow.rs @@ -133,6 +133,7 @@ fn main() { extract_wrap("dutchnews_1", &mut files, &css); extract_wrap("nytimes_1", &mut files, &css); + extract_wrap("cbs_1", &mut files, &css); extract_wrap("cnet_2", &mut files, &css); extract_wrap("medium_2", &mut files, &css); extract_wrap("vanityfair_1", &mut files, &css); diff --git a/components/speedreader/rust/lib/src/readability/src/scorer.rs b/components/speedreader/rust/lib/src/readability/src/scorer.rs index be63229fea75..84fd51381a1e 100644 --- a/components/speedreader/rust/lib/src/readability/src/scorer.rs +++ b/components/speedreader/rust/lib/src/readability/src/scorer.rs @@ -48,7 +48,6 @@ static ALTER_TO_DIV_EXCEPTIONS: [&LocalName; 3] = [ &local_name!("section"), &local_name!("p"), ]; -static LOADABLE_IMG_SUFFIX: [&str; 4] = [".jpg", ".jpeg", ".png", ".webp"]; static PRESENTATIONAL_ATTRIBUTES: [&LocalName; 12] = [ &local_name!("align"), &local_name!("background"), @@ -142,6 +141,32 @@ bitflags::bitflags! { const NONE = 0; } } + +/// Checks if a url looks like an image. Ignores url params. +fn url_suffix_is_img(src: &str) -> bool { + static LOADABLE_IMG_SUFFIX: [&str; 4] = [".jpg", ".jpeg", ".png", ".webp"]; + + if LOADABLE_IMG_SUFFIX + .iter() + .any(|suffix| src.ends_with(suffix)) + { + return true; + } + + // Try again with URL params stripped + let is_loadable_path = |src: &str| match Url::parse(src) { + Ok(url) => LOADABLE_IMG_SUFFIX + .iter() + .any(|suffix| url.path().ends_with(suffix)), + _ => false, + }; + if src.starts_with('/') { + is_loadable_path(format!("data:{}", src).as_str()) + } else { + is_loadable_path(src) + } +} + /// Returns true if the image src loads without JavaScript logic. Some sites like Kotaku and The /// Atlantic try and get fancy with this. Our simple heuristic is to check if src is set something /// reasonable or srcset is set at all. Some things we've seen in the wild are srcset being left @@ -150,25 +175,8 @@ bitflags::bitflags! { fn img_loaded_mask(data: &ElementData) -> ImageLoadedMask { let mut mask: ImageLoadedMask = ImageLoadedMask::NONE; if let Some(src) = data.attributes.borrow().get(local_name!("src")) { - for suffix in LOADABLE_IMG_SUFFIX.iter() { - if src.ends_with(suffix) { - mask |= ImageLoadedMask::SRC; - break; - } - } - if !mask.contains(ImageLoadedMask::SRC) { - // Try parsing the URL to ignore url params - match Url::parse(src) { - Ok(url) => { - for suffix in LOADABLE_IMG_SUFFIX.iter() { - if url.path().ends_with(suffix) { - mask |= ImageLoadedMask::SRC; - break; - } - } - } - _ => (), - } + if url_suffix_is_img(&src) { + mask |= ImageLoadedMask::SRC; } } @@ -176,13 +184,72 @@ fn img_loaded_mask(data: &ElementData) -> ImageLoadedMask { .attributes .borrow() .get(local_name!("srcset")) - .is_some() + .map(|srcset| is_valid_srcset(&srcset)) + .unwrap_or(false) { mask |= ImageLoadedMask::SRCSET; } mask } +#[derive(Debug, PartialEq)] +pub enum ImageCandidateError { + InvalidURL, + URLFailedImageHeuristic, + InvalidPixelDensityDescriptor, + InvalidWidthDescriptor, +} + +impl From for ImageCandidateError { + fn from(_err: url::ParseError) -> Self { + ImageCandidateError::InvalidURL + } +} + +impl From for ImageCandidateError { + fn from(_err: std::num::ParseFloatError) -> Self { + ImageCandidateError::InvalidPixelDensityDescriptor + } +} + +impl From for ImageCandidateError { + fn from(_err: std::num::ParseIntError) -> Self { + ImageCandidateError::InvalidWidthDescriptor + } +} + +#[inline] +fn is_valid_srcset(srcset: &str) -> bool { + srcset + .split(',') + .all(|candidate| validate_image_candidate_string(candidate).is_ok()) +} + +/// We roughly follow the guidelines at https://html.spec.whatwg.org/multipage/urls-and-fetching.html#valid-non-empty-url +/// while also denying urls that don't look like images. +fn validate_image_candidate_string(candidate: &str) -> Result<(), ImageCandidateError> { + let c = candidate.trim(); + // Extract url and descriptor separately + let (url, descriptor) = if let Some(offset) = c.find(|c: char| c.is_ascii_whitespace()) { + (&c[..offset], Some(c[offset..].trim_start())) + } else { + (c, None) + }; + + if !url_suffix_is_img(&url) { + return Err(ImageCandidateError::URLFailedImageHeuristic); + } + + if let Some(descriptor) = descriptor { + if descriptor.ends_with('x') { + descriptor[..descriptor.len() - 1].parse::()?; + } else if descriptor.ends_with('w') { + descriptor[..descriptor.len() - 1].parse::()?; + } + } + Ok(()) +} + /// Contains metadata about how to load a lazy loaded image. struct LazyImage { local: LocalName, @@ -211,10 +278,7 @@ fn try_lazy_img( } } if !mask.contains(ImageLoadedMask::SRC) { - if LOADABLE_IMG_SUFFIX - .iter() - .any(|suffix| value.value.ends_with(suffix)) - { + if url_suffix_is_img(&value.value) { mask |= ImageLoadedMask::SRC; lazy_srcs.push(LazyImage { local: local_name!("src"), @@ -904,6 +968,8 @@ pub fn clean( } local_name!("img") => { let mask = img_loaded_mask(data); + // TODO(keur): We should maybe dom::clean_attr if src or srcset is presest + // but they are not in the mask. Means they are invalid. let (mask, lazy_srcs) = try_lazy_img(data, mask); if mask == ImageLoadedMask::NONE { true @@ -1073,4 +1139,31 @@ mod tests { let (mask, _) = try_lazy_img(elem, ImageLoadedMask::NONE); assert!(mask.is_all()); } + + #[test] + fn test_validate_image_candidate_string() { + // Example based off https://github.com/brave/brave-browser/issues/18280 + assert_eq!( + ImageCandidateError::URLFailedImageHeuristic, + validate_image_candidate_string("data:image/svg+xml,%3Csvg%20xmlns%3D").unwrap_err() + ); + assert!( + validate_image_candidate_string("https://cdn.theatlantic.com/imgf/original.jpg") + .is_ok() + ); + assert!(validate_image_candidate_string("image-480w.jpg 480w").is_ok()); + assert!(validate_image_candidate_string("image-480w.jpg 480w").is_ok()); + assert!(validate_image_candidate_string( + "/content/dam/news/2017/11/16/a08.jpeg?imwidth=480 480w" + ) + .is_ok()); + assert!(validate_image_candidate_string("https://i.guim.co.uk/img/media/603fd890c17afa94c6fd0e41f87875be104b811a/38_0_4848_2912/master/4848.jpg?width=160&quality=85&auto=format&fit=max&s=fc5a4d4e165ba28ea00817f416bae258 160w").is_ok()); + } + + #[test] + fn test_is_srcset() { + assert!(is_valid_srcset( + "https://test.com/original.jpg, https://test.com/original.jpg 2x" + )); + } }