Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

speedreader: Throw out srcset if they don't look like images #10238

Merged
merged 1 commit into from
Sep 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ fn main() {

extract_wrap("dutchnews_1", &mut files, &css);
extract_wrap("nytimes_1", &mut files, &css);
extract_wrap("cbs_1", &mut files, &css);
extract_wrap("cnet_2", &mut files, &css);
extract_wrap("medium_2", &mut files, &css);
extract_wrap("vanityfair_1", &mut files, &css);
Expand Down
143 changes: 118 additions & 25 deletions components/speedreader/rust/lib/src/readability/src/scorer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ static ALTER_TO_DIV_EXCEPTIONS: [&LocalName; 3] = [
&local_name!("section"),
&local_name!("p"),
];
static LOADABLE_IMG_SUFFIX: [&str; 4] = [".jpg", ".jpeg", ".png", ".webp"];
static PRESENTATIONAL_ATTRIBUTES: [&LocalName; 12] = [
&local_name!("align"),
&local_name!("background"),
Expand Down Expand Up @@ -142,6 +141,32 @@ bitflags::bitflags! {
const NONE = 0;
}
}

/// Checks if a url looks like an image. Ignores url params.
fn url_suffix_is_img(src: &str) -> bool {
static LOADABLE_IMG_SUFFIX: [&str; 4] = [".jpg", ".jpeg", ".png", ".webp"];

if LOADABLE_IMG_SUFFIX
.iter()
.any(|suffix| src.ends_with(suffix))
{
return true;
}

// Try again with URL params stripped
let is_loadable_path = |src: &str| match Url::parse(src) {
Ok(url) => LOADABLE_IMG_SUFFIX
.iter()
.any(|suffix| url.path().ends_with(suffix)),
_ => false,
};
if src.starts_with('/') {
is_loadable_path(format!("data:{}", src).as_str())
} else {
is_loadable_path(src)
}
}

/// Returns true if the image src loads without JavaScript logic. Some sites like Kotaku and The
/// Atlantic try and get fancy with this. Our simple heuristic is to check if src is set something
/// reasonable or srcset is set at all. Some things we've seen in the wild are srcset being left
Expand All @@ -150,39 +175,81 @@ bitflags::bitflags! {
fn img_loaded_mask(data: &ElementData) -> ImageLoadedMask {
let mut mask: ImageLoadedMask = ImageLoadedMask::NONE;
if let Some(src) = data.attributes.borrow().get(local_name!("src")) {
for suffix in LOADABLE_IMG_SUFFIX.iter() {
if src.ends_with(suffix) {
mask |= ImageLoadedMask::SRC;
break;
}
}
if !mask.contains(ImageLoadedMask::SRC) {
// Try parsing the URL to ignore url params
match Url::parse(src) {
Ok(url) => {
for suffix in LOADABLE_IMG_SUFFIX.iter() {
if url.path().ends_with(suffix) {
mask |= ImageLoadedMask::SRC;
break;
}
}
}
_ => (),
}
if url_suffix_is_img(&src) {
mask |= ImageLoadedMask::SRC;
}
}

if data
.attributes
.borrow()
.get(local_name!("srcset"))
.is_some()
.map(|srcset| is_valid_srcset(&srcset))
.unwrap_or(false)
{
mask |= ImageLoadedMask::SRCSET;
}
mask
}

#[derive(Debug, PartialEq)]
pub enum ImageCandidateError {
InvalidURL,
URLFailedImageHeuristic,
InvalidPixelDensityDescriptor,
InvalidWidthDescriptor,
}

impl From<url::ParseError> for ImageCandidateError {
fn from(_err: url::ParseError) -> Self {
ImageCandidateError::InvalidURL
}
}

impl From<std::num::ParseFloatError> for ImageCandidateError {
fn from(_err: std::num::ParseFloatError) -> Self {
ImageCandidateError::InvalidPixelDensityDescriptor
}
}

impl From<std::num::ParseIntError> for ImageCandidateError {
fn from(_err: std::num::ParseIntError) -> Self {
ImageCandidateError::InvalidWidthDescriptor
}
}

#[inline]
fn is_valid_srcset(srcset: &str) -> bool {
srcset
.split(',')
.all(|candidate| validate_image_candidate_string(candidate).is_ok())
}

/// We roughly follow the guidelines at https://html.spec.whatwg.org/multipage/urls-and-fetching.html#valid-non-empty-url
/// while also denying urls that don't look like images.
fn validate_image_candidate_string(candidate: &str) -> Result<(), ImageCandidateError> {
let c = candidate.trim();
// Extract url and descriptor separately
let (url, descriptor) = if let Some(offset) = c.find(|c: char| c.is_ascii_whitespace()) {
(&c[..offset], Some(c[offset..].trim_start()))
} else {
(c, None)
};

if !url_suffix_is_img(&url) {
return Err(ImageCandidateError::URLFailedImageHeuristic);
}

if let Some(descriptor) = descriptor {
if descriptor.ends_with('x') {
descriptor[..descriptor.len() - 1].parse::<f64>()?;
} else if descriptor.ends_with('w') {
descriptor[..descriptor.len() - 1].parse::<u64>()?;
}
}
Ok(())
}

/// Contains metadata about how to load a lazy loaded image.
struct LazyImage {
local: LocalName,
Expand Down Expand Up @@ -211,10 +278,7 @@ fn try_lazy_img(
}
}
if !mask.contains(ImageLoadedMask::SRC) {
if LOADABLE_IMG_SUFFIX
.iter()
.any(|suffix| value.value.ends_with(suffix))
{
if url_suffix_is_img(&value.value) {
mask |= ImageLoadedMask::SRC;
lazy_srcs.push(LazyImage {
local: local_name!("src"),
Expand Down Expand Up @@ -904,6 +968,8 @@ pub fn clean<S: ::std::hash::BuildHasher>(
}
local_name!("img") => {
let mask = img_loaded_mask(data);
// TODO(keur): We should maybe dom::clean_attr if src or srcset is presest
// but they are not in the mask. Means they are invalid.
let (mask, lazy_srcs) = try_lazy_img(data, mask);
if mask == ImageLoadedMask::NONE {
true
Expand Down Expand Up @@ -1073,4 +1139,31 @@ mod tests {
let (mask, _) = try_lazy_img(elem, ImageLoadedMask::NONE);
assert!(mask.is_all());
}

#[test]
fn test_validate_image_candidate_string() {
// Example based off https://github.com/brave/brave-browser/issues/18280
assert_eq!(
ImageCandidateError::URLFailedImageHeuristic,
validate_image_candidate_string("data:image/svg+xml,%3Csvg%20xmlns%3D").unwrap_err()
);
assert!(
validate_image_candidate_string("https://cdn.theatlantic.com/imgf/original.jpg")
.is_ok()
);
assert!(validate_image_candidate_string("image-480w.jpg 480w").is_ok());
assert!(validate_image_candidate_string("image-480w.jpg 480w").is_ok());
assert!(validate_image_candidate_string(
"/content/dam/news/2017/11/16/a08.jpeg?imwidth=480 480w"
)
.is_ok());
assert!(validate_image_candidate_string("https://i.guim.co.uk/img/media/603fd890c17afa94c6fd0e41f87875be104b811a/38_0_4848_2912/master/4848.jpg?width=160&amp;quality=85&amp;auto=format&amp;fit=max&amp;s=fc5a4d4e165ba28ea00817f416bae258 160w").is_ok());
}

#[test]
fn test_is_srcset() {
assert!(is_valid_srcset(
"https://test.com/original.jpg, https://test.com/original.jpg 2x"
));
}
}