From 376a2957832dd5873da491cf59e7899402a1906a Mon Sep 17 00:00:00 2001 From: Chris Patuzzo Date: Thu, 9 Nov 2023 18:01:55 +0000 Subject: [PATCH 1/2] Add a way to bypass the options[:tags] whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Readability is primarily concerned with extracting text, not images. We are using readability to extract images by setting `tags: %w[img]` which preserves tags in the output HTML. However, this won’t work if the image is nested under a non-whitelisted node, e.g. ```html
``` I think we basically just want to whitelist all nodes for extraction because our SplitCleanService already handles stripping of nodes it doesn’t care about. Therefore, add a mechanism to bypass the node whitelisting by setting `tags: %w[*]`, i.e. a wildcard. --- lib/readability.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/readability.rb b/lib/readability.rb index c618b8b..159747e 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -423,6 +423,8 @@ def sanitize(node, candidates, options = {}) # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] + all_whitelisted = base_whitelist.include?("*") + # We'll add whitespace instead of block elements, # so a
b will have a nice space between them base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] @@ -435,7 +437,7 @@ def sanitize(node, candidates, options = {}) ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes - if whitelist[el.node_name] + if all_whitelisted || whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } # Otherwise, replace the element with its contents From aeec549c735934f2623df63dec1c53296dae6b7e Mon Sep 17 00:00:00 2001 From: Chris Patuzzo Date: Fri, 10 Nov 2023 12:40:04 +0000 Subject: [PATCH 2/2] Add a unit test for deeply nested image tag preservation --- spec/fixtures/nested_images.html | 11 +++++++++++ spec/readability_spec.rb | 11 +++++++++++ 2 files changed, 22 insertions(+) create mode 100644 spec/fixtures/nested_images.html diff --git a/spec/fixtures/nested_images.html b/spec/fixtures/nested_images.html new file mode 100644 index 0000000..26610eb --- /dev/null +++ b/spec/fixtures/nested_images.html @@ -0,0 +1,11 @@ + + +
+
+
+ +
+
+
+ + diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 4ba7fd5..35c1e67 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -56,6 +56,7 @@ @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html") @thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html") @ch = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html") + @nested = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html") FakeWeb::Registry.instance.clean_registry @@ -103,6 +104,16 @@ ]) end + it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do + @doc = Readability::Document.new(@nested, attributes: ["src"]) + expect(@doc.images).to be_empty + + @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"]) + expect(@doc.images).to be_empty + + @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"]) + expect(@doc.content).to include('') + end it "should not try to download local images" do @doc = Readability::Document.new(<<-HTML)