From 376a2957832dd5873da491cf59e7899402a1906a Mon Sep 17 00:00:00 2001
From: Chris Patuzzo <chris@patuzzo.co.uk>
Date: Thu, 9 Nov 2023 18:01:55 +0000
Subject: [PATCH 1/2] Add a way to bypass the options[:tags] whitelist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Readability is primarily concerned with extracting text, not images. We
are using readability to extract images by setting `tags: %w[img]` which
preserves <img> tags in the output HTML. However, this won’t work if the
image is nested under a non-whitelisted node, e.g.

```html
<figure>
  <img src=“…” />
</figure>
```

I think we basically just want to whitelist all nodes for extraction
because our SplitCleanService already handles stripping of nodes it
doesn’t care about. Therefore, add a mechanism to bypass the node
whitelisting by setting `tags: %w[*]`, i.e. a wildcard.
---
 lib/readability.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lib/readability.rb b/lib/readability.rb
index c618b8b..159747e 100644
--- a/lib/readability.rb
+++ b/lib/readability.rb
@@ -423,6 +423,8 @@ def sanitize(node, candidates, options = {})
 
       # We'll sanitize all elements using a whitelist
       base_whitelist = @options[:tags] || %w[div p]
+      all_whitelisted = base_whitelist.include?("*")
+
       # We'll add whitespace instead of block elements,
       # so a<br>b will have a nice space between them
       base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
@@ -435,7 +437,7 @@ def sanitize(node, candidates, options = {})
 
       ([node] + node.css("*")).each do |el|
         # If element is in whitelist, delete all its attributes
-        if whitelist[el.node_name]
+        if all_whitelisted || whitelist[el.node_name]
           el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
 
           # Otherwise, replace the element with its contents

From aeec549c735934f2623df63dec1c53296dae6b7e Mon Sep 17 00:00:00 2001
From: Chris Patuzzo <chris@patuzzo.co.uk>
Date: Fri, 10 Nov 2023 12:40:04 +0000
Subject: [PATCH 2/2] Add a unit test for deeply nested image tag preservation

---
 spec/fixtures/nested_images.html | 11 +++++++++++
 spec/readability_spec.rb         | 11 +++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 spec/fixtures/nested_images.html

diff --git a/spec/fixtures/nested_images.html b/spec/fixtures/nested_images.html
new file mode 100644
index 0000000..26610eb
--- /dev/null
+++ b/spec/fixtures/nested_images.html
@@ -0,0 +1,11 @@
+<html>
+  <body>
+    <article>
+      <section>
+        <figure>
+          <img src="http://example.com/image.jpeg" />
+        </figure>
+      </section>
+    </article>
+  </body>
+</html>
diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
index 4ba7fd5..35c1e67 100644
--- a/spec/readability_spec.rb
+++ b/spec/readability_spec.rb
@@ -56,6 +56,7 @@
       @nytimes  = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
       @thesun   = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
       @ch       = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html")
+      @nested   = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html")
 
       FakeWeb::Registry.instance.clean_registry
 
@@ -103,6 +104,16 @@
       ])
     end
 
+    it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do
+      @doc = Readability::Document.new(@nested, attributes: ["src"])
+      expect(@doc.images).to be_empty
+
+      @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"])
+      expect(@doc.images).to be_empty
+
+      @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"])
+      expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
+    end
 
     it "should not try to download local images" do
       @doc = Readability::Document.new(<<-HTML)