Skip to content

Commit

Permalink
Merge pull request #95 from beyondwords-io/allow-a-wildcard-in-the-ta…
Browse files Browse the repository at this point in the history
…gs-option

Add a way to bypass the options[:tags] whitelist
  • Loading branch information
cantino authored Nov 11, 2023
2 parents 599ed39 + aeec549 commit cc1b0d2
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 1 deletion.
4 changes: 3 additions & 1 deletion lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,8 @@ def sanitize(node, candidates, options = {})

# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
all_whitelisted = base_whitelist.include?("*")

# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
Expand All @@ -438,7 +440,7 @@ def sanitize(node, candidates, options = {})

([node] + node.css("*")).each do |el|
# If element is in whitelist (or every tag is whitelisted via "*"), delete all its attributes except those listed in @options[:attributes]
if whitelist[el.node_name]
if all_whitelisted || whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }

# Otherwise, replace the element with its contents
Expand Down
11 changes: 11 additions & 0 deletions spec/fixtures/nested_images.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<html>
<body>
<article>
<section>
<figure>
<img src="http://example.com/image.jpeg" />
</figure>
</section>
</article>
</body>
</html>
11 changes: 11 additions & 0 deletions spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
@nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
@thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
@ch = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html")
@nested = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html")

FakeWeb::Registry.instance.clean_registry

Expand Down Expand Up @@ -103,6 +104,16 @@
])
end

# Exercises the "*" entry in the :tags option against the nested_images.html
# fixture (loaded into @nested), where an <img> sits inside
# <article>/<section>/<figure>.
it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do
# No :tags given, so the default tag whitelist is used; no images are expected.
@doc = Readability::Document.new(@nested, attributes: ["src"])
expect(@doc.images).to be_empty

# An explicit tag list without the wildcard; still no images are expected.
# NOTE(review): the fixture's element is <img>, not "image" — confirm whether
# "img" was intended here (that may change what this expectation should be).
@doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"])
expect(@doc.images).to be_empty

# With "*" in :tags every element is treated as whitelisted, and "src" is
# retained through :attributes, so the <img> markup shows up in the content.
@doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"])
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
end

it "should not try to download local images" do
@doc = Readability::Document.new(<<-HTML)
Expand Down

0 comments on commit cc1b0d2

Please sign in to comment.