diff --git a/lib/readability.rb b/lib/readability.rb index c618b8b..2a3f988 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -17,7 +17,9 @@ class Document :min_image_height => 80, :ignore_image_format => [], :blacklist => nil, - :whitelist => nil + :whitelist => nil, + :elements_to_score => ["p", "td", "pre"], + :likely_siblings => ["p"] }.freeze REGEXES = { @@ -260,13 +262,14 @@ def get_article(candidates, best_candidate) # Things like preambles, content split by ads that we removed, etc. sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max + downcased_likely_siblings = options[:likely_siblings].map(&:downcase) output = Nokogiri::XML::Node.new('div', @html) best_candidate[:elem].parent.children.each do |sibling| append = false append = true if sibling == best_candidate[:elem] append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold - if sibling.name.downcase == "p" + if downcased_likely_siblings.include?(sibling.name.downcase) link_density = get_link_density(sibling) node_content = sibling.text node_length = node_content.length @@ -310,7 +313,7 @@ def get_link_density(elem) def score_paragraphs(min_text_length) candidates = {} - @html.css("p,td").each do |elem| + @html.css(options[:elements_to_score].join(',')).each do |elem| parent_node = elem.parent grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil inner_text = elem.text diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 4ba7fd5..4e45e0e 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -376,6 +376,126 @@ expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1') end end + + it "does not include short paragraphs as related siblings in the output" do + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"]) + + + title! + + +
+

Paragraph 1

+

Paragraph 2

+
+
+

Too short

+
+ #{'This link lowers the body score.' * 5} + + + HTML + + expect(@doc.content).to include("Paragraph 1") + expect(@doc.content).to include("Paragraph 2") + expect(@doc.content).not_to include("Too short") + end + + it "includes long paragraphs as related siblings in the output" do + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"]) + + + title! + + +
+

Paragraph 1

+

Paragraph 2

+
+

This paragraph is longer than 80 characters so should be included as a sibling in the output.

+ #{'This link lowers the body score.' * 5} + + + HTML + + expect(@doc.content).to include("Paragraph 1") + expect(@doc.content).to include("Paragraph 2") + expect(@doc.content).to include("This paragraph is longer") + end + + it "does not include non-paragraph tags in the output, even when longer than 80 characters" do + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"]) + + + title! + + +
+

Paragraph 1

+

Paragraph 2

+
+
+

Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.

+
+ #{'This link lowers the body score.' * 5} + + + HTML + + expect(@doc.content).to include("Paragraph 1") + expect(@doc.content).to include("Paragraph 2") + expect(@doc.content).not_to include("Although this paragraph") + end + + it "does include non-paragraph tags in the output if their content score is high enough" do + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"]) + + + title! + + +
+

Paragraph 1

+ #{'

Paragraph 2

' * 10} +
+
+

This should be included in the output because the content score is high enough.

+

The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.

+
+ #{'This link lowers the body score.' * 5} + + + HTML + + expect(@doc.content).to include("Paragraph 1") + expect(@doc.content).to include("Paragraph 2") + expect(@doc.content).to include("This should be included") + end + + it "can optionally include other related siblings in the output if they meet the 80 character threshold" do + @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"]) + + + title! + + +
+

Paragraph 1

+ #{'

Paragraph 2

' * 10} +
+
+

This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.

+

The likely_siblings now include the section tag so it should be included in the output.

+
+ #{'This link lowers the body score.' * 5} + + + HTML + + expect(@doc.content).to include("Paragraph 1") + expect(@doc.content).to include("Paragraph 2") + expect(@doc.content).to include("should be included") + end end describe "the cant_read.html fixture" do