Skip to content

Commit

Permalink
Merge pull request #94 from beyondwords-io/allow-passing-in-elements-to-score
Browse files: browse the repository at this point in the history

Allow passing in an array of elements_to_score and add 'pre' as a default
  • Loading branch information
cantino authored Nov 11, 2023
2 parents b1b87ff + 4743551 commit 599ed39
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 3 deletions.
9 changes: 6 additions & 3 deletions lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ class Document
:min_image_height => 80,
:ignore_image_format => [],
:blacklist => nil,
:whitelist => nil
:whitelist => nil,
:elements_to_score => ["p", "td", "pre"],
:likely_siblings => ["p"]
}.freeze

REGEXES = {
Expand Down Expand Up @@ -260,13 +262,14 @@ def get_article(candidates, best_candidate)
# Things like preambles, content split by ads that we removed, etc.

sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
output = Nokogiri::XML::Node.new('div', @html)
best_candidate[:elem].parent.children.each do |sibling|
append = false
append = true if sibling == best_candidate[:elem]
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

if sibling.name.downcase == "p"
if downcased_likely_siblings.include?(sibling.name.downcase)
link_density = get_link_density(sibling)
node_content = sibling.text
node_length = node_content.length
Expand Down Expand Up @@ -310,7 +313,7 @@ def get_link_density(elem)

def score_paragraphs(min_text_length)
candidates = {}
@html.css("p,td").each do |elem|
@html.css(options[:elements_to_score].join(',')).each do |elem|
parent_node = elem.parent
grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
inner_text = elem.text
Expand Down
120 changes: 120 additions & 0 deletions spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,126 @@
expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
end
end

# Sibling-selection spec: the second <section> wraps only a short paragraph,
# so its text is expected to be absent from the extracted content.
it "does not include short paragraphs as related siblings in the output" do
  markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body>
<section>
<p>Paragraph 1</p>
<p>Paragraph 2</p>
</section>
<section>
<p>Too short</p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
  HTML
  @doc = Readability::Document.new(markup, min_text_length: 1, elements_to_score: %w[h1 p])

  # Both paragraphs of the first section must survive extraction.
  ["Paragraph 1", "Paragraph 2"].each do |kept_text|
    expect(@doc.content).to include(kept_text)
  end
  expect(@doc.content).not_to include("Too short")
end

# Sibling-selection spec: a long <p> placed directly next to the main
# <section> is expected to be pulled into the extracted content.
it "includes long paragraphs as related siblings in the output" do
  markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body>
<section>
<p>Paragraph 1</p>
<p>Paragraph 2</p>
</section>
<p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
  HTML
  @doc = Readability::Document.new(markup, min_text_length: 1, elements_to_score: %w[h1 p])

  # The section's own paragraphs and the long sibling paragraph must all appear.
  ["Paragraph 1", "Paragraph 2", "This paragraph is longer"].each do |kept_text|
    expect(@doc.content).to include(kept_text)
  end
end

# Sibling-selection spec: the long paragraph is wrapped in a second <section>,
# so the sibling under consideration is the section rather than a <p>.
# No :likely_siblings option is passed here, and the text is expected to be
# left out of the extracted content.
it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
  markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body>
<section>
<p>Paragraph 1</p>
<p>Paragraph 2</p>
</section>
<section>
<p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
  HTML
  @doc = Readability::Document.new(markup, min_text_length: 1, elements_to_score: %w[h1 p])

  # Both paragraphs of the first section must survive extraction.
  ["Paragraph 1", "Paragraph 2"].each do |kept_text|
    expect(@doc.content).to include(kept_text)
  end
  expect(@doc.content).not_to include("Although this paragraph")
end

# Sibling-selection spec: a sibling <section> is expected to be appended to
# the output when its own content score is high enough. The first section
# repeats "Paragraph 2" ten times so that it stays the best candidate (see the
# inline HTML comment), and the second section uses a comma-heavy paragraph to
# raise its score.
#
# Fixture fix: the first paragraph of the second section used to end with an
# opening <p> instead of </p>, so the test depended on the HTML parser's
# recovery from malformed markup. The tag is now closed properly and the
# "content is score is" typo in the sentence is corrected.
it "does include non-paragraph tags in the output if their content score is high enough" do
  @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
<html>
<head>
<title>title!</title>
</head>
<body>
<section>
<p>Paragraph 1</p>
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
</section>
<section>
<p>This should be included in the output because the content score is high enough.</p>
<p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
  HTML

  # The best candidate's paragraphs and the high-scoring sibling section's
  # text must all be present in the extracted content.
  expect(@doc.content).to include("Paragraph 1")
  expect(@doc.content).to include("Paragraph 2")
  expect(@doc.content).to include("This should be included")
end

# Sibling-selection spec: with likely_siblings set to ["section"], a sibling
# <section> holding long paragraphs is expected to be appended to the output.
# The first section repeats "Paragraph 2" ten times so that it stays the best
# candidate (see the inline HTML comment).
it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
  markup = <<-HTML
<html>
<head>
<title>title!</title>
</head>
<body>
<section>
<p>Paragraph 1</p>
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
</section>
<section>
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
  HTML
  @doc = Readability::Document.new(
    markup,
    min_text_length: 1,
    elements_to_score: %w[h1 p],
    likely_siblings: %w[section]
  )

  # The best candidate's paragraphs and the sibling section's text must appear.
  ["Paragraph 1", "Paragraph 2", "should be included"].each do |kept_text|
    expect(@doc.content).to include(kept_text)
  end
end
end

describe "the cant_read.html fixture" do
Expand Down

0 comments on commit 599ed39

Please sign in to comment.