From a088fe0fb24b4afaa91adac85a417dcd603c3444 Mon Sep 17 00:00:00 2001
From: Chris Patuzzo <chris@patuzzo.co.uk>
Date: Thu, 9 Nov 2023 13:38:22 +0000
Subject: [PATCH 1/2] Allow passing in an array of elements_to_score and add
 'pre' as a default

We were experiencing a problem where the h1 text was not being included
in the Readability#content. Here is an example that demonstrates the problem:

```
<article>
  <header>
    <h1>Title</h1>
  </header>
  <section>
    <p>Paragraph</p>
  </section>
</article>
```

Previously, the code would add the <p>, <section> and <article> elements
as @candidates because it adds the parent and grand parent of every <p>. It
would not add the <header> element as a candidate.

Then, the best_candidate with the highest score is the <section> element.
The code then tries to add related siblings in #get_article but it wasn't
adding the <header> element because it wasn't in the list of candidates.

We can solve this problem by adding <h1> to the list of elements to score
which will then ensure that <header> parent is included in the
candidates and can be added as a related sibling.

This commit also adds <pre> to the list of default nodes to score
because it is included in arc90's original code here:

https://github.com/masukomi/arc90-readability/blob/master/js/readability.js#L749

I'm not sure why this was omitted.
---
 lib/readability.rb | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/lib/readability.rb b/lib/readability.rb
index c618b8b..2b98cf4 100644
--- a/lib/readability.rb
+++ b/lib/readability.rb
@@ -17,7 +17,8 @@ class Document
       :min_image_height           => 80,
       :ignore_image_format        => [],
       :blacklist                  => nil,
-      :whitelist                  => nil
+      :whitelist                  => nil,
+      :elements_to_score          => ["p", "td", "pre"]
     }.freeze
     
     REGEXES = {
@@ -310,7 +311,7 @@ def get_link_density(elem)
 
     def score_paragraphs(min_text_length)
       candidates = {}
-      @html.css("p,td").each do |elem|
+      @html.css(options[:elements_to_score].join(',')).each do |elem|
         parent_node = elem.parent
         grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
         inner_text = elem.text

From 47435513ff98715584e4d13fd6f5052277528dfa Mon Sep 17 00:00:00 2001
From: Chris Patuzzo <chris@patuzzo.co.uk>
Date: Fri, 10 Nov 2023 14:10:50 +0000
Subject: [PATCH 2/2] Add unit tests for related siblings and introduce a
 :likely_siblings option
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code had two strategies for determining whether to include siblings
in the output after determining the best candidate based on score:

1) It checked if the sibling is a candidate that scored above a threshold
which is the maximum of 10 and 0.2 of the best_candidate’s score.

2) It checked if the sibling was a paragraph that was longer than 80
characters with a penalty given for each link within the paragraph.

Neither of these strategies worked well for extracting <h1> titles:

1) Failed because titles score poorly due to not containing many commas
2) Failed because titles are within <h1> or <header> elements

However, titles are usually longer than 80 characters and don’t contain
links so it seems reasonable to modify strategy 2) to allow for other
elements, such as <h1> and <header> to be included as related siblings.

Therefore, this commit introduces a :likely_silings option that defaults
to the same <p> elements as before but can now be set by the developer
to include other elements such as <h1> and <header>. These are not
added by default to remain in sync with Arc90’s original implementation.
---
 lib/readability.rb       |   6 +-
 spec/readability_spec.rb | 120 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/lib/readability.rb b/lib/readability.rb
index 2b98cf4..2a3f988 100644
--- a/lib/readability.rb
+++ b/lib/readability.rb
@@ -18,7 +18,8 @@ class Document
       :ignore_image_format        => [],
       :blacklist                  => nil,
       :whitelist                  => nil,
-      :elements_to_score          => ["p", "td", "pre"]
+      :elements_to_score          => ["p", "td", "pre"],
+      :likely_siblings            => ["p"]
     }.freeze
     
     REGEXES = {
@@ -261,13 +262,14 @@ def get_article(candidates, best_candidate)
       # Things like preambles, content split by ads that we removed, etc.
 
       sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+      downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
       output = Nokogiri::XML::Node.new('div', @html)
       best_candidate[:elem].parent.children.each do |sibling|
         append = false
         append = true if sibling == best_candidate[:elem]
         append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
 
-        if sibling.name.downcase == "p"
+        if downcased_likely_siblings.include?(sibling.name.downcase)
           link_density = get_link_density(sibling)
           node_content = sibling.text
           node_length = node_content.length
diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
index 4ba7fd5..4e45e0e 100644
--- a/spec/readability_spec.rb
+++ b/spec/readability_spec.rb
@@ -376,6 +376,126 @@
         expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
       end
     end
+
+    it "does not include short paragraphs as related siblings in the output" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <section>
+              <p>Too short</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).not_to include("Too short")
+    end
+
+    it "includes long paragraphs as related siblings in the output" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("This paragraph is longer")
+    end
+
+    it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <section>
+              <p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).not_to include("Although this paragraph")
+    end
+
+    it "does include non-paragraph tags in the output if their content score is high enough" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+            </section>
+            <section>
+              <p>This should be included in the output because the content is score is high enough.<p>
+              <p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("This should be included")
+    end
+
+    it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+            </section>
+            <section>
+              <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
+              <p>The likely_siblings now include the section tag so it should be included in the output.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("should be included")
+    end
   end
 
   describe "the cant_read.html fixture" do