Merge pull request #94 from beyondwords-io/allow-passing-in-elements-to-score

Allow passing in an array of elements_to_score and add 'pre' as a default
cantino · Nov 11, 2023 · 599ed39 · 599ed39
2 parents b1b87ff + 4743551
commit 599ed39
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 3 deletions.
diff --git a/lib/readability.rb b/lib/readability.rb
@@ -17,7 +17,9 @@ class Document
       :min_image_height           => 80,
       :ignore_image_format        => [],
       :blacklist                  => nil,
-      :whitelist                  => nil
+      :whitelist                  => nil,
+      :elements_to_score          => ["p", "td", "pre"],
+      :likely_siblings            => ["p"]
     }.freeze
 
     REGEXES = {
@@ -260,13 +262,14 @@ def get_article(candidates, best_candidate)
       # Things like preambles, content split by ads that we removed, etc.
 
       sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+      downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
       output = Nokogiri::XML::Node.new('div', @html)
       best_candidate[:elem].parent.children.each do |sibling|
         append = false
         append = true if sibling == best_candidate[:elem]
         append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
 
-        if sibling.name.downcase == "p"
+        if downcased_likely_siblings.include?(sibling.name.downcase)
           link_density = get_link_density(sibling)
           node_content = sibling.text
           node_length = node_content.length
@@ -310,7 +313,7 @@ def get_link_density(elem)
 
     def score_paragraphs(min_text_length)
       candidates = {}
-      @html.css("p,td").each do |elem|
+      @html.css(options[:elements_to_score].join(',')).each do |elem|
         parent_node = elem.parent
         grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
         inner_text = elem.text

diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
@@ -376,6 +376,126 @@
         expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
       end
     end
+
+    it "does not include short paragraphs as related siblings in the output" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <section>
+              <p>Too short</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).not_to include("Too short")
+    end
+
+    it "includes long paragraphs as related siblings in the output" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("This paragraph is longer")
+    end
+
+    it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <section>
+              <p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).not_to include("Although this paragraph")
+    end
+
+    it "does include non-paragraph tags in the output if their content score is high enough" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+            </section>
+            <section>
+              <p>This should be included in the output because the content score is high enough.</p>
+              <p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("This should be included")
+    end
+
+    it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+            </section>
+            <section>
+              <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
+              <p>The likely_siblings now include the section tag so it should be included in the output.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("should be included")
+    end
   end
 
   describe "the cant_read.html fixture" do