BUG: Improve spacing for text extraction (#806)

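PyPDF2 now takes the positive / negative spacing adjustments between text blocks into account when extracting text: a negative number inserts a single space (unless the text already ends with one), and any other number removes a trailing space. Not very elegant, but the result looks way better than before.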
py-pdf · Apr 23, 2022 · d1be80d · d1be80d
1 parent d4c8cab
commit d1be80d
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 26 deletions.
diff --git a/PyPDF2/pdf.py b/PyPDF2/pdf.py
@@ -146,7 +146,7 @@ def set_need_appearances_writer(self):
             self._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
 
         except Exception as e:
-            print('set_need_appearances_writer() catch : ', repr(e))
+            logger.error('set_need_appearances_writer() catch : ', repr(e))
 
     def addPage(self, page):
         """
@@ -2777,7 +2777,7 @@ def compressContentStreams(self):
                 content = ContentStream(content, self.pdf)
             self[NameObject("/Contents")] = content.flateEncode()
 
-    def extractText(self, Tj_sep="", TJ_sep=" "):
+    def extractText(self, Tj_sep="", TJ_sep=""):
         """
         Locate all text drawing commands, in the order they are provided in the
         content stream, and extract the text.  This works well for some PDF
@@ -2819,6 +2819,15 @@ def extractText(self, Tj_sep="", TJ_sep=" "):
                     if isinstance(i, TextStringObject):
                         text += TJ_sep
                         text += i
+                    elif isinstance(i, NumberObject):
+                        # a positive value decreases and the negative value increases
+                        # space
+                        if int(i) < 0:
+                            if len(text) == 0 or text[-1] != " ":
+                                text += " "
+                        else:
+                            if len(text) > 1 and text[-1] == " ":
+                                text = text[:-1]
                 text += "\n"
         return text
 

diff --git a/Resources/crazyones.txt b/Resources/crazyones.txt
@@ -1,18 +1,18 @@
- The Cr azy Ones
- Octob er 14, 1998
- Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers.
- The round p egs in the square holes.
- The ones who see things di˙eren tly . Theyre not fond of rules. And
- they ha v e no resp ect for the status quo. Y ou can quote them,
- disagree with them, glorify or vilify them.
- Ab out the only thing y ou cant do is ignore them. Because they c hange
- things. They in v en t. They imagine. They heal. They explore. They
- create. They inspire. They push the h uman race forw ard.
- Ma yb e they ha v e to b e crazy .
- Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or
- sit in silence and hear a song thats nev er b een written? Or gaze at
- a red planet and see a lab oratory on wheels?
- W e mak e to ols for these kinds of p eople.
- While some see them as the crazy ones, w e see genius. Because the
- p eople who are crazy enough to think they can c hange the w orld,
- are the ones who do.
+The Crazy Ones
+October 14, 1998
+Heres to the crazy ones. The mis˝ts. The reb els. The troublemakers.
+The round p egs in the square holes.
+The ones who see things di˙erently. Theyre not fond of rules. And
+they have no resp ect for the status quo. You can quote them,
+disagree with them, glorify or vilify them.
+Ab out the only thing you cant do is ignore them. Because they change
+things. They invent. They imagine. They heal. They explore. They
+create. They inspire. They push the human race forward.
+Mayb e they have to b e crazy.
+How else can you stare at an empty canvas and see a work of art? Or
+sit in silence and hear a song thats never b een written? Or gaze at
+a red planet and see a lab oratory on wheels?
+We make to ols for these kinds of p eople.
+While some see them as the crazy ones, we see genius. Because the
+p eople who are crazy enough to think they can change the world,
+are the ones who do.
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -1,6 +1,7 @@
 import io
 import os
 import time
+from sys import version_info
 
 import pytest
 
@@ -10,13 +11,14 @@
 from PyPDF2.constants import Ressources as RES
 from PyPDF2.errors import PdfReadError
 from PyPDF2.filters import _xobj_to_image
-from sys import version_info
 
-if version_info < ( 3, 0 ):
+if version_info < (3, 0):
     from cStringIO import StringIO
+
     StreamIO = StringIO
 else:
     from io import BytesIO
+
     StreamIO = BytesIO
 
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
@@ -475,7 +477,7 @@ def test_get_destination_age_number():
 
 def test_do_not_get_stuck_on_large_files_without_start_xref():
     """Tests for the absence of a DoS bug, where a large file without an startxref mark
-    would cause the library to hang for minutes to hours """
+    would cause the library to hang for minutes to hours"""
     start_time = time.time()
     broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000)
     with pytest.raises(PdfReadError):

diff --git a/Tests/test_workflows.py b/Tests/test_workflows.py
@@ -31,7 +31,7 @@ def test_PdfReaderFileLoad():
         with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
             pdftext = pdftext_file.read()
 
-        text = page.extractText().encode("utf-8")
+        text = page.extractText(Tj_sep="", TJ_sep="").encode("utf-8")
 
         # Compare the text of the PDF to a known source
         for expected_line, actual_line in zip(text.split(b"\n"), pdftext.split(b"\n")):

diff --git a/Tests/test_writer.py b/Tests/test_writer.py
@@ -96,7 +96,8 @@ def test_remove_images(input_path, ignoreByteStringObject):
     with open(tmp_filename, "rb") as input_stream:
         reader = PdfFileReader(input_stream)
         if input_path == "side-by-side-subfig.pdf":
-            assert "Lorem ipsum dolor sit amet" in reader.getPage(0).extractText()
+            extracted_text = reader.getPage(0).extractText()
+            assert "Lorem ipsum dolor sit amet" in extracted_text
 
     # Cleanup
     os.remove(tmp_filename)
@@ -166,7 +167,9 @@ def test_fill_form():
 
     writer.addPage(page)
 
-    writer.updatePageFormFieldValues(writer.getPage(0), {"foo": "some filled in text"}, flags=1)
+    writer.updatePageFormFieldValues(
+        writer.getPage(0), {"foo": "some filled in text"}, flags=1
+    )
 
     # write "output" to PyPDF2-output.pdf
     tmp_filename = "dont_commit_filled_pdf.pdf"