Skip to content

Commit

Permalink
BUG: Improve spacing for text extraction (#806)
Browse files Browse the repository at this point in the history
PyPDF2 now takes positive / negative spaces between text blocks into account. Not very elegant, but the result looks way better than before.
  • Loading branch information
MartinThoma authored Apr 23, 2022
1 parent d4c8cab commit d1be80d
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 26 deletions.
13 changes: 11 additions & 2 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def set_need_appearances_writer(self):
self._root_object["/AcroForm"][need_appearances] = BooleanObject(True)

except Exception as e:
print('set_need_appearances_writer() catch : ', repr(e))
logger.error('set_need_appearances_writer() catch : ', repr(e))

def addPage(self, page):
"""
Expand Down Expand Up @@ -2777,7 +2777,7 @@ def compressContentStreams(self):
content = ContentStream(content, self.pdf)
self[NameObject("/Contents")] = content.flateEncode()

def extractText(self, Tj_sep="", TJ_sep=" "):
def extractText(self, Tj_sep="", TJ_sep=""):
"""
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF
Expand Down Expand Up @@ -2819,6 +2819,15 @@ def extractText(self, Tj_sep="", TJ_sep=" "):
if isinstance(i, TextStringObject):
text += TJ_sep
text += i
elif isinstance(i, NumberObject):
# a positive value decreases the space, while a negative value
# increases it
if int(i) < 0:
if len(text) == 0 or text[-1] != " ":
text += " "
else:
if len(text) > 1 and text[-1] == " ":
text = text[:-1]
text += "\n"
return text

Expand Down
36 changes: 18 additions & 18 deletions Resources/crazyones.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
The Cr azy Ones
Octob er 14, 1998
Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers.
The round p egs in the square holes.
The ones who see things di˙eren tly . Theyre not fond of rules. And
they ha v e no resp ect for the status quo. Y ou can quote them,
disagree with them, glorify or vilify them.
Ab out the only thing y ou cant do is ignore them. Because they c hange
things. They in v en t. They imagine. They heal. They explore. They
create. They inspire. They push the h uman race forw ard.
Ma yb e they ha v e to b e crazy .
Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or
sit in silence and hear a song thats nev er b een written? Or gaze at
a red planet and see a lab oratory on wheels?
W e mak e to ols for these kinds of p eople.
While some see them as the crazy ones, w e see genius. Because the
p eople who are crazy enough to think they can c hange the w orld,
are the ones who do.
The Crazy Ones
October 14, 1998
Heres to the crazy ones. The mis˝ts. The reb els. The troublemakers.
The round p egs in the square holes.
The ones who see things di˙erently. Theyre not fond of rules. And
they have no resp ect for the status quo. You can quote them,
disagree with them, glorify or vilify them.
Ab out the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.
Mayb e they have to b e crazy.
How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never b een written? Or gaze at
a red planet and see a lab oratory on wheels?
We make to ols for these kinds of p eople.
While some see them as the crazy ones, we see genius. Because the
p eople who are crazy enough to think they can change the world,
are the ones who do.
8 changes: 5 additions & 3 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import io
import os
import time
from sys import version_info

import pytest

Expand All @@ -10,13 +11,14 @@
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError
from PyPDF2.filters import _xobj_to_image
from sys import version_info

if version_info < ( 3, 0 ):
if version_info < (3, 0):
from cStringIO import StringIO

StreamIO = StringIO
else:
from io import BytesIO

StreamIO = BytesIO

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -475,7 +477,7 @@ def test_get_destination_age_number():

def test_do_not_get_stuck_on_large_files_without_start_xref():
"""Tests for the absence of a DoS bug, where a large file without an startxref mark
would cause the library to hang for minutes to hours """
would cause the library to hang for minutes to hours"""
start_time = time.time()
broken_stream = StreamIO(b"\0" * 5 * 1000 * 1000)
with pytest.raises(PdfReadError):
Expand Down
2 changes: 1 addition & 1 deletion Tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_PdfReaderFileLoad():
with open(os.path.join(RESOURCE_ROOT, "crazyones.txt"), "rb") as pdftext_file:
pdftext = pdftext_file.read()

text = page.extractText().encode("utf-8")
text = page.extractText(Tj_sep="", TJ_sep="").encode("utf-8")

# Compare the text of the PDF to a known source
for expected_line, actual_line in zip(text.split(b"\n"), pdftext.split(b"\n")):
Expand Down
7 changes: 5 additions & 2 deletions Tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ def test_remove_images(input_path, ignoreByteStringObject):
with open(tmp_filename, "rb") as input_stream:
reader = PdfFileReader(input_stream)
if input_path == "side-by-side-subfig.pdf":
assert "Lorem ipsum dolor sit amet" in reader.getPage(0).extractText()
extracted_text = reader.getPage(0).extractText()
assert "Lorem ipsum dolor sit amet" in extracted_text

# Cleanup
os.remove(tmp_filename)
Expand Down Expand Up @@ -166,7 +167,9 @@ def test_fill_form():

writer.addPage(page)

writer.updatePageFormFieldValues(writer.getPage(0), {"foo": "some filled in text"}, flags=1)
writer.updatePageFormFieldValues(
writer.getPage(0), {"foo": "some filled in text"}, flags=1
)

# write "output" to PyPDF2-output.pdf
tmp_filename = "dont_commit_filled_pdf.pdf"
Expand Down

0 comments on commit d1be80d

Please sign in to comment.