Skip to content

Commit

Permalink
BUG: Fix incorrect tm_matrix in call to visitor_text (py-pdf#2060)
Browse files Browse the repository at this point in the history
Supply the old tm_matrix when flushing out `text` to the `visitor_text`
in `crlf_space_check`. The new one may already have changed and be
unrelated to the current text.

Also add a test for the tm_matrix and cm_matrix that are given to
`visitor_text` when extracting text.
The test computes the coordinates of three letters in different
parts of a test page based on the matrices and checks that they are
roughly where they should be.

Fixes py-pdf#2059
  • Loading branch information
troethe authored Aug 13, 2023
1 parent 46ac7ad commit 4458dc6
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1936,7 +1936,7 @@ def _extract_text(
1.0,
0.0,
0.0,
] # will store cm_matrix * tm_matrix
] # will store previous tm_matrix
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
Expand Down
15 changes: 8 additions & 7 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ def crlf_space_check(
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float]]:
m_prev = mult(tm_prev, cm_matrix)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - tm_prev[4]
delta_y = m[5] - tm_prev[5]
delta_x = m[4] - m_prev[4]
delta_y = m[5] - m_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
tm_prev = m
if orientation not in orientations:
raise OrientationNotFoundError
try:
Expand All @@ -117,7 +117,7 @@ def crlf_space_check(
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
tm_prev,
cmap[3],
font_size,
)
Expand All @@ -136,7 +136,7 @@ def crlf_space_check(
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
tm_prev,
cmap[3],
font_size,
)
Expand All @@ -155,7 +155,7 @@ def crlf_space_check(
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
tm_prev,
cmap[3],
font_size,
)
Expand All @@ -174,7 +174,7 @@ def crlf_space_check(
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
tm_prev,
cmap[3],
font_size,
)
Expand All @@ -187,6 +187,7 @@ def crlf_space_check(
text += " "
except Exception:
pass
tm_prev = tm_matrix.copy()
return text, output, tm_prev


Expand Down
Binary file added resources/inkscape-abc.pdf
Binary file not shown.
42 changes: 42 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,45 @@ def test_multi_language(visitor_text):
assert "حَبيبي" in reader.pages[2].extract_text(visitor_text=visitor_text)
assert "habibi" in reader.pages[3].extract_text(visitor_text=visitor_text)
assert "حَبيبي" in reader.pages[3].extract_text(visitor_text=visitor_text)


@pytest.mark.parametrize(
    ("file_name", "constraints"),
    [
        (
            "inkscape-abc.pdf",
            {
                # In upper left
                "A": lambda x, y: 0 < x < 94 and 189 < y < 283,
                # In the center
                "B": lambda x, y: 94 < x < 189 and 94 < y < 189,
                # In lower right
                "C": lambda x, y: 189 < x < 283 and 0 < y < 94,
            },
        )
    ],
)
def test_visitor_text_matrices(file_name, constraints):
    """
    Verify the matrices that `extract_text` passes to `visitor_text`.

    Text is extracted from the first page of `file_name` with a visitor that
    records, for every chunk of text it receives, the x,y-position derived
    from the accompanying `cm` and `tm` matrices.

    `constraints` maps a line of text to a predicate taking `(x, y)`. Each
    line must be reported exactly once (compared after stripping whitespace)
    and its predicate must evaluate to `True` on the recorded position.
    """
    reader = PdfReader(RESOURCE_ROOT / file_name)

    seen = []

    def visitor_text(text, cm, tm, font_dict, font_size) -> None:
        # Only the two translation entries of the combined matrix are needed,
        # so they are written out by hand instead of multiplying in full.
        x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4]  # mult(tm, cm)[4]
        y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5]  # mult(tm, cm)[5]
        seen.append({"text": text, "x": x, "y": y})

    reader.pages[0].extract_text(visitor_text=visitor_text)

    for text, constraint in constraints.items():
        matches = [entry for entry in seen if entry["text"].strip() == text]
        # Exactly one recorded chunk may correspond to each expected line.
        assert len(matches) >= 1, f"No lines match {text}"
        assert len(matches) <= 1, f"Multiple lines match {text}"

        found = matches[0]
        x, y = found["x"], found["y"]
        assert constraint(x, y), f'Line "{text}" is wrong at x:{x}, y:{y}'

0 comments on commit 4458dc6

Please sign in to comment.