Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: invalid cm/tm in visitor functions #2206

Merged
merged 15 commits into from
Oct 8, 2023
Merged
46 changes: 31 additions & 15 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1946,7 +1946,10 @@
1.0,
0.0,
0.0,
] # will store previous tm_matrix
] # will store cm_matrix * tm_matrix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the old comment was more correct, right? This change was probably a side effect of the revert.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct we store inhere only tm_matrix

# memo_cm/tm will be used to store the position at the beginning of building the text
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
Expand All @@ -1957,9 +1960,9 @@
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text
nonlocal orientations, rtl_dir, visitor_text, output, text
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
Expand All @@ -1968,14 +1971,18 @@
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# table 4.7 "Graphics state operators", page 219
# cm_matrix calculation is a reserved for the moment
elif operator == b"q":
Expand Down Expand Up @@ -2006,8 +2013,10 @@
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
cm_matrix = mult(
[
float(operands[0]),
Expand All @@ -2030,8 +2039,10 @@
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
try:
# charMapTuple: font_type, float(sp_width / 2), encoding,
# map_dict, font-dictionary
Expand Down Expand Up @@ -2102,17 +2113,19 @@
try:
text, output, cm_prev, tm_prev = crlf_space_check(
text,
cm_prev,
tm_prev,
cm_matrix,
tm_matrix,
(cm_prev, tm_prev),
(cm_matrix, tm_matrix),
(memo_cm, memo_tm),
cmap,
orientations,
output,
font_size,
visitor_text,
current_spacewidth(),
)
if text == "":
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
except OrientationNotFoundError:
return None

Expand Down Expand Up @@ -2144,12 +2157,12 @@
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
if visitor_text is not None:
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text("\n", memo_cm, memo_tm, cmap[3], font_size)

Check warning on line 2165 in pypdf/_page.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_page.py#L2165

Added line #L2165 was not covered by tests
except IndexError:
pass
try:
Expand All @@ -2165,21 +2178,24 @@
)
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)

Check warning on line 2181 in pypdf/_page.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_page.py#L2181

Added line #L2181 was not covered by tests
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
__name__,
)
finally:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)

Check warning on line 2198 in pypdf/_page.py

View check run for this annotation

Codecov / codecov/patch

pypdf/_page.py#L2198

Added line #L2198 was not covered by tests
return output

def extract_text(
Expand Down
31 changes: 19 additions & 12 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:

def crlf_space_check(
text: str,
cm_prev: List[float],
tm_prev: List[float],
cm_matrix: List[float],
tm_matrix: List[float],
cmtm_prev: Tuple[List[float], List[float]],
cmtm_matrix: Tuple[List[float], List[float]],
memo_cmtm: Tuple[List[float], List[float]],
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
Expand All @@ -100,13 +99,21 @@ def crlf_space_check(
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
cm_prev = cmtm_prev[0]
tm_prev = cmtm_prev[1]
cm_matrix = cmtm_matrix[0]
tm_matrix = cmtm_matrix[1]
memo_cm = memo_cmtm[0]
memo_tm = memo_cmtm[1]

m_prev = mult(tm_prev, cm_prev)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - m_prev[4]
delta_y = m[5] - m_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
cm_prev = m
Copy link
Contributor

@troethe troethe Sep 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look quite right. I think this line can however be safely removed, because cm_prev doesn't get accessed until it is being assigned to again at the end of this function (i.e. this is currently a noop).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree.
Good point to have noticed it . Actually I've forgot to pass the multiplied cm_matrix to the visitor

if orientation not in orientations:
raise OrientationNotFoundError
try:
Expand All @@ -117,8 +124,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -136,8 +143,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -155,8 +162,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -174,8 +181,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand Down
78 changes: 78 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject():
p = writer.add_blank_page(100, 100)
p[NameObject("/Contents")] = writer._add_object(NullObject())
p.get_contents()


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor():
"""See #2200"""
url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf"
name = "test_text_pos.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
p = ()

def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
nonlocal p
if text.startswith("5425."):
p = (tm[4], tm[5])

reader.pages[0].extract_text(visitor_text=visitor_body2)
assert abs(p[0] - 323.5) < 0.1
assert abs(p[1] - 457.4) < 0.1


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor2():
"""See #2075"""
url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf"
name = "LegIndex-page6.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
x_lvl = 26
lst = []

def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
nonlocal x_lvl, lst
if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210:
lst.append(text.strip(" \n"))

reader.pages[0].extract_text(visitor_text=visitor_lvl)
assert lst == [
"ACUPUNCTURE BOARD",
"ACUPUNCTURISTS AND ACUPUNCTURE",
"ADMINISTRATIVE LAW AND PROCEDURE",
"ADMINISTRATIVE LAW, OFFICE OF",
"ADOPTION",
"ADULT EDUCATION",
"ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
]
x_lvl = 35
lst = []
reader.pages[0].extract_text(visitor_text=visitor_lvl)
assert lst == [
"members, AB 1264",
"assistants, acupuncture, AB 1264",
"complaints, investigations, etc., AB 1264",
"day, california acupuncture, HR 48",
"massage services, asian, AB 1264",
"supervising acupuncturists, AB 1264",
"supportive acupuncture services, basic, AB 1264",
"rules and regulations—",
"professional assistants and employees: employment and compensation, AB 916",
"adults, adoption of, AB 1756",
"agencies, organizations, etc.: requirements, prohibitions, etc., SB 807",
"assistance programs, adoption: nonminor dependents, SB 9",
"birth certificates, AB 1302",
"contact agreements, postadoption—",
"facilitators, adoption, AB 120",
"failed adoptions: reproductive loss leave, SB 848",
"hearings, adoption finalization: remote proceedings, technology, etc., SB 21",
"native american tribes, AB 120",
"parental rights, reinstatement of, AB 20",
"parents, prospective adoptive: criminal background checks, SB 824",
"services, adult educational, SB 877",
"week, adult education, ACR 31",
"alcoholic beverages: tied-house restrictions, AB 546",
"campaign re social equity, civil rights, etc., SB 447",
"cannabis, AB 794",
"elections. See ELECTIONS.",
"false, misleading, etc., advertising—",
"hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683",
"housing rental properties advertised rates: disclosures, SB 611",
]
Loading