Skip to content

Commit

Permalink
BUG: invalid cm/tm in visitor functions (#2206)
Browse files Browse the repository at this point in the history
Reworks and is still valid to close #2059

Closes #2200
Closes #2075
  • Loading branch information
pubpub-zz authored Oct 8, 2023
1 parent 126f6be commit bcd85c4
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 45 deletions.
25 changes: 19 additions & 6 deletions docs/user/extract-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,27 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.

The function provided in argument visitor_text of function extract_text has five arguments:
text, current transformation matrix, text matrix, font-dictionary and font-size.
In most cases the x and y coordinates of the current position
are in index 4 and 5 of the current transformation matrix.
* text: the current text (as long as possible, can be up to a full line)
* user_matrix: current matrix to move from user coordinate space (also known as CTM)
* tm_matrix: current matrix from text coordinate space
* font-dictionary: full font dictionary
* font-size: the size (in text coordinate space)

The matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical)
It is recommended to use the user_matrix as it takes into account all transformations.

Notes :

- as indicated in the PDF 1.7 reference, page 204 the user matrix applies to text space/image space/form space/pattern space.
- if you want to get the full transformation from text to user space, you can use the `mult` function (available in global import) as follows:
`txt2user = mult(tm, cm)`
The font-size is the raw text size, that is affected by the `user_matrix`


The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".

**Caveat**: In complicated documents the calculated positions might be wrong.
**Caveat**: In complicated documents the calculated positions may be difficult to determine (if you move from multiple forms to page user space for example).

The function provided in argument visitor_operand_before has four arguments:
operator, operand-arguments, current transformation matrix and text matrix.
Expand All @@ -53,7 +66,7 @@ parts = []


def visitor_body(text, cm, tm, font_dict, font_size):
y = tm[5]
y = cm[5]
if y > 50 and y < 720:
parts.append(text)

Expand Down Expand Up @@ -88,7 +101,7 @@ def visitor_svg_rect(op, args, cm, tm):


def visitor_svg_text(text, cm, tm, fontDict, fontSize):
(x, y) = (tm[4], tm[5])
(x, y) = (cm[4], cm[5])
dwg.add(dwg.text(text, insert=(x, y), fill="blue"))


Expand Down
3 changes: 2 additions & 1 deletion pypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ._crypt_providers import crypt_provider
from ._encryption import PasswordType
from ._merger import PdfFileMerger, PdfMerger
from ._page import PageObject, Transformation
from ._page import PageObject, Transformation, mult
from ._reader import DocumentInformation, PdfFileReader, PdfReader
from ._version import __version__
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
Expand All @@ -31,6 +31,7 @@
__all__ = [
"__version__",
"_debug_versions",
"mult",
"PageRange",
"PaperSize",
"DocumentInformation",
Expand Down
70 changes: 47 additions & 23 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1921,18 +1921,17 @@ def _extract_text(
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_stack = []
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [
1.0,
0.0,
0.0,
1.0,
0.0,
0.0,
] # will store previous tm_matrix

# cm_prev/tm_prev store the last modified matrices; these can be an intermediate position
cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

# memo_cm/tm will be used to store the position at the beginning of building the text
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
Expand All @@ -1943,9 +1942,9 @@ def current_spacewidth() -> float:
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text
nonlocal orientations, rtl_dir, visitor_text, output, text
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
Expand All @@ -1954,14 +1953,18 @@ def process_operation(operator: bytes, operands: List) -> None:
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# table 4.7 "Graphics state operators", page 219
# cm_matrix calculation is a reserved for the moment
elif operator == b"q":
Expand Down Expand Up @@ -1992,7 +1995,7 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
cm_matrix = mult(
[
Expand All @@ -2005,6 +2008,8 @@ def process_operation(operator: bytes, operands: List) -> None:
],
cm_matrix,
)
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
Expand All @@ -2016,8 +2021,10 @@ def process_operation(operator: bytes, operands: List) -> None:
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
try:
# charMapTuple: font_type, float(sp_width / 2), encoding,
# map_dict, font-dictionary
Expand Down Expand Up @@ -2088,17 +2095,19 @@ def process_operation(operator: bytes, operands: List) -> None:
try:
text, output, cm_prev, tm_prev = crlf_space_check(
text,
cm_prev,
tm_prev,
cm_matrix,
tm_matrix,
(cm_prev, tm_prev),
(cm_matrix, tm_matrix),
(memo_cm, memo_tm),
cmap,
orientations,
output,
font_size,
visitor_text,
current_spacewidth(),
)
if text == "":
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
except OrientationNotFoundError:
return None

Expand Down Expand Up @@ -2130,12 +2139,18 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
if visitor_text is not None:
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(
"\n",
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except IndexError:
pass
try:
Expand All @@ -2151,21 +2166,30 @@ def process_operation(operator: bytes, operands: List) -> None:
)
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(
text,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
__name__,
)
finally:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def extract_text(
Expand Down
31 changes: 19 additions & 12 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:

def crlf_space_check(
text: str,
cm_prev: List[float],
tm_prev: List[float],
cm_matrix: List[float],
tm_matrix: List[float],
cmtm_prev: Tuple[List[float], List[float]],
cmtm_matrix: Tuple[List[float], List[float]],
memo_cmtm: Tuple[List[float], List[float]],
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
Expand All @@ -100,13 +99,21 @@ def crlf_space_check(
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
cm_prev = cmtm_prev[0]
tm_prev = cmtm_prev[1]
cm_matrix = cmtm_matrix[0]
tm_matrix = cmtm_matrix[1]
memo_cm = memo_cmtm[0]
memo_tm = memo_cmtm[1]

m_prev = mult(tm_prev, cm_prev)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - m_prev[4]
delta_y = m[5] - m_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
cm_prev = m
if orientation not in orientations:
raise OrientationNotFoundError
try:
Expand All @@ -117,8 +124,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -136,8 +143,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -155,8 +162,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -174,8 +181,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand Down
78 changes: 78 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject():
p = writer.add_blank_page(100, 100)
p[NameObject("/Contents")] = writer._add_object(NullObject())
p.get_contents()


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor():
    """
    Check the matrices handed to the text visitor (see #2200).

    The last fragment whose text starts with "5425." must be reported with
    a text matrix translation within 0.1 of (323.5, 457.4).
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf",
        name="test_text_pos.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    positions = []

    def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
        # Indexes 4 and 5 of a matrix hold its translation part.
        if text.startswith("5425."):
            positions.append((tm[4], tm[5]))

    reader.pages[0].extract_text(visitor_text=visitor_body2)

    # Only the last matching fragment is checked; no match raises IndexError.
    pos_x, pos_y = positions[-1]
    assert abs(pos_x - 323.5) < 0.1
    assert abs(pos_y - 457.4) < 0.1


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor2():
    """
    Select text fragments by the position given to the visitor (see #2075).

    Fragments are kept when the x translation of their text matrix is within
    2 units of the requested level and the y translation lies strictly
    between 210 and 740. Two levels (x = 26 and x = 35) are checked against
    the expected lists of fragments.
    """
    reader = PdfReader(
        BytesIO(
            get_data_from_url(
                "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf",
                name="LegIndex-page6.pdf",
            )
        )
    )

    def collect_level(x_lvl):
        """Extract the page text and return the stripped fragments found at ``x_lvl``."""
        lst = []

        def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
            if abs(tm[4] - x_lvl) < 2 and 210 < tm[5] < 740:
                lst.append(text.strip(" \n"))

        reader.pages[0].extract_text(visitor_text=visitor_lvl)
        return lst

    expected_at_26 = [
        "ACUPUNCTURE BOARD",
        "ACUPUNCTURISTS AND ACUPUNCTURE",
        "ADMINISTRATIVE LAW AND PROCEDURE",
        "ADMINISTRATIVE LAW, OFFICE OF",
        "ADOPTION",
        "ADULT EDUCATION",
        "ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
    ]
    expected_at_35 = [
        "members, AB 1264",
        "assistants, acupuncture, AB 1264",
        "complaints, investigations, etc., AB 1264",
        "day, california acupuncture, HR 48",
        "massage services, asian, AB 1264",
        "supervising acupuncturists, AB 1264",
        "supportive acupuncture services, basic, AB 1264",
        "rules and regulations—",
        "professional assistants and employees: employment and compensation, AB 916",
        "adults, adoption of, AB 1756",
        "agencies, organizations, etc.: requirements, prohibitions, etc., SB 807",
        "assistance programs, adoption: nonminor dependents, SB 9",
        "birth certificates, AB 1302",
        "contact agreements, postadoption—",
        "facilitators, adoption, AB 120",
        "failed adoptions: reproductive loss leave, SB 848",
        "hearings, adoption finalization: remote proceedings, technology, etc., SB 21",
        "native american tribes, AB 120",
        "parental rights, reinstatement of, AB 20",
        "parents, prospective adoptive: criminal background checks, SB 824",
        "services, adult educational, SB 877",
        "week, adult education, ACR 31",
        "alcoholic beverages: tied-house restrictions, AB 546",
        "campaign re social equity, civil rights, etc., SB 447",
        "cannabis, AB 794",
        "elections. See ELECTIONS.",
        "false, misleading, etc., advertising—",
        "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683",
        "housing rental properties advertised rates: disclosures, SB 611",
    ]

    assert collect_level(26) == expected_at_26
    assert collect_level(35) == expected_at_35
Loading

0 comments on commit bcd85c4

Please sign in to comment.