From fbc0dc7b236639726123891babde1b3db799303c Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 18 Sep 2023 21:40:34 +0200 Subject: [PATCH 01/11] Revert "BUG: Fix incorrect tm_matrix in call to visitor_text (#2060)" This reverts commit 4458dc60204c388185db119147d946c280b80a9b. --- pypdf/_page.py | 2 +- pypdf/_text_extraction/__init__.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 55054c47b..7e987e853 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1946,7 +1946,7 @@ def _extract_text( 1.0, 0.0, 0.0, - ] # will store previous tm_matrix + ] # will store cm_matrix * tm_matrix char_scale = 1.0 space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index aa262dd5a..7dbc9666d 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -103,10 +103,11 @@ def crlf_space_check( m_prev = mult(tm_prev, cm_prev) m = mult(tm_matrix, cm_matrix) orientation = orient(m) - delta_x = m[4] - m_prev[4] - delta_y = m[5] - m_prev[5] + delta_x = m[4] - tm_prev[4] + delta_y = m[5] - tm_prev[5] k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) f = font_size * k + tm_prev = m if orientation not in orientations: raise OrientationNotFoundError try: @@ -117,8 +118,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + cm_matrix, + tm_matrix, cmap[3], font_size, ) @@ -136,8 +137,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + cm_matrix, + tm_matrix, cmap[3], font_size, ) @@ -155,8 +156,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_prev, - tm_prev, + cm_matrix, + tm_matrix, cmap[3], font_size, ) @@ -174,8 +175,8 @@ def crlf_space_check( if visitor_text is not None: 
visitor_text( text + "\n", - cm_prev, - tm_prev, + cm_matrix, + tm_matrix, cmap[3], font_size, ) From 5d9226a711365d9a9f773d819eb919afb5cfa9d3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 19 Sep 2023 23:21:33 +0200 Subject: [PATCH 02/11] BUG: invalid cm/tm in visitor functions --- pypdf/_page.py | 39 ++++++++++++++++++++++-------- pypdf/_text_extraction/__init__.py | 26 +++++++++++--------- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 7e987e853..378472667 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1947,6 +1947,9 @@ def _extract_text( 0.0, 0.0, ] # will store cm_matrix * tm_matrix + # memo_cm/tm will be used to store the position at the beginning of building the text + memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] char_scale = 1.0 space_scale = 1.0 _space_width: float = 500.0 # will be set correctly at first Tf @@ -1957,9 +1960,9 @@ def current_spacewidth() -> float: return _space_width / 1000.0 def process_operation(operator: bytes, operands: List) -> None: - nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text + nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text + nonlocal orientations, rtl_dir, visitor_text, output, text global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS check_crlf_space: bool = False @@ -1968,14 +1971,18 @@ def process_operation(operator: bytes, operands: List) -> None: tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() return None elif operator == b"ET": output += text if 
visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() # table 4.7 "Graphics state operators", page 219 # cm_matrix calculation is a reserved for the moment elif operator == b"q": @@ -2006,8 +2013,10 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"cm": output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() cm_matrix = mult( [ float(operands[0]), @@ -2030,8 +2039,10 @@ def process_operation(operator: bytes, operands: List) -> None: if text != "": output += text # .translate(cmap) if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() try: # charMapTuple: font_type, float(sp_width / 2), encoding, # map_dict, font-dictionary @@ -2106,6 +2117,8 @@ def process_operation(operator: bytes, operands: List) -> None: tm_prev, cm_matrix, tm_matrix, + memo_cm, + memo_tm, cmap, orientations, output, @@ -2113,6 +2126,9 @@ def process_operation(operator: bytes, operands: List) -> None: visitor_text, current_spacewidth(), ) + if text == "": + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() except OrientationNotFoundError: return None @@ -2144,12 +2160,12 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"Do": output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" if visitor_text is not None: - visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size) + 
visitor_text("\n", memo_cm, memo_tm, cmap[3], font_size) except IndexError: pass try: @@ -2165,7 +2181,7 @@ def process_operation(operator: bytes, operands: List) -> None: ) output += text if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -2173,13 +2189,16 @@ def process_operation(operator: bytes, operands: List) -> None: ) finally: text = "" + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() + else: process_operation(operator, operands) if visitor_operand_after is not None: visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of if text != "" and visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) return output def extract_text( diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 7dbc9666d..d1127327e 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -85,12 +85,14 @@ def orient(m: List[float]) -> int: return 270 -def crlf_space_check( +def crlf_space_check( # noqa: PLR0913 text: str, cm_prev: List[float], tm_prev: List[float], cm_matrix: List[float], tm_matrix: List[float], + memo_cm: List[float], + memo_tm: List[float], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], @@ -103,11 +105,11 @@ def crlf_space_check( m_prev = mult(tm_prev, cm_prev) m = mult(tm_matrix, cm_matrix) orientation = orient(m) - delta_x = m[4] - tm_prev[4] - delta_y = m[5] - tm_prev[5] + delta_x = m[4] - m_prev[4] + delta_y = m[5] - m_prev[5] k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) f = font_size * k - tm_prev = m + cm_prev = m if orientation not in orientations: raise OrientationNotFoundError try: @@ -118,8 +120,8 @@ def 
crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_matrix, - tm_matrix, + memo_cm, + memo_tm, cmap[3], font_size, ) @@ -137,8 +139,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_matrix, - tm_matrix, + memo_cm, + memo_tm, cmap[3], font_size, ) @@ -156,8 +158,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_matrix, - tm_matrix, + memo_cm, + memo_tm, cmap[3], font_size, ) @@ -175,8 +177,8 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - cm_matrix, - tm_matrix, + memo_cm, + memo_tm, cmap[3], font_size, ) From 4e9cf24b9867cba2e9eda1cb8ec9e563dfe60ce1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 20 Sep 2023 19:43:58 +0200 Subject: [PATCH 03/11] add Test --- tests/test_page.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_page.py b/tests/test_page.py index 1d6c49443..6f7e56e77 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1288,3 +1288,22 @@ def test_get_contents_from_nullobject(): p = writer.add_blank_page(100, 100) p[NameObject("/Contents")] = writer._add_object(NullObject()) p.get_contents() + + +@pytest.mark.enable_socket() +def test_pos_text_in_textvisitor(): + url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf" + name = "test_text_pos.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + p = () + + def visitor_body2(text, cm, tm, fontdict, fontsize) -> None: + nonlocal p + if text.startswith("5425."): + p = (tm[4], tm[5]) + + reader.pages[0].extract_text(visitor_text=visitor_body2) + assert p[0] > 323.5 - 0.1 + assert p[0] < 323.5 + 0.1 + assert p[1] > 457.4 - 0.1 + assert p[1] < 457.4 + 0.1 From 37485df15d1ea7dc2e3ecab3369a931c02894090 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 20 Sep 2023 21:20:37 +0200 Subject: [PATCH 04/11] complete 
test for #2075 --- pypdf/_page.py | 9 ++-- pypdf/_text_extraction/__init__.py | 18 ++++---- tests/test_page.py | 67 ++++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 17 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 378472667..d9b46d717 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2113,12 +2113,9 @@ def process_operation(operator: bytes, operands: List) -> None: try: text, output, cm_prev, tm_prev = crlf_space_check( text, - cm_prev, - tm_prev, - cm_matrix, - tm_matrix, - memo_cm, - memo_tm, + (cm_prev, tm_prev), + (cm_matrix, tm_matrix), + (memo_cm, memo_tm), cmap, orientations, output, diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index d1127327e..ea8adf56c 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -85,14 +85,11 @@ def orient(m: List[float]) -> int: return 270 -def crlf_space_check( # noqa: PLR0913 +def crlf_space_check( text: str, - cm_prev: List[float], - tm_prev: List[float], - cm_matrix: List[float], - tm_matrix: List[float], - memo_cm: List[float], - memo_tm: List[float], + cmtm_prev: Tuple[List[float], List[float]], + cmtm_matrix: Tuple[List[float], List[float]], + memo_cmtm: Tuple[List[float], List[float]], cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] ], @@ -102,6 +99,13 @@ def crlf_space_check( # noqa: PLR0913 visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], spacewidth: float, ) -> Tuple[str, str, List[float], List[float]]: + cm_prev = cmtm_prev[0] + tm_prev = cmtm_prev[1] + cm_matrix = cmtm_matrix[0] + tm_matrix = cmtm_matrix[1] + memo_cm = memo_cmtm[0] + memo_tm = memo_cmtm[1] + m_prev = mult(tm_prev, cm_prev) m = mult(tm_matrix, cm_matrix) orientation = orient(m) diff --git a/tests/test_page.py b/tests/test_page.py index 6f7e56e77..7368291a2 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1292,6 +1292,7 @@ def test_get_contents_from_nullobject(): 
@pytest.mark.enable_socket() def test_pos_text_in_textvisitor(): + """See #2200""" url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf" name = "test_text_pos.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) @@ -1303,7 +1304,65 @@ def visitor_body2(text, cm, tm, fontdict, fontsize) -> None: p = (tm[4], tm[5]) reader.pages[0].extract_text(visitor_text=visitor_body2) - assert p[0] > 323.5 - 0.1 - assert p[0] < 323.5 + 0.1 - assert p[1] > 457.4 - 0.1 - assert p[1] < 457.4 + 0.1 + assert abs(p[0] - 323.5) < 0.1 + assert abs(p[1] - 457.4) < 0.1 + + +@pytest.mark.enable_socket() +def test_pos_text_in_textvisitor2(): + """See #2075""" + url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf" + name = "LegIndex-page6.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + x_lvl = 26 + lst = [] + + def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None: + nonlocal x_lvl, lst + if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210: + lst.append(text.strip(" \n")) + + reader.pages[0].extract_text(visitor_text=visitor_lvl) + assert lst == [ + "ACUPUNCTURE BOARD", + "ACUPUNCTURISTS AND ACUPUNCTURE", + "ADMINISTRATIVE LAW AND PROCEDURE", + "ADMINISTRATIVE LAW, OFFICE OF", + "ADOPTION", + "ADULT EDUCATION", + "ADVERTISING. 
See also MARKETING; and particular subject matter (e.g.,", + ] + x_lvl = 35 + lst = [] + reader.pages[0].extract_text(visitor_text=visitor_lvl) + assert lst == [ + "members, AB 1264", + "assistants, acupuncture, AB 1264", + "complaints, investigations, etc., AB 1264", + "day, california acupuncture, HR 48", + "massage services, asian, AB 1264", + "supervising acupuncturists, AB 1264", + "supportive acupuncture services, basic, AB 1264", + "rules and regulations—", + "professional assistants and employees: employment and compensation, AB 916", + "adults, adoption of, AB 1756", + "agencies, organizations, etc.: requirements, prohibitions, etc., SB 807", + "assistance programs, adoption: nonminor dependents, SB 9", + "birth certificates, AB 1302", + "contact agreements, postadoption—", + "facilitators, adoption, AB 120", + "failed adoptions: reproductive loss leave, SB 848", + "hearings, adoption finalization: remote proceedings, technology, etc., SB 21", + "native american tribes, AB 120", + "parental rights, reinstatement of, AB 20", + "parents, prospective adoptive: criminal background checks, SB 824", + "services, adult educational, SB 877", + "week, adult education, ACR 31", + "alcoholic beverages: tied-house restrictions, AB 546", + "campaign re social equity, civil rights, etc., SB 447", + "cannabis, AB 794", + "elections. 
See ELECTIONS.", + "false, misleading, etc., advertising—", + "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683", + "housing rental properties advertised rates: disclosures, SB 611", + ] From a05ebc6f1ffb860f96da5fdbdc3a27a9bba03451 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:26:08 +0200 Subject: [PATCH 05/11] apply comments from review --- pypdf/_page.py | 52 ++++++++++++++++++++---------- pypdf/_text_extraction/__init__.py | 8 ++--- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index d9b46d717..aaa37de2a 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1935,18 +1935,14 @@ def _extract_text( # are strings where the byte->string encoding was unknown, so adding # them to the text here would be gibberish. - cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] cm_stack = [] tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_prev: List[float] = [ - 1.0, - 0.0, - 0.0, - 1.0, - 0.0, - 0.0, - ] # will store cm_matrix * tm_matrix + + # cm/tm_prev stores the last modified matrices can be an intermediate position + cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + # memo_cm/tm will be used to store the position at the beginning of building the text memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] @@ -1971,7 +1967,9 @@ def process_operation(operator: bytes, operands: List) -> None: tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] output += text if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + ) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -1979,7 +1977,9 @@ def process_operation(operator: bytes, operands: 
List) -> None: elif operator == b"ET": output += text if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + ) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -2013,7 +2013,9 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"cm": output += text if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + ) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -2039,7 +2041,9 @@ def process_operation(operator: bytes, operands: List) -> None: if text != "": output += text # .translate(cmap) if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + ) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -2157,12 +2161,20 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"Do": output += text if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + ) try: if output[-1] != "\n": output += "\n" if visitor_text is not None: - visitor_text("\n", memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + "\n", + mult(memo_cm, memo_tm), + memo_tm, + cmap[3], + font_size, + ) except IndexError: pass try: @@ -2178,7 +2190,13 @@ def process_operation(operator: bytes, operands: List) -> None: ) output += text if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text( + text, + mult(memo_cm, memo_tm), + memo_tm, + cmap[3], + font_size, + ) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -2195,7 +2213,7 @@ def process_operation(operator: bytes, operands: List) 
-> None: visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of if text != "" and visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) + visitor_text(text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size) return output def extract_text( diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index ea8adf56c..2872f802a 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -124,7 +124,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - memo_cm, + mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size, @@ -143,7 +143,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - memo_cm, + mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size, @@ -162,7 +162,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - memo_cm, + mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size, @@ -181,7 +181,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - memo_cm, + mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size, From 41259440e534541bad8daaa944b8918b93ed7eee Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:58:21 +0200 Subject: [PATCH 06/11] doc + generalize mult for cm_matrix --- docs/user/extract-text.md | 18 +++++++++++++----- pypdf/_page.py | 8 ++++++-- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 6e1d1c775..76477f0bf 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -27,9 +27,17 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment. 
The function provided in argument visitor_text of function extract_text has five arguments: -text, current transformation matrix, text matrix, font-dictionary and font-size. -In most cases the x and y coordinates of the current position -are in index 4 and 5 of the current transformation matrix. +* text : the current text (as long as possible, can be up to a full line) +* user_matrix: current matrix in user coordinate space +* tm_matrix: current matrix in text coordinate space +* font-dictionary: full font dictionary +* font-size: the size (in text coordinate space) + +the matrix stores 6 parameters. the 4 first provides the rotation/scaling matrix and the last two provides the translation (horizontal/vertical) +it is recommended to use the user_matrix as it takes into all transformations. +(note: the cm matrix which is provided is already the product of the matrices cm_matrix and tm_matrix described in the pdf specification) +the font-size is the raw text size, that is affected by the user_matrix + The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". 
@@ -53,7 +61,7 @@ parts = [] def visitor_body(text, cm, tm, font_dict, font_size): - y = tm[5] + y = cm[5] if y > 50 and y < 720: parts.append(text) @@ -88,7 +96,7 @@ def visitor_svg_rect(op, args, cm, tm): def visitor_svg_text(text, cm, tm, fontDict, fontSize): - (x, y) = (tm[4], tm[5]) + (x, y) = (cm[4], cm[5]) dwg.add(dwg.text(text, insert=(x, y), fill="blue")) diff --git a/pypdf/_page.py b/pypdf/_page.py index aaa37de2a..1b4fbc355 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -2135,7 +2135,9 @@ def process_operation(operator: bytes, operands: List) -> None: for operands, operator in content.operations: if visitor_operand_before is not None: - visitor_operand_before(operator, operands, cm_matrix, tm_matrix) + visitor_operand_before( + operator, operands, mult(cm_matrix, tm_matrix), tm_matrix + ) # multiple operators are defined in here #### if operator == b"'": process_operation(b"T*", []) @@ -2210,7 +2212,9 @@ def process_operation(operator: bytes, operands: List) -> None: else: process_operation(operator, operands) if visitor_operand_after is not None: - visitor_operand_after(operator, operands, cm_matrix, tm_matrix) + visitor_operand_after( + operator, operands, mult(cm_matrix, tm_matrix), tm_matrix + ) output += text # just in case of if text != "" and visitor_text is not None: visitor_text(text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size) From 919d4d60069a8db38b26bcb48a6de8032c33a706 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 24 Sep 2023 21:07:40 +0200 Subject: [PATCH 07/11] inversion tm/cm --- pypdf/_page.py | 20 ++++++++++---------- pypdf/_text_extraction/__init__.py | 4 ++-- tests/test_text_extraction.py | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 25ff0292b..1b8cab63d 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1952,7 +1952,7 @@ def process_operation(operator: bytes, operands: List) -> None: output += text if 
visitor_text is not None: visitor_text( - text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size ) text = "" memo_cm = cm_matrix.copy() @@ -1962,7 +1962,7 @@ def process_operation(operator: bytes, operands: List) -> None: output += text if visitor_text is not None: visitor_text( - text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size ) text = "" memo_cm = cm_matrix.copy() @@ -1998,11 +1998,9 @@ def process_operation(operator: bytes, operands: List) -> None: output += text if visitor_text is not None: visitor_text( - text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size ) text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() cm_matrix = mult( [ float(operands[0]), @@ -2014,6 +2012,8 @@ def process_operation(operator: bytes, operands: List) -> None: ], cm_matrix, ) + memo_cm = cm_matrix.copy() + memo_tm = tm_matrix.copy() # Table 5.2 page 398 elif operator == b"Tz": char_scale = float(operands[0]) / 100.0 @@ -2026,7 +2026,7 @@ def process_operation(operator: bytes, operands: List) -> None: output += text # .translate(cmap) if visitor_text is not None: visitor_text( - text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size ) text = "" memo_cm = cm_matrix.copy() @@ -2148,7 +2148,7 @@ def process_operation(operator: bytes, operands: List) -> None: output += text if visitor_text is not None: visitor_text( - text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size + text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size ) try: if output[-1] != "\n": @@ -2156,7 +2156,7 @@ def process_operation(operator: bytes, operands: List) -> None: if visitor_text is not None: visitor_text( "\n", - mult(memo_cm, memo_tm), + mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size, @@ -2178,7 +2178,7 @@ def 
process_operation(operator: bytes, operands: List) -> None: if visitor_text is not None: visitor_text( text, - mult(memo_cm, memo_tm), + mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size, @@ -2201,7 +2201,7 @@ def process_operation(operator: bytes, operands: List) -> None: ) output += text # just in case of if text != "" and visitor_text is not None: - visitor_text(text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size) + visitor_text(text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size) return output def extract_text( diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 2872f802a..a7f14f8af 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -124,7 +124,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - mult(memo_cm, memo_tm), + mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size, @@ -143,7 +143,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - mult(memo_cm, memo_tm), + mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size, diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index de39c1ace..f2836ad55 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -82,8 +82,8 @@ def test_visitor_text_matrices(file_name, constraints): lines = [] def visitor_text(text, cm, tm, font_dict, font_size) -> None: - x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4] # mult(tm, cm)[4] - y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5] # mult(tm, cm)[5] + x = cm[4] # used to be tm[4] * cm[0] + tm[5] * cm[2] + cm[4] # mult(tm, cm)[4] + y = cm[5] # used to be tm[4] * cm[1] + tm[5] * cm[3] + cm[5] # mult(tm, cm)[5] lines.append({"text": text, "x": x, "y": y}) reader.pages[0].extract_text(visitor_text=visitor_text) From ff71b08b38bf6a20511e55e81befa5a8819ffed3 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 1 Oct 2023 10:44:10 +0200 Subject: [PATCH 
08/11] revert to tm/cm independent in visitors revert based on https://github.com/py-pdf/pypdf/discussions/2163#discussioncomment-7116034 --- docs/user/extract-text.md | 12 +++++++---- pypdf/__init__.py | 3 ++- pypdf/_page.py | 34 +++++++++--------------------- pypdf/_text_extraction/__init__.py | 8 +++---- tests/test_text_extraction.py | 9 +++++--- 5 files changed, 30 insertions(+), 36 deletions(-) diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 76477f0bf..8e04bfeeb 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -28,21 +28,25 @@ You can use visitor-functions to control which part of a page you want to proces The function provided in argument visitor_text of function extract_text has five arguments: * text : the current text (as long as possible, can be up to a full line) -* user_matrix: current matrix in user coordinate space -* tm_matrix: current matrix in text coordinate space +* user_matrix: current matrix to move from user coordinate space(aka. CTM) +* tm_matrix: current matrix from text coordinate space * font-dictionary: full font dictionary * font-size: the size (in text coordinate space) the matrix stores 6 parameters. the 4 first provides the rotation/scaling matrix and the last two provides the translation (horizontal/vertical) it is recommended to use the user_matrix as it takes into all transformations. -(note: the cm matrix which is provided is already the product of the matrices cm_matrix and tm_matrix described in the pdf specification) + +notes : + - as indicated in pdf 1.7 refeence, page 204 the user matrix applies to text space/image space/form space/pattern space. + - if you want to get the full transformation from text to user space, you can use the mult function (availalbe in global import) as follow : +`txt2user = mult(tm, cm))` the font-size is the raw text size, that is affected by the user_matrix The font-dictionary may be None in case of unknown fonts. If not None it may e.g. 
contain key "/BaseFont" with value "/Arial,Bold". -**Caveat**: In complicated documents the calculated positions might be wrong. +**Caveat**: In complicated documents the calculated positions may be difficult to (if you move from multiple forms to page user space for example). The function provided in argument visitor_operand_before has four arguments: operator, operand-arguments, current transformation matrix and text matrix. diff --git a/pypdf/__init__.py b/pypdf/__init__.py index 250c05564..df07b5306 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -10,7 +10,7 @@ from ._crypt_providers import crypt_provider from ._encryption import PasswordType from ._merger import PdfFileMerger, PdfMerger -from ._page import PageObject, Transformation +from ._page import PageObject, Transformation, mult from ._reader import DocumentInformation, PdfFileReader, PdfReader from ._version import __version__ from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter @@ -31,6 +31,7 @@ __all__ = [ "__version__", "_debug_versions", + "mult", "PageRange", "PaperSize", "DocumentInformation", diff --git a/pypdf/_page.py b/pypdf/_page.py index 1b8cab63d..b08facf98 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1951,9 +1951,7 @@ def process_operation(operator: bytes, operands: List) -> None: tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] output += text if visitor_text is not None: - visitor_text( - text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size - ) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -1961,9 +1959,7 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"ET": output += text if visitor_text is not None: - visitor_text( - text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size - ) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -1997,9 +1993,7 @@ def 
process_operation(operator: bytes, operands: List) -> None: elif operator == b"cm": output += text if visitor_text is not None: - visitor_text( - text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size - ) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" cm_matrix = mult( [ @@ -2025,9 +2019,7 @@ def process_operation(operator: bytes, operands: List) -> None: if text != "": output += text # .translate(cmap) if visitor_text is not None: - visitor_text( - text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size - ) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -2119,9 +2111,7 @@ def process_operation(operator: bytes, operands: List) -> None: for operands, operator in content.operations: if visitor_operand_before is not None: - visitor_operand_before( - operator, operands, mult(cm_matrix, tm_matrix), tm_matrix - ) + visitor_operand_before(operator, operands, cm_matrix, tm_matrix) # multiple operators are defined in here #### if operator == b"'": process_operation(b"T*", []) @@ -2147,16 +2137,14 @@ def process_operation(operator: bytes, operands: List) -> None: elif operator == b"Do": output += text if visitor_text is not None: - visitor_text( - text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size - ) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" if visitor_text is not None: visitor_text( "\n", - mult(memo_tm, memo_cm), + memo_cm, memo_tm, cmap[3], font_size, @@ -2178,7 +2166,7 @@ def process_operation(operator: bytes, operands: List) -> None: if visitor_text is not None: visitor_text( text, - mult(memo_tm, memo_cm), + memo_cm, memo_tm, cmap[3], font_size, @@ -2196,12 +2184,10 @@ def process_operation(operator: bytes, operands: List) -> None: else: process_operation(operator, operands) if visitor_operand_after is not None: - visitor_operand_after( - operator, operands, mult(cm_matrix, tm_matrix), tm_matrix - 
) + visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of if text != "" and visitor_text is not None: - visitor_text(text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size) + visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) return output def extract_text( diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index a7f14f8af..ea8adf56c 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -124,7 +124,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - mult(memo_tm, memo_cm), + memo_cm, memo_tm, cmap[3], font_size, @@ -143,7 +143,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - mult(memo_tm, memo_cm), + memo_cm, memo_tm, cmap[3], font_size, @@ -162,7 +162,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - mult(memo_cm, memo_tm), + memo_cm, memo_tm, cmap[3], font_size, @@ -181,7 +181,7 @@ def crlf_space_check( if visitor_text is not None: visitor_text( text + "\n", - mult(memo_cm, memo_tm), + memo_cm, memo_tm, cmap[3], font_size, diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index f2836ad55..790ce6cf6 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -7,7 +7,7 @@ import pytest -from pypdf import PdfReader +from pypdf import PdfReader, mult from pypdf._text_extraction import set_custom_rtl TESTS_ROOT = Path(__file__).parent.resolve() @@ -82,8 +82,11 @@ def test_visitor_text_matrices(file_name, constraints): lines = [] def visitor_text(text, cm, tm, font_dict, font_size) -> None: - x = cm[4] # used to be tm[4] * cm[0] + tm[5] * cm[2] + cm[4] # mult(tm, cm)[4] - y = cm[5] # used to be tm[4] * cm[1] + tm[5] * cm[3] + cm[5] # mult(tm, cm)[5] + ctm = mult(tm, cm) + x = ctm[4] # used to tm[4] * cm[0] + tm[5] * cm[2] + cm[4] # mult(tm, cm)[4] + y = ctm[ + 5 + ] # used to be tm[4] * cm[1] + 
tm[5] * cm[3] + cm[5] # mult(tm, cm)[5] lines.append({"text": text, "x": x, "y": y}) reader.pages[0].extract_text(visitor_text=visitor_text) From 1153081d86b5a25e9e898ba276860fdc208e0803 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Oct 2023 09:18:43 +0200 Subject: [PATCH 09/11] Update docs/user/extract-text.md Co-authored-by: Martin Thoma --- docs/user/extract-text.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 8e04bfeeb..a2a96c2b6 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -28,7 +28,7 @@ You can use visitor-functions to control which part of a page you want to proces The function provided in argument visitor_text of function extract_text has five arguments: * text : the current text (as long as possible, can be up to a full line) -* user_matrix: current matrix to move from user coordinate space(aka. CTM) +* user_matrix: current matrix to move from user coordinate space (also known as CTM) * tm_matrix: current matrix from text coordinate space * font-dictionary: full font dictionary * font-size: the size (in text coordinate space) From fb5377f89b51afb0fdebaa47836fd1be0003afef Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sun, 8 Oct 2023 09:19:01 +0200 Subject: [PATCH 10/11] Update docs/user/extract-text.md Co-authored-by: Martin Thoma --- docs/user/extract-text.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index a2a96c2b6..b23201943 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -27,7 +27,7 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra You can use visitor-functions to control which part of a page you want to process and extract. 
The visitor-functions you provide will get called for each operator or for each text fragment. The function provided in argument visitor_text of function extract_text has five arguments: -* text : the current text (as long as possible, can be up to a full line) +* text: the current text (as long as possible, can be up to a full line) * user_matrix: current matrix to move from user coordinate space (also known as CTM) * tm_matrix: current matrix from text coordinate space * font-dictionary: full font dictionary From da0a7ebc7b8960289ea3b8a6b6fc247cf467c59d Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 8 Oct 2023 09:22:47 +0200 Subject: [PATCH 11/11] Just some typos/stylisitic changes --- docs/user/extract-text.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index b23201943..649f723f6 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -33,14 +33,15 @@ The function provided in argument visitor_text of function extract_text has five * font-dictionary: full font dictionary * font-size: the size (in text coordinate space) -the matrix stores 6 parameters. the 4 first provides the rotation/scaling matrix and the last two provides the translation (horizontal/vertical) -it is recommended to use the user_matrix as it takes into all transformations. +The matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical). +It is recommended to use the user_matrix as it takes into account all transformations. -notes : - - as indicated in pdf 1.7 refeence, page 204 the user matrix applies to text space/image space/form space/pattern space.
- if you want to get the full transformation from text to user space, you can use the mult function (availalbe in global import) as follow : +Notes: + + - as indicated in the PDF 1.7 reference, page 204, the user matrix applies to text space/image space/form space/pattern space. + - if you want to get the full transformation from text to user space, you can use the `mult` function (available in global import) as follows: `txt2user = mult(tm, cm)` -the font-size is the raw text size, that is affected by the user_matrix +The font-size is the raw text size, that is affected by the `user_matrix` The font-dictionary may be None in case of unknown fonts.