From fbc0dc7b236639726123891babde1b3db799303c Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Mon, 18 Sep 2023 21:40:34 +0200
Subject: [PATCH 01/11] Revert "BUG: Fix incorrect tm_matrix in call to
 visitor_text (#2060)"

This reverts commit 4458dc60204c388185db119147d946c280b80a9b.
---
 pypdf/_page.py                     |  2 +-
 pypdf/_text_extraction/__init__.py | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 55054c47b..7e987e853 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1946,7 +1946,7 @@ def _extract_text(
             1.0,
             0.0,
             0.0,
-        ]  # will store previous tm_matrix
+        ]  # will store cm_matrix * tm_matrix
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index aa262dd5a..7dbc9666d 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -103,10 +103,11 @@ def crlf_space_check(
     m_prev = mult(tm_prev, cm_prev)
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
-    delta_x = m[4] - m_prev[4]
-    delta_y = m[5] - m_prev[5]
+    delta_x = m[4] - tm_prev[4]
+    delta_y = m[5] - tm_prev[5]
     k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
     f = font_size * k
+    tm_prev = m
     if orientation not in orientations:
         raise OrientationNotFoundError
     try:
@@ -117,8 +118,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            cm_matrix,
+                            tm_matrix,
                             cmap[3],
                             font_size,
                         )
@@ -136,8 +137,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            cm_matrix,
+                            tm_matrix,
                             cmap[3],
                             font_size,
                         )
@@ -155,8 +156,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            cm_matrix,
+                            tm_matrix,
                             cmap[3],
                             font_size,
                         )
@@ -174,8 +175,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_prev,
-                            tm_prev,
+                            cm_matrix,
+                            tm_matrix,
                             cmap[3],
                             font_size,
                         )

From 5d9226a711365d9a9f773d819eb919afb5cfa9d3 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Tue, 19 Sep 2023 23:21:33 +0200
Subject: [PATCH 02/11] BUG: invalid cm/tm in visitor functions

---
 pypdf/_page.py                     | 39 ++++++++++++++++++++++--------
 pypdf/_text_extraction/__init__.py | 26 +++++++++++---------
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 7e987e853..378472667 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1947,6 +1947,9 @@ def _extract_text(
             0.0,
             0.0,
         ]  # will store cm_matrix * tm_matrix
+        # memo_cm/tm will be used to store the position at the beginning of building the text
+        memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         char_scale = 1.0
         space_scale = 1.0
         _space_width: float = 500.0  # will be set correctly at first Tf
@@ -1957,9 +1960,9 @@ def current_spacewidth() -> float:
             return _space_width / 1000.0
 
         def process_operation(operator: bytes, operands: List) -> None:
-            nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
+            nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
             nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
-            nonlocal orientations, rtl_dir, visitor_text
+            nonlocal orientations, rtl_dir, visitor_text, output, text
             global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
 
             check_crlf_space: bool = False
@@ -1968,14 +1971,18 @@ def process_operation(operator: bytes, operands: List) -> None:
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
                 return None
             elif operator == b"ET":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
             # table 4.7 "Graphics state operators", page 219
             # cm_matrix calculation is a reserved for the moment
             elif operator == b"q":
@@ -2006,8 +2013,10 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"cm":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
                 cm_matrix = mult(
                     [
                         float(operands[0]),
@@ -2030,8 +2039,10 @@ def process_operation(operator: bytes, operands: List) -> None:
                 if text != "":
                     output += text  # .translate(cmap)
                     if visitor_text is not None:
-                        visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
                 try:
                     # charMapTuple: font_type, float(sp_width / 2), encoding,
                     #               map_dict, font-dictionary
@@ -2106,6 +2117,8 @@ def process_operation(operator: bytes, operands: List) -> None:
                         tm_prev,
                         cm_matrix,
                         tm_matrix,
+                        memo_cm,
+                        memo_tm,
                         cmap,
                         orientations,
                         output,
@@ -2113,6 +2126,9 @@ def process_operation(operator: bytes, operands: List) -> None:
                         visitor_text,
                         current_spacewidth(),
                     )
+                    if text == "":
+                        memo_cm = cm_matrix.copy()
+                        memo_tm = tm_matrix.copy()
                 except OrientationNotFoundError:
                     return None
 
@@ -2144,12 +2160,12 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"Do":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 try:
                     if output[-1] != "\n":
                         output += "\n"
                         if visitor_text is not None:
-                            visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
+                            visitor_text("\n", memo_cm, memo_tm, cmap[3], font_size)
                 except IndexError:
                     pass
                 try:
@@ -2165,7 +2181,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                         )
                         output += text
                         if visitor_text is not None:
-                            visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+                            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 except Exception:
                     logger_warning(
                         f" impossible to decode XFormObject {operands[0]}",
@@ -2173,13 +2189,16 @@ def process_operation(operator: bytes, operands: List) -> None:
                     )
                 finally:
                     text = ""
+                    memo_cm = cm_matrix.copy()
+                    memo_tm = tm_matrix.copy()
+
             else:
                 process_operation(operator, operands)
             if visitor_operand_after is not None:
                 visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
         output += text  # just in case of
         if text != "" and visitor_text is not None:
-            visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
+            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
         return output
 
     def extract_text(
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 7dbc9666d..d1127327e 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -85,12 +85,14 @@ def orient(m: List[float]) -> int:
         return 270
 
 
-def crlf_space_check(
+def crlf_space_check(  # noqa: PLR0913
     text: str,
     cm_prev: List[float],
     tm_prev: List[float],
     cm_matrix: List[float],
     tm_matrix: List[float],
+    memo_cm: List[float],
+    memo_tm: List[float],
     cmap: Tuple[
         Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
     ],
@@ -103,11 +105,11 @@ def crlf_space_check(
     m_prev = mult(tm_prev, cm_prev)
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
-    delta_x = m[4] - tm_prev[4]
-    delta_y = m[5] - tm_prev[5]
+    delta_x = m[4] - m_prev[4]
+    delta_y = m[5] - m_prev[5]
     k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
     f = font_size * k
-    tm_prev = m
+    cm_prev = m
     if orientation not in orientations:
         raise OrientationNotFoundError
     try:
@@ -118,8 +120,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
-                            tm_matrix,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )
@@ -137,8 +139,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
-                            tm_matrix,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )
@@ -156,8 +158,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
-                            tm_matrix,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )
@@ -175,8 +177,8 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            cm_matrix,
-                            tm_matrix,
+                            memo_cm,
+                            memo_tm,
                             cmap[3],
                             font_size,
                         )

From 4e9cf24b9867cba2e9eda1cb8ec9e563dfe60ce1 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 20 Sep 2023 19:43:58 +0200
Subject: [PATCH 03/11] add Test

---
 tests/test_page.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_page.py b/tests/test_page.py
index 1d6c49443..6f7e56e77 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1288,3 +1288,22 @@ def test_get_contents_from_nullobject():
     p = writer.add_blank_page(100, 100)
     p[NameObject("/Contents")] = writer._add_object(NullObject())
     p.get_contents()
+
+
+@pytest.mark.enable_socket()
+def test_pos_text_in_textvisitor():
+    url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf"
+    name = "test_text_pos.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    p = ()
+
+    def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
+        nonlocal p
+        if text.startswith("5425."):
+            p = (tm[4], tm[5])
+
+    reader.pages[0].extract_text(visitor_text=visitor_body2)
+    assert p[0] > 323.5 - 0.1
+    assert p[0] < 323.5 + 0.1
+    assert p[1] > 457.4 - 0.1
+    assert p[1] < 457.4 + 0.1

From 37485df15d1ea7dc2e3ecab3369a931c02894090 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Wed, 20 Sep 2023 21:20:37 +0200
Subject: [PATCH 04/11] complete test

for #2075
---
 pypdf/_page.py                     |  9 ++--
 pypdf/_text_extraction/__init__.py | 18 ++++----
 tests/test_page.py                 | 67 ++++++++++++++++++++++++++++--
 3 files changed, 77 insertions(+), 17 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 378472667..d9b46d717 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2113,12 +2113,9 @@ def process_operation(operator: bytes, operands: List) -> None:
                 try:
                     text, output, cm_prev, tm_prev = crlf_space_check(
                         text,
-                        cm_prev,
-                        tm_prev,
-                        cm_matrix,
-                        tm_matrix,
-                        memo_cm,
-                        memo_tm,
+                        (cm_prev, tm_prev),
+                        (cm_matrix, tm_matrix),
+                        (memo_cm, memo_tm),
                         cmap,
                         orientations,
                         output,
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index d1127327e..ea8adf56c 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -85,14 +85,11 @@ def orient(m: List[float]) -> int:
         return 270
 
 
-def crlf_space_check(  # noqa: PLR0913
+def crlf_space_check(
     text: str,
-    cm_prev: List[float],
-    tm_prev: List[float],
-    cm_matrix: List[float],
-    tm_matrix: List[float],
-    memo_cm: List[float],
-    memo_tm: List[float],
+    cmtm_prev: Tuple[List[float], List[float]],
+    cmtm_matrix: Tuple[List[float], List[float]],
+    memo_cmtm: Tuple[List[float], List[float]],
     cmap: Tuple[
         Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
     ],
@@ -102,6 +99,13 @@ def crlf_space_check(  # noqa: PLR0913
     visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
     spacewidth: float,
 ) -> Tuple[str, str, List[float], List[float]]:
+    cm_prev = cmtm_prev[0]
+    tm_prev = cmtm_prev[1]
+    cm_matrix = cmtm_matrix[0]
+    tm_matrix = cmtm_matrix[1]
+    memo_cm = memo_cmtm[0]
+    memo_tm = memo_cmtm[1]
+
     m_prev = mult(tm_prev, cm_prev)
     m = mult(tm_matrix, cm_matrix)
     orientation = orient(m)
diff --git a/tests/test_page.py b/tests/test_page.py
index 6f7e56e77..7368291a2 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -1292,6 +1292,7 @@ def test_get_contents_from_nullobject():
 
 @pytest.mark.enable_socket()
 def test_pos_text_in_textvisitor():
+    """See #2200"""
     url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf"
     name = "test_text_pos.pdf"
     reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
@@ -1303,7 +1304,65 @@ def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
             p = (tm[4], tm[5])
 
     reader.pages[0].extract_text(visitor_text=visitor_body2)
-    assert p[0] > 323.5 - 0.1
-    assert p[0] < 323.5 + 0.1
-    assert p[1] > 457.4 - 0.1
-    assert p[1] < 457.4 + 0.1
+    assert abs(p[0] - 323.5) < 0.1
+    assert abs(p[1] - 457.4) < 0.1
+
+
+@pytest.mark.enable_socket()
+def test_pos_text_in_textvisitor2():
+    """See #2075"""
+    url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf"
+    name = "LegIndex-page6.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    x_lvl = 26
+    lst = []
+
+    def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
+        nonlocal x_lvl, lst
+        if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210:
+            lst.append(text.strip(" \n"))
+
+    reader.pages[0].extract_text(visitor_text=visitor_lvl)
+    assert lst == [
+        "ACUPUNCTURE BOARD",
+        "ACUPUNCTURISTS AND ACUPUNCTURE",
+        "ADMINISTRATIVE LAW AND PROCEDURE",
+        "ADMINISTRATIVE LAW, OFFICE OF",
+        "ADOPTION",
+        "ADULT EDUCATION",
+        "ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
+    ]
+    x_lvl = 35
+    lst = []
+    reader.pages[0].extract_text(visitor_text=visitor_lvl)
+    assert lst == [
+        "members,  AB 1264",
+        "assistants, acupuncture,  AB 1264",
+        "complaints, investigations, etc.,  AB 1264",
+        "day, california acupuncture,  HR 48",
+        "massage services, asian,  AB 1264",
+        "supervising acupuncturists,  AB 1264",
+        "supportive acupuncture services, basic,  AB 1264",
+        "rules and regulations—",
+        "professional assistants and employees: employment and compensation,  AB 916",
+        "adults, adoption of,  AB 1756",
+        "agencies, organizations, etc.: requirements, prohibitions, etc.,  SB 807",
+        "assistance programs, adoption: nonminor dependents,  SB 9",
+        "birth certificates,  AB 1302",
+        "contact agreements, postadoption—",
+        "facilitators, adoption,  AB 120",
+        "failed adoptions: reproductive loss leave,  SB 848",
+        "hearings, adoption finalization: remote proceedings, technology, etc.,  SB 21",
+        "native american tribes,  AB 120",
+        "parental rights, reinstatement of,  AB 20",
+        "parents, prospective adoptive: criminal background checks,  SB 824",
+        "services, adult educational,  SB 877",
+        "week, adult education,  ACR 31",
+        "alcoholic beverages: tied-house restrictions,  AB 546",
+        "campaign re social equity, civil rights, etc.,  SB 447",
+        "cannabis,  AB 794",
+        "elections. See ELECTIONS.",
+        "false, misleading, etc., advertising—",
+        "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures,  SB 683",
+        "housing rental properties advertised rates: disclosures,  SB 611",
+    ]

From a05ebc6f1ffb860f96da5fdbdc3a27a9bba03451 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 21 Sep 2023 20:26:08 +0200
Subject: [PATCH 05/11] apply comments from review

---
 pypdf/_page.py                     | 52 ++++++++++++++++++++----------
 pypdf/_text_extraction/__init__.py |  8 ++---
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index d9b46d717..aaa37de2a 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1935,18 +1935,14 @@ def _extract_text(
         # are strings where the byte->string encoding was unknown, so adding
         # them to the text here would be gibberish.
 
-        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         cm_stack = []
         tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
-        tm_prev: List[float] = [
-            1.0,
-            0.0,
-            0.0,
-            1.0,
-            0.0,
-            0.0,
-        ]  # will store cm_matrix * tm_matrix
+
+        # cm_prev/tm_prev store the last modified matrices; this can be an intermediate position
+        cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+        tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
+
         # memo_cm/tm will be used to store the position at the beginning of building the text
         memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
         memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -1971,7 +1967,9 @@ def process_operation(operator: bytes, operands: List) -> None:
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                    visitor_text(
+                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                    )
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -1979,7 +1977,9 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"ET":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                    visitor_text(
+                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                    )
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -2013,7 +2013,9 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"cm":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                    visitor_text(
+                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                    )
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -2039,7 +2041,9 @@ def process_operation(operator: bytes, operands: List) -> None:
                 if text != "":
                     output += text  # .translate(cmap)
                     if visitor_text is not None:
-                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                        visitor_text(
+                            text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                        )
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -2157,12 +2161,20 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"Do":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                    visitor_text(
+                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                    )
                 try:
                     if output[-1] != "\n":
                         output += "\n"
                         if visitor_text is not None:
-                            visitor_text("\n", memo_cm, memo_tm, cmap[3], font_size)
+                            visitor_text(
+                                "\n",
+                                mult(memo_cm, memo_tm),
+                                memo_tm,
+                                cmap[3],
+                                font_size,
+                            )
                 except IndexError:
                     pass
                 try:
@@ -2178,7 +2190,13 @@ def process_operation(operator: bytes, operands: List) -> None:
                         )
                         output += text
                         if visitor_text is not None:
-                            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+                            visitor_text(
+                                text,
+                                mult(memo_cm, memo_tm),
+                                memo_tm,
+                                cmap[3],
+                                font_size,
+                            )
                 except Exception:
                     logger_warning(
                         f" impossible to decode XFormObject {operands[0]}",
@@ -2195,7 +2213,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
         output += text  # just in case of
         if text != "" and visitor_text is not None:
-            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
+            visitor_text(text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size)
         return output
 
     def extract_text(
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index ea8adf56c..2872f802a 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -124,7 +124,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            memo_cm,
+                            mult(memo_cm, memo_tm),
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -143,7 +143,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            memo_cm,
+                            mult(memo_cm, memo_tm),
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -162,7 +162,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            memo_cm,
+                            mult(memo_cm, memo_tm),
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -181,7 +181,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            memo_cm,
+                            mult(memo_cm, memo_tm),
                             memo_tm,
                             cmap[3],
                             font_size,

From 41259440e534541bad8daaa944b8918b93ed7eee Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Thu, 21 Sep 2023 20:58:21 +0200
Subject: [PATCH 06/11] doc + generalize mult for cm_matrix

---
 docs/user/extract-text.md | 18 +++++++++++++-----
 pypdf/_page.py            |  8 ++++++--
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
index 6e1d1c775..76477f0bf 100644
--- a/docs/user/extract-text.md
+++ b/docs/user/extract-text.md
@@ -27,9 +27,17 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
 You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.
 
 The function provided in argument visitor_text of function extract_text has five arguments:
-text, current transformation matrix, text matrix, font-dictionary and font-size.
-In most cases the x and y coordinates of the current position
-are in index 4 and 5 of the current transformation matrix.
+* text: the current text (as long as possible, can be up to a full line)
+* user_matrix: current matrix in user coordinate space
+* tm_matrix: current matrix in text coordinate space
+* font-dictionary: full font dictionary
+* font-size: the size (in text coordinate space)
+
+Each matrix stores 6 parameters. The first four provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical).
+It is recommended to use the user_matrix as it takes all transformations into account.
+(Note: the cm matrix that is provided is already the product of the matrices cm_matrix and tm_matrix described in the PDF specification.)
+The font-size is the raw text size, which is affected by the user_matrix.
+
 
 The font-dictionary may be None in case of unknown fonts.
 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
@@ -53,7 +61,7 @@ parts = []
 
 
 def visitor_body(text, cm, tm, font_dict, font_size):
-    y = tm[5]
+    y = cm[5]
     if y > 50 and y < 720:
         parts.append(text)
 
@@ -88,7 +96,7 @@ def visitor_svg_rect(op, args, cm, tm):
 
 
 def visitor_svg_text(text, cm, tm, fontDict, fontSize):
-    (x, y) = (tm[4], tm[5])
+    (x, y) = (cm[4], cm[5])
     dwg.add(dwg.text(text, insert=(x, y), fill="blue"))
 
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index aaa37de2a..1b4fbc355 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2135,7 +2135,9 @@ def process_operation(operator: bytes, operands: List) -> None:
 
         for operands, operator in content.operations:
             if visitor_operand_before is not None:
-                visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
+                visitor_operand_before(
+                    operator, operands, mult(cm_matrix, tm_matrix), tm_matrix
+                )
             # multiple operators are defined in here ####
             if operator == b"'":
                 process_operation(b"T*", [])
@@ -2210,7 +2212,9 @@ def process_operation(operator: bytes, operands: List) -> None:
             else:
                 process_operation(operator, operands)
             if visitor_operand_after is not None:
-                visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
+                visitor_operand_after(
+                    operator, operands, mult(cm_matrix, tm_matrix), tm_matrix
+                )
         output += text  # just in case of
         if text != "" and visitor_text is not None:
             visitor_text(text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size)

From 919d4d60069a8db38b26bcb48a6de8032c33a706 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 24 Sep 2023 21:07:40 +0200
Subject: [PATCH 07/11] inversion tm/cm

---
 pypdf/_page.py                     | 20 ++++++++++----------
 pypdf/_text_extraction/__init__.py |  4 ++--
 tests/test_text_extraction.py      |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/pypdf/_page.py b/pypdf/_page.py
index 25ff0292b..1b8cab63d 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1952,7 +1952,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 output += text
                 if visitor_text is not None:
                     visitor_text(
-                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
                     )
                 text = ""
                 memo_cm = cm_matrix.copy()
@@ -1962,7 +1962,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 output += text
                 if visitor_text is not None:
                     visitor_text(
-                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
                     )
                 text = ""
                 memo_cm = cm_matrix.copy()
@@ -1998,11 +1998,9 @@ def process_operation(operator: bytes, operands: List) -> None:
                 output += text
                 if visitor_text is not None:
                     visitor_text(
-                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
                     )
                 text = ""
-                memo_cm = cm_matrix.copy()
-                memo_tm = tm_matrix.copy()
                 cm_matrix = mult(
                     [
                         float(operands[0]),
@@ -2014,6 +2012,8 @@ def process_operation(operator: bytes, operands: List) -> None:
                     ],
                     cm_matrix,
                 )
+                memo_cm = cm_matrix.copy()
+                memo_tm = tm_matrix.copy()
             # Table 5.2 page 398
             elif operator == b"Tz":
                 char_scale = float(operands[0]) / 100.0
@@ -2026,7 +2026,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                     output += text  # .translate(cmap)
                     if visitor_text is not None:
                         visitor_text(
-                            text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                            text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
                         )
                 text = ""
                 memo_cm = cm_matrix.copy()
@@ -2148,7 +2148,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 output += text
                 if visitor_text is not None:
                     visitor_text(
-                        text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size
+                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
                     )
                 try:
                     if output[-1] != "\n":
@@ -2156,7 +2156,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                         if visitor_text is not None:
                             visitor_text(
                                 "\n",
-                                mult(memo_cm, memo_tm),
+                                mult(memo_tm, memo_cm),
                                 memo_tm,
                                 cmap[3],
                                 font_size,
@@ -2178,7 +2178,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                         if visitor_text is not None:
                             visitor_text(
                                 text,
-                                mult(memo_cm, memo_tm),
+                                mult(memo_tm, memo_cm),
                                 memo_tm,
                                 cmap[3],
                                 font_size,
@@ -2201,7 +2201,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 )
         output += text  # just in case of
         if text != "" and visitor_text is not None:
-            visitor_text(text, mult(memo_cm, memo_tm), memo_tm, cmap[3], font_size)
+            visitor_text(text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size)
         return output
 
     def extract_text(
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index 2872f802a..a7f14f8af 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -124,7 +124,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            mult(memo_cm, memo_tm),
+                            mult(memo_tm, memo_cm),
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -143,7 +143,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            mult(memo_cm, memo_tm),
+                            mult(memo_tm, memo_cm),
                             memo_tm,
                             cmap[3],
                             font_size,
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index de39c1ace..f2836ad55 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -82,8 +82,8 @@ def test_visitor_text_matrices(file_name, constraints):
     lines = []
 
     def visitor_text(text, cm, tm, font_dict, font_size) -> None:
-        x = tm[4] * cm[0] + tm[5] * cm[2] + cm[4]  # mult(tm, cm)[4]
-        y = tm[4] * cm[1] + tm[5] * cm[3] + cm[5]  # mult(tm, cm)[5]
+        x = cm[4]  # used to be tm[4] * cm[0] + tm[5] * cm[2] + cm[4]  # mult(tm, cm)[4]
+        y = cm[5]  # used to be tm[4] * cm[1] + tm[5] * cm[3] + cm[5]  # mult(tm, cm)[5]
         lines.append({"text": text, "x": x, "y": y})
 
     reader.pages[0].extract_text(visitor_text=visitor_text)

From ff71b08b38bf6a20511e55e81befa5a8819ffed3 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 1 Oct 2023 10:44:10 +0200
Subject: [PATCH 08/11] revert to tm/cm independent in visitors

revert based on https://github.com/py-pdf/pypdf/discussions/2163#discussioncomment-7116034
---
 docs/user/extract-text.md          | 12 +++++++----
 pypdf/__init__.py                  |  3 ++-
 pypdf/_page.py                     | 34 +++++++++---------------------
 pypdf/_text_extraction/__init__.py |  8 +++----
 tests/test_text_extraction.py      |  9 +++++---
 5 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
index 76477f0bf..8e04bfeeb 100644
--- a/docs/user/extract-text.md
+++ b/docs/user/extract-text.md
@@ -28,21 +28,25 @@ You can use visitor-functions to control which part of a page you want to proces
 
 The function provided in argument visitor_text of function extract_text has five arguments:
 * text : the current text (as long as possible, can be up to a full line)
-* user_matrix: current matrix in user coordinate space
-* tm_matrix: current matrix in text coordinate space
+* user_matrix: current matrix to move from user coordinate space(aka. CTM)
+* tm_matrix: current matrix from text coordinate space
 * font-dictionary: full font dictionary
 * font-size: the size (in text coordinate space)
 
 the matrix stores 6 parameters. the 4 first provides the rotation/scaling matrix and the last two provides the translation (horizontal/vertical)
 it is recommended to use the user_matrix as it takes into all transformations.
-(note: the cm matrix which is provided is already the product of the matrices cm_matrix and tm_matrix described in the pdf specification)
+
+notes :
+ - as indicated in pdf 1.7 refeence, page 204 the user matrix applies to text space/image space/form space/pattern space.
+ - if you want to get the full transformation from text to user space, you can use the mult function (availalbe in global import) as follow :
+`txt2user = mult(tm, cm))`
 the font-size is the raw text size, that is affected by the user_matrix
 
 
 The font-dictionary may be None in case of unknown fonts.
 If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
 
-**Caveat**: In complicated documents the calculated positions might be wrong.
+**Caveat**: In complicated documents the calculated positions may be difficult to determine (for example, if you move from multiple forms to page user space).
 
 The function provided in argument visitor_operand_before has four arguments:
 operator, operand-arguments, current transformation matrix and text matrix.
diff --git a/pypdf/__init__.py b/pypdf/__init__.py
index 250c05564..df07b5306 100644
--- a/pypdf/__init__.py
+++ b/pypdf/__init__.py
@@ -10,7 +10,7 @@
 from ._crypt_providers import crypt_provider
 from ._encryption import PasswordType
 from ._merger import PdfFileMerger, PdfMerger
-from ._page import PageObject, Transformation
+from ._page import PageObject, Transformation, mult
 from ._reader import DocumentInformation, PdfFileReader, PdfReader
 from ._version import __version__
 from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
@@ -31,6 +31,7 @@
 __all__ = [
     "__version__",
     "_debug_versions",
+    "mult",
     "PageRange",
     "PaperSize",
     "DocumentInformation",
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 1b8cab63d..b08facf98 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1951,9 +1951,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
                 output += text
                 if visitor_text is not None:
-                    visitor_text(
-                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
-                    )
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -1961,9 +1959,7 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"ET":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(
-                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
-                    )
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -1997,9 +1993,7 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"cm":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(
-                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
-                    )
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 cm_matrix = mult(
                     [
@@ -2025,9 +2019,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                 if text != "":
                     output += text  # .translate(cmap)
                     if visitor_text is not None:
-                        visitor_text(
-                            text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
-                        )
+                        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 text = ""
                 memo_cm = cm_matrix.copy()
                 memo_tm = tm_matrix.copy()
@@ -2119,9 +2111,7 @@ def process_operation(operator: bytes, operands: List) -> None:
 
         for operands, operator in content.operations:
             if visitor_operand_before is not None:
-                visitor_operand_before(
-                    operator, operands, mult(cm_matrix, tm_matrix), tm_matrix
-                )
+                visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
             # multiple operators are defined in here ####
             if operator == b"'":
                 process_operation(b"T*", [])
@@ -2147,16 +2137,14 @@ def process_operation(operator: bytes, operands: List) -> None:
             elif operator == b"Do":
                 output += text
                 if visitor_text is not None:
-                    visitor_text(
-                        text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size
-                    )
+                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
                 try:
                     if output[-1] != "\n":
                         output += "\n"
                         if visitor_text is not None:
                             visitor_text(
                                 "\n",
-                                mult(memo_tm, memo_cm),
+                                memo_cm,
                                 memo_tm,
                                 cmap[3],
                                 font_size,
@@ -2178,7 +2166,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                         if visitor_text is not None:
                             visitor_text(
                                 text,
-                                mult(memo_tm, memo_cm),
+                                memo_cm,
                                 memo_tm,
                                 cmap[3],
                                 font_size,
@@ -2196,12 +2184,10 @@ def process_operation(operator: bytes, operands: List) -> None:
             else:
                 process_operation(operator, operands)
             if visitor_operand_after is not None:
-                visitor_operand_after(
-                    operator, operands, mult(cm_matrix, tm_matrix), tm_matrix
-                )
+                visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
         output += text  # just in case of
         if text != "" and visitor_text is not None:
-            visitor_text(text, mult(memo_tm, memo_cm), memo_tm, cmap[3], font_size)
+            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
         return output
 
     def extract_text(
diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py
index a7f14f8af..ea8adf56c 100644
--- a/pypdf/_text_extraction/__init__.py
+++ b/pypdf/_text_extraction/__init__.py
@@ -124,7 +124,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            mult(memo_tm, memo_cm),
+                            memo_cm,
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -143,7 +143,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            mult(memo_tm, memo_cm),
+                            memo_cm,
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -162,7 +162,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            mult(memo_cm, memo_tm),
+                            memo_cm,
                             memo_tm,
                             cmap[3],
                             font_size,
@@ -181,7 +181,7 @@ def crlf_space_check(
                     if visitor_text is not None:
                         visitor_text(
                             text + "\n",
-                            mult(memo_cm, memo_tm),
+                            memo_cm,
                             memo_tm,
                             cmap[3],
                             font_size,
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index f2836ad55..790ce6cf6 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -7,7 +7,7 @@
 
 import pytest
 
-from pypdf import PdfReader
+from pypdf import PdfReader, mult
 from pypdf._text_extraction import set_custom_rtl
 
 TESTS_ROOT = Path(__file__).parent.resolve()
@@ -82,8 +82,11 @@ def test_visitor_text_matrices(file_name, constraints):
     lines = []
 
     def visitor_text(text, cm, tm, font_dict, font_size) -> None:
-        x = cm[4]  # used to be tm[4] * cm[0] + tm[5] * cm[2] + cm[4]  # mult(tm, cm)[4]
-        y = cm[5]  # used to be tm[4] * cm[1] + tm[5] * cm[3] + cm[5]  # mult(tm, cm)[5]
+        ctm = mult(tm, cm)
+        x = ctm[4]  # used to tm[4] * cm[0] + tm[5] * cm[2] + cm[4]  # mult(tm, cm)[4]
+        y = ctm[
+            5
+        ]  # used to be tm[4] * cm[1] + tm[5] * cm[3] + cm[5]  # mult(tm, cm)[5]
         lines.append({"text": text, "x": x, "y": y})
 
     reader.pages[0].extract_text(visitor_text=visitor_text)

From 1153081d86b5a25e9e898ba276860fdc208e0803 Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Oct 2023 09:18:43 +0200
Subject: [PATCH 09/11] Update docs/user/extract-text.md

Co-authored-by: Martin Thoma <info@martin-thoma.de>
---
 docs/user/extract-text.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
index 8e04bfeeb..a2a96c2b6 100644
--- a/docs/user/extract-text.md
+++ b/docs/user/extract-text.md
@@ -28,7 +28,7 @@ You can use visitor-functions to control which part of a page you want to proces
 
 The function provided in argument visitor_text of function extract_text has five arguments:
 * text : the current text (as long as possible, can be up to a full line)
-* user_matrix: current matrix to move from user coordinate space(aka. CTM)
+* user_matrix: current matrix to move from user coordinate space (also known as CTM)
 * tm_matrix: current matrix from text coordinate space
 * font-dictionary: full font dictionary
 * font-size: the size (in text coordinate space)

From fb5377f89b51afb0fdebaa47836fd1be0003afef Mon Sep 17 00:00:00 2001
From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com>
Date: Sun, 8 Oct 2023 09:19:01 +0200
Subject: [PATCH 10/11] Update docs/user/extract-text.md

Co-authored-by: Martin Thoma <info@martin-thoma.de>
---
 docs/user/extract-text.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
index a2a96c2b6..b23201943 100644
--- a/docs/user/extract-text.md
+++ b/docs/user/extract-text.md
@@ -27,7 +27,7 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
 You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.
 
 The function provided in argument visitor_text of function extract_text has five arguments:
-* text : the current text (as long as possible, can be up to a full line)
+* text: the current text (as long as possible, can be up to a full line)
 * user_matrix: current matrix to move from user coordinate space (also known as CTM)
 * tm_matrix: current matrix from text coordinate space
 * font-dictionary: full font dictionary

From da0a7ebc7b8960289ea3b8a6b6fc247cf467c59d Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 8 Oct 2023 09:22:47 +0200
Subject: [PATCH 11/11] Just some typos/stylistic changes

---
 docs/user/extract-text.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md
index b23201943..649f723f6 100644
--- a/docs/user/extract-text.md
+++ b/docs/user/extract-text.md
@@ -33,14 +33,15 @@ The function provided in argument visitor_text of function extract_text has five
 * font-dictionary: full font dictionary
 * font-size: the size (in text coordinate space)
 
-the matrix stores 6 parameters. the 4 first provides the rotation/scaling matrix and the last two provides the translation (horizontal/vertical)
-it is recommended to use the user_matrix as it takes into all transformations.
+The matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical).
+It is recommended to use the user_matrix as it takes into account all transformations.
 
-notes :
- - as indicated in pdf 1.7 refeence, page 204 the user matrix applies to text space/image space/form space/pattern space.
- - if you want to get the full transformation from text to user space, you can use the mult function (availalbe in global import) as follow :
+Notes:
+
+ - as indicated in the PDF 1.7 reference, page 204, the user matrix applies to text space/image space/form space/pattern space.
+ - if you want to get the full transformation from text to user space, you can use the `mult` function (available in global import) as follows:
 `txt2user = mult(tm, cm))`
-the font-size is the raw text size, that is affected by the user_matrix
+The font-size is the raw text size, which is affected by the `user_matrix`.
 
 
 The font-dictionary may be None in case of unknown fonts.