MAINT: Text extraction improvements

Credits to pubpub-zz; see #1118 (comment).

Co-authored-by: pubpub-zz <[email protected]>
py-pdf · Jul 17, 2022 · 7740a6e · 7740a6e
1 parent ae0ff49
commit 7740a6e
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 11 deletions.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -251,20 +251,25 @@ def parse_to_unicode(
         elif process_char:
             lst = [x for x in l.split(b" ") if x]
             map_dict[-1] = len(lst[0]) // 2
-            while len(lst) > 1:
-                map_to = ""
-                # placeholder (see above) means empty string
-                if lst[1] != b".":
-                    map_to = unhexlify(lst[1]).decode(
-                        "utf-16-be", "surrogatepass"
-                    )  # join is here as some cases where the code was split
+            if len(lst) == 1:
+                # some cases where the 2nd param is empty (seems not in accordance with the PDF spec)
                 map_dict[
                     unhexlify(lst[0]).decode(
                         "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
                     )
-                ] = map_to
-                int_entry.append(int(lst[0], 16))
-                lst = lst[2:]
+                ] = ""
+            else:
+                while len(lst) > 0:
+                    map_dict[
+                        unhexlify(lst[0]).decode(
+                            "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                            "surrogatepass",
+                        )
+                    ] = unhexlify(lst[1]).decode(
+                        "utf-16-be", "surrogatepass"
+                    )  # join is here as some cases where the code was split
+                    int_entry.append(int(lst[0], 16))
+                    lst = lst[2:]
     for a, value in map_dict.items():
         if value == " ":
             space_code = a

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1383,7 +1383,6 @@ def process_operation(operator: bytes, operands: List) -> None:
                     if isinstance(op, (int, float, NumberObject, FloatObject)):
                         if (
                             (abs(float(op)) >= _space_width)
-                            and (abs(float(op)) <= 8 * _space_width)
                             and (len(text) > 0)
                             and (text[-1] != " ")
                         ):