Skip to content

Commit

Permalink
MAINT: Text extraction improvements
Browse files: browse the repository at this point in the history
Credits to pubpub-zz; see the linked comment:
#1118 (comment)

Co-authored-by: pubpub-zz <[email protected]>
  • Loading branch information
MartinThoma and pubpub-zz committed Jul 17, 2022
1 parent ae0ff49 commit 7740a6e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 11 deletions.
25 changes: 15 additions & 10 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,20 +251,25 @@ def parse_to_unicode(
elif process_char:
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
if len(lst) == 1:
# some case where the 2nd param is empty (seems not IAW pdfspec)
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
] = ""
else:
while len(lst) > 0:
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be",
"surrogatepass",
)
] = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
int_entry.append(int(lst[0], 16))
lst = lst[2:]
for a, value in map_dict.items():
if value == " ":
space_code = a
Expand Down
1 change: 0 additions & 1 deletion PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1383,7 +1383,6 @@ def process_operation(operator: bytes, operands: List) -> None:
if isinstance(op, (int, float, NumberObject, FloatObject)):
if (
(abs(float(op)) >= _space_width)
and (abs(float(op)) <= 8 * _space_width)
and (len(text) > 0)
and (text[-1] != " ")
):
Expand Down

0 comments on commit 7740a6e

Please sign in to comment.