Skip to content

Commit

Permalink
Merge branch 'add-with-as-usage-#1108' of https://github.com/Jianzhen…
Browse files Browse the repository at this point in the history
…gLuo/PyPDF2 into add-with-as-usage-#1108
  • Loading branch information
JianzhengLuo committed Jul 21, 2022
2 parents 519dad1 + 562ebc7 commit 336053a
Show file tree
Hide file tree
Showing 11 changed files with 222 additions and 19 deletions.
35 changes: 35 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,40 @@
# CHANGELOG

## Version 2.6.0, 2022-07-17

### New Features (ENH)
- Add color and font_format to PdfReader.outlines[i] (#1104)
- Extract Text Enhancement (whitespaces) (#1084)

### Bug Fixes (BUG)
- Use `build_destination` for named destination outlines (#1128)
- Avoid a crash when a ToUnicode CMap has an empty dstString in beginbfchar (#1118)
- Prevent deduplication of PageObject (#1105)
- None-check in DictionaryObject.read_from_stream (#1113)
- Avoid IndexError in _cmap.parse_to_unicode (#1110)

### Documentation (DOC)
- Explanation for git submodule
- Watermark and stamp (#1095)

### Maintenance (MAINT)
- Text extraction improvements (#1126)
- Destination.color returns ArrayObject instead of tuple as fallback (#1119)
- Use add_bookmark_destination in add_bookmark (#1100)
- Use add_bookmark_destination in add_bookmark_dict (#1099)

### Testing (TST)
- Add test for Arabic text (#1127)
- Add xfail for decryption fail (#1125)
- Add xfail test for IndexError when extracting text (#1124)
- Add MCVE showing outline title issue (#1123)

### Code Style (STY)
- Use IntFlag for permissions_flag / update_page_form_field_values (#1094)
- Simplify code (#1101)

Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.5.0...2.6.0

## Version 2.5.0, 2022-07-10

### New Features (ENH)
Expand Down
20 changes: 15 additions & 5 deletions PyPDF2/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,13 @@ def parse_to_unicode(
for i in range(len(ll)):
j = ll[i].find(b">")
if j >= 0:
ll[i] = ll[i][:j].replace(b" ", b"") + b" " + ll[i][j + 1 :]
if j == 0:
# string is empty: stash a placeholder here (see below)
# see https://github.com/py-pdf/PyPDF2/issues/1111
content = b"."
else:
content = ll[i][:j].replace(b" ", b"")
ll[i] = content + b" " + ll[i][j + 1 :]
cm = (
(b" ".join(ll))
.replace(b"[", b" [ ")
Expand Down Expand Up @@ -246,13 +252,17 @@ def parse_to_unicode(
lst = [x for x in l.split(b" ") if x]
map_dict[-1] = len(lst[0]) // 2
while len(lst) > 1:
map_to = ""
# placeholder (see above) means empty string
if lst[1] != b".":
map_to = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
map_dict[
unhexlify(lst[0]).decode(
"charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
)
] = unhexlify(lst[1]).decode(
"utf-16-be", "surrogatepass"
) # join is here as some cases where the code was split
] = map_to
int_entry.append(int(lst[0], 16))
lst = lst[2:]
for a, value in map_dict.items():
Expand All @@ -269,7 +279,7 @@ def compute_space_width(
w1 = {}
st: int = 0
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
try:
w1[-1] = cast(float, ft1["/DW"])
except Exception:
Expand Down
2 changes: 1 addition & 1 deletion PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1383,7 +1383,7 @@ def process_operation(operator: bytes, operands: List) -> None:
if isinstance(op, (int, float, NumberObject, FloatObject)):
if (
(abs(float(op)) >= _space_width)
and (abs(float(op)) <= 8 * _space_width)
and (len(text) > 0)
and (text[-1] != " ")
):
process_operation(b"Tj", [" "])
Expand Down
9 changes: 6 additions & 3 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,12 +834,11 @@ def _build_outline(self, node: DictionaryObject) -> Optional[Destination]:
if isinstance(dest, ArrayObject):
outline = self._build_destination(title, dest) # type: ignore
elif isinstance(dest, str) and dest in self._namedDests:
outline = self._namedDests[dest]
outline[NameObject("/Title")] = title # type: ignore
outline = self._build_destination(title, self._namedDests[dest].dest_array) # type: ignore
else:
raise PdfReadError(f"Unexpected destination {dest!r}")

# if outline created, add color and format if present
# if outline created, add color, format, and child count if present
if outline:
if "/C" in node:
# Color of outline in (R, G, B) with values ranging 0.0-1.0
Expand All @@ -848,6 +847,10 @@ def _build_outline(self, node: DictionaryObject) -> Optional[Destination]:
# specifies style characteristics bold and/or italic
# 1=italic, 2=bold, 3=both
outline[NameObject("/F")] = node["/F"]
if "/Count" in node:
# absolute value = num. visible children
# positive = open/unfolded, negative = closed/folded
outline[NameObject("/Count")] = node["/Count"]

return outline

Expand Down
2 changes: 1 addition & 1 deletion PyPDF2/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.5.0"
__version__ = "2.6.0"
6 changes: 3 additions & 3 deletions PyPDF2/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,7 +1431,7 @@ def removeText(
def add_uri(
self,
pagenum: int,
uri: int,
uri: str,
rect: RectangleObject,
border: Optional[ArrayObject] = None,
) -> None:
Expand All @@ -1440,7 +1440,7 @@ def add_uri(
This uses the basic structure of :meth:`add_link`
:param int pagenum: index of the page on which to place the URI action.
:param int uri: string -- uri of resource to link to.
:param str uri: URI of resource to link to.
:param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
integers specifying the clickable rectangular area
``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
Expand Down Expand Up @@ -1498,7 +1498,7 @@ def add_uri(
def addURI(
self,
pagenum: int,
uri: int,
uri: str,
rect: RectangleObject,
border: Optional[ArrayObject] = None,
) -> None: # pragma: no cover
Expand Down
17 changes: 15 additions & 2 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import logging
import re
import warnings
from enum import IntFlag
from io import BytesIO
from typing import (
Any,
Expand All @@ -48,7 +49,7 @@
Union,
cast,
)
from enum import IntFlag

from ._codecs import ( # noqa: rev_encoding
_pdfdoc_encoding,
_pdfdoc_encoding_rev,
Expand Down Expand Up @@ -1891,13 +1892,25 @@ def bottom(self) -> Optional[FloatObject]:
@property
def color(self) -> Optional[ArrayObject]:
"""Read-only property accessing the color in (R, G, B) with values 0.0-1.0"""
return self.get("/C", ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)]))
return self.get(
"/C", ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)])
)

@property
def font_format(self) -> Optional[OutlineFontFlag]:
"""
Read-only property accessing the font type. 1=italic, 2=bold, 3=both.

Looks up the "/F" entry and falls back to 0 when it is absent.
"""
# NOTE(review): the fallback is the plain int 0, not None, even though the
# annotation is Optional[OutlineFontFlag] -- confirm this is intended.
return self.get("/F", 0)

@property
def outline_count(self) -> Optional[int]:
"""
Read-only property accessing the outline count.

Looks up the "/Count" entry and falls back to None when it is absent.
positive = expanded
negative = collapsed
absolute value = number of visible descendants at all levels
"""
return self.get("/Count", None)


class Bookmark(Destination):
def write_to_stream(
Expand Down
8 changes: 8 additions & 0 deletions docs/dev/intro.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ most cases we typically want to test for. The `sample-files` might cover a lot
more edge cases, the behavior we get when file sizes get bigger, and different
PDF producers.

In order to get the sample-files folder, you need to execute:

```
git submodule update --init
```

## Tools: git and pre-commit

Git is a command line application for version control. If you don't know it,
Expand Down Expand Up @@ -67,6 +73,8 @@ The `PREFIX` can be:
* `ENH`: A new feature! Describe in the body what it can be used for.
* `DEP`: A deprecation - either marking something as "this is going to be removed"
or actually removing it.
* `PI`: A performance improvement. This could also be a reduction in the
file size of PDF files generated by PyPDF2.
* `ROB`: A robustness change. Dealing better with broken PDF files.
* `DOC`: A documentation change.
* `TST`: Adding / adjusting tests.
Expand Down
33 changes: 30 additions & 3 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
from copy import deepcopy
from io import BytesIO
from pathlib import Path

import pytest

Expand All @@ -16,11 +17,11 @@
TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "resources")
EXTERNAL_ROOT = os.path.join(PROJECT_ROOT, "sample-files")
EXTERNAL_ROOT = Path(PROJECT_ROOT) / "sample-files"


def get_all_sample_files():
with open(os.path.join(EXTERNAL_ROOT, "files.json")) as fp:
with open(EXTERNAL_ROOT / "files.json") as fp:
data = fp.read()
meta = json.loads(data)
return meta
Expand All @@ -37,7 +38,7 @@ def get_all_sample_files():
)
@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
def test_read(meta):
pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"])
pdf_path = EXTERNAL_ROOT / meta["path"]
reader = PdfReader(pdf_path)
reader.pages[0]
assert len(reader.pages) == meta["pages"]
Expand Down Expand Up @@ -322,3 +323,29 @@ def test_get_fonts(pdf_path, password, embedded, unembedded):
a = a.union(a_tmp)
b = b.union(b_tmp)
assert (a, b) == (embedded, unembedded)


# Test for issue #1091: build a PdfReader from a PDF obtained through
# get_pdf_from_url (helper defined outside this excerpt) and call
# extract_text() on every page. Constructing the reader is expected to emit a
# PdfReadWarning. Marked xfail, so a failure is reported as expected.
@pytest.mark.xfail(reason="#1091")
def test_text_extraction_issue_1091():
url = "https://corpora.tika.apache.org/base/docs/govdocs1/966/966635.pdf"
name = "tika-966635.pdf"
stream = BytesIO(get_pdf_from_url(url, name=name))
with pytest.warns(PdfReadWarning):
reader = PdfReader(stream)
for page in reader.pages:
page.extract_text()


# Test for issue #1088 (marked xfail): build a PdfReader from a PDF obtained
# through get_pdf_from_url (helper defined outside this excerpt) and take the
# length of reader.pages. The length is discarded; nothing is asserted on it.
# NOTE(review): "empyt" in the function name looks like a typo for "empty";
# the name is left unchanged here.
@pytest.mark.xfail(reason="#1088")
def test_empyt_password_1088():
url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf"
name = "tika-941536.pdf"
stream = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(stream)
len(reader.pages)


# Checks that the text extracted from the first page of
# 015-arabic/habibi.pdf (under EXTERNAL_ROOT) equals the expected mixed
# Latin/Arabic string. Marked xfail with reference to #1088 / #1126.
@pytest.mark.xfail(reason="#1088 / #1126")
def test_arab_text_extraction():
reader = PdfReader(EXTERNAL_ROOT / "015-arabic/habibi.pdf")
assert reader.pages[0].extract_text() == "habibi حَبيبي"
Loading

0 comments on commit 336053a

Please sign in to comment.