Merge branch 'add-with-as-usage-#1108' of https://github.com/JianzhengLuo/PyPDF2 into add-with-as-usage-#1108
py-pdf · Jul 21, 2022 · 336053a · 336053a
2 parents 519dad1 + 562ebc7
commit 336053a
Show file tree

Hide file tree

Showing 11 changed files with 222 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,40 @@
 # CHANGELOG
 
+## Version 2.6.0, 2022-07-17
+
+### New Features (ENH)
+-  Add color and font_format to PdfReader.outlines[i] (#1104)
+-  Extract Text Enhancement (whitespaces) (#1084)
+
+### Bug Fixes (BUG)
+-  Use `build_destination` for named destination outlines (#1128)
+-  Avoid a crash when a ToUnicode CMap has an empty dstString in beginbfchar (#1118)
+-  Prevent deduplication of PageObject (#1105)
+-  None-check in DictionaryObject.read_from_stream (#1113)
+-  Avoid IndexError in _cmap.parse_to_unicode (#1110)
+
+### Documentation (DOC)
+-  Explanation for git submodule
+-  Watermark and stamp (#1095)
+
+### Maintenance (MAINT)
+-  Text extraction improvements (#1126)
+-  Destination.color returns ArrayObject instead of tuple as fallback (#1119)
+-  Use add_bookmark_destination in add_bookmark (#1100)
+-  Use add_bookmark_destination in add_bookmark_dict (#1099)
+
+### Testing (TST)
+-  Add test for arab text (#1127)
+-  Add xfail for decryption fail (#1125)
+-  Add xfail test for IndexError when extracting text (#1124)
+-  Add MCVE showing outline title issue (#1123)
+
+### Code Style (STY)
+-  Use IntFlag for permissions_flag / update_page_form_field_values (#1094)
+-  Simplify code (#1101)
+
+Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.5.0...2.6.0
+
 ## Version 2.5.0, 2022-07-10
 
 ### New Features (ENH)

diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -191,7 +191,13 @@ def parse_to_unicode(
     for i in range(len(ll)):
         j = ll[i].find(b">")
         if j >= 0:
-            ll[i] = ll[i][:j].replace(b" ", b"") + b" " + ll[i][j + 1 :]
+            if j == 0:
+                # string is empty: stash a placeholder here (see below)
+                # see https://github.com/py-pdf/PyPDF2/issues/1111
+                content = b"."
+            else:
+                content = ll[i][:j].replace(b" ", b"")
+            ll[i] = content + b" " + ll[i][j + 1 :]
     cm = (
         (b" ".join(ll))
         .replace(b"[", b" [ ")
@@ -246,13 +252,17 @@ def parse_to_unicode(
             lst = [x for x in l.split(b" ") if x]
             map_dict[-1] = len(lst[0]) // 2
             while len(lst) > 1:
+                map_to = ""
+                # placeholder (see above) means empty string
+                if lst[1] != b".":
+                    map_to = unhexlify(lst[1]).decode(
+                        "utf-16-be", "surrogatepass"
+                    )  # join is here as some cases where the code was split
                 map_dict[
                     unhexlify(lst[0]).decode(
                         "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
                     )
-                ] = unhexlify(lst[1]).decode(
-                    "utf-16-be", "surrogatepass"
-                )  # join is here as some cases where the code was split
+                ] = map_to
                 int_entry.append(int(lst[0], 16))
                 lst = lst[2:]
     for a, value in map_dict.items():
@@ -269,7 +279,7 @@ def compute_space_width(
     w1 = {}
     st: int = 0
     if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
-        ft1 = ft["/DescendantFonts"][0].get_object()    # type: ignore
+        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
         try:
             w1[-1] = cast(float, ft1["/DW"])
         except Exception:

diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1383,7 +1383,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                     if isinstance(op, (int, float, NumberObject, FloatObject)):
                         if (
                             (abs(float(op)) >= _space_width)
-                            and (abs(float(op)) <= 8 * _space_width)
+                            and (len(text) > 0)
                             and (text[-1] != " ")
                         ):
                             process_operation(b"Tj", [" "])

diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py
@@ -834,12 +834,11 @@ def _build_outline(self, node: DictionaryObject) -> Optional[Destination]:
             if isinstance(dest, ArrayObject):
                 outline = self._build_destination(title, dest)  # type: ignore
             elif isinstance(dest, str) and dest in self._namedDests:
-                outline = self._namedDests[dest]
-                outline[NameObject("/Title")] = title  # type: ignore
+                outline = self._build_destination(title, self._namedDests[dest].dest_array)  # type: ignore
             else:
                 raise PdfReadError(f"Unexpected destination {dest!r}")
 
-        # if outline created, add color and format if present
+        # if outline created, add color, format, and child count if present
         if outline:
             if "/C" in node:
                 # Color of outline in (R, G, B) with values ranging 0.0-1.0
@@ -848,6 +847,10 @@ def _build_outline(self, node: DictionaryObject) -> Optional[Destination]:
                 # specifies style characteristics bold and/or italic
                 # 1=italic, 2=bold, 3=both
                 outline[NameObject("/F")] = node["/F"]
+            if "/Count" in node:
+                # absolute value = num. visible children
+                # positive = open/unfolded, negative = closed/folded
+                outline[NameObject("/Count")] = node["/Count"]
 
         return outline
 

diff --git a/PyPDF2/_version.py b/PyPDF2/_version.py
@@ -1 +1 @@
-__version__ = "2.5.0"
+__version__ = "2.6.0"
diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py
@@ -1431,7 +1431,7 @@ def removeText(
     def add_uri(
         self,
         pagenum: int,
-        uri: int,
+        uri: str,
         rect: RectangleObject,
         border: Optional[ArrayObject] = None,
     ) -> None:
@@ -1440,7 +1440,7 @@ def add_uri(
         This uses the basic structure of :meth:`add_link`
 
         :param int pagenum: index of the page on which to place the URI action.
-        :param int uri: string -- uri of resource to link to.
+        :param str uri: URI of resource to link to.
         :param rect: :class:`RectangleObject<PyPDF2.generic.RectangleObject>` or array of four
             integers specifying the clickable rectangular area
             ``[xLL, yLL, xUR, yUR]``, or string in the form ``"[ xLL yLL xUR yUR ]"``.
@@ -1498,7 +1498,7 @@ def add_uri(
     def addURI(
         self,
         pagenum: int,
-        uri: int,
+        uri: str,
         rect: RectangleObject,
         border: Optional[ArrayObject] = None,
     ) -> None:  # pragma: no cover

diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py
@@ -36,6 +36,7 @@
 import logging
 import re
 import warnings
+from enum import IntFlag
 from io import BytesIO
 from typing import (
     Any,
@@ -48,7 +49,7 @@
     Union,
     cast,
 )
-from enum import IntFlag
+
 from ._codecs import (  # noqa: rev_encoding
     _pdfdoc_encoding,
     _pdfdoc_encoding_rev,
@@ -1891,13 +1892,25 @@ def bottom(self) -> Optional[FloatObject]:
     @property
     def color(self) -> Optional[ArrayObject]:
         """Read-only property accessing the color in (R, G, B) with values 0.0-1.0"""
-        return self.get("/C", ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)]))
+        return self.get(
+            "/C", ArrayObject([FloatObject(0), FloatObject(0), FloatObject(0)])
+        )
 
     @property
     def font_format(self) -> Optional[OutlineFontFlag]:
         """Read-only property accessing the font type. 1=italic, 2=bold, 3=both"""
         return self.get("/F", 0)
 
+    @property
+    def outline_count(self) -> Optional[int]:
+        """
+        Read-only property accessing the outline count.
+        positive = expanded
+        negative = collapsed
+        absolute value = number of visible descendants at all levels
+        """
+        return self.get("/Count", None)
+
 
 class Bookmark(Destination):
     def write_to_stream(

diff --git a/docs/dev/intro.md b/docs/dev/intro.md
@@ -31,6 +31,12 @@ most cases we typically want to test for. The `sample-files` might cover a lot
 more edge cases, the behavior we get when file sizes get bigger, different
 PDF producers.
 
+In order to get the sample-files folder, you need to execute:
+
+```
+git submodule update --init
+```
+
 ## Tools: git and pre-commit
 
 Git is a command line application for version control. If you don't know it,
@@ -67,6 +73,8 @@ The `PREFIX` can be:
 * `ENH`: A new feature! Describe in the body what it can be used for.
 * `DEP`: A deprecation - either marking something as "this is going to be removed"
    or actually removing it.
+* `PI`: A performance improvement. This could also be a reduction in the
+        file size of PDF files generated by PyPDF2.
 * `ROB`: A robustness change. Dealing better with broken PDF files.
 * `DOC`: A documentation change.
 * `TST`: Adding / adjusting tests.

diff --git a/sample-files b/sample-files
diff --git a/tests/test_page.py b/tests/test_page.py
@@ -2,6 +2,7 @@
 import os
 from copy import deepcopy
 from io import BytesIO
+from pathlib import Path
 
 import pytest
 
@@ -16,11 +17,11 @@
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
 PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
 RESOURCE_ROOT = os.path.join(PROJECT_ROOT, "resources")
-EXTERNAL_ROOT = os.path.join(PROJECT_ROOT, "sample-files")
+EXTERNAL_ROOT = Path(PROJECT_ROOT) / "sample-files"
 
 
 def get_all_sample_files():
-    with open(os.path.join(EXTERNAL_ROOT, "files.json")) as fp:
+    with open(EXTERNAL_ROOT / "files.json") as fp:
         data = fp.read()
     meta = json.loads(data)
     return meta
@@ -37,7 +38,7 @@ def get_all_sample_files():
 )
 @pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
 def test_read(meta):
-    pdf_path = os.path.join(EXTERNAL_ROOT, meta["path"])
+    pdf_path = EXTERNAL_ROOT / meta["path"]
     reader = PdfReader(pdf_path)
     reader.pages[0]
     assert len(reader.pages) == meta["pages"]
@@ -322,3 +323,29 @@ def test_get_fonts(pdf_path, password, embedded, unembedded):
         a = a.union(a_tmp)
         b = b.union(b_tmp)
     assert (a, b) == (embedded, unembedded)
+
+
+@pytest.mark.xfail(reason="#1091")
+def test_text_extraction_issue_1091():
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/966/966635.pdf"
+    name = "tika-966635.pdf"
+    stream = BytesIO(get_pdf_from_url(url, name=name))
+    with pytest.warns(PdfReadWarning):
+        reader = PdfReader(stream)
+    for page in reader.pages:
+        page.extract_text()
+
+
+@pytest.mark.xfail(reason="#1088")
+def test_empyt_password_1088():
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf"
+    name = "tika-941536.pdf"
+    stream = BytesIO(get_pdf_from_url(url, name=name))
+    reader = PdfReader(stream)
+    len(reader.pages)
+
+
+@pytest.mark.xfail(reason="#1088 / #1126")
+def test_arab_text_extraction():
+    reader = PdfReader(EXTERNAL_ROOT / "015-arabic/habibi.pdf")
+    assert reader.pages[0].extract_text() == "habibi حَبيبي"