From 7b852acb3350033a9f76fbc61f6e0d27f561b444 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Thu, 28 Jul 2022 19:42:44 +0200 Subject: [PATCH 1/5] DOC: We now have CMAP support (#1177) --- docs/user/pdf-version-support.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user/pdf-version-support.md b/docs/user/pdf-version-support.md index a588997ab..ad2033fb6 100644 --- a/docs/user/pdf-version-support.md +++ b/docs/user/pdf-version-support.md @@ -21,12 +21,12 @@ all features of PDF 2.0. | Feature | PDF-Version | PyPDF2 Support | | --------------------------------------- | ----------- | -------------- | | Transparent Graphics | 1.4 | ? | -| CMaps | 1.4 | ❌ [#201](https://github.com/py-pdf/PyPDF2/pull/201), [#464](https://github.com/py-pdf/PyPDF2/pull/464), [#805](https://github.com/py-pdf/PyPDF2/pull/805) | +| CMaps | 1.4 | ✅ | | Object Streams | 1.5 | ? | | Cross-reference Streams | 1.5 | ? | | Optional Content Groups (OCGs) - Layers | 1.5 | ? | | Content Stream Compression | 1.5 | ? | -| AES Encryption | 1.6 | ❌ [#749](https://github.com/py-pdf/PyPDF2/pull/749) | +| AES Encryption | 1.6 | ✅ | See [History of PDF](https://en.wikipedia.org/wiki/History_of_PDF) for more features. From 8d5037c590fbab28d9980962070d28a94dfd9be5 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Fri, 29 Jul 2022 08:47:25 +0200 Subject: [PATCH 2/5] DOC: Mention pyHanko for signing PDF documents (#1178) --- docs/user/pdf-version-support.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/user/pdf-version-support.md b/docs/user/pdf-version-support.md index ad2033fb6..e8312c752 100644 --- a/docs/user/pdf-version-support.md +++ b/docs/user/pdf-version-support.md @@ -30,3 +30,8 @@ all features of PDF 2.0. See [History of PDF](https://en.wikipedia.org/wiki/History_of_PDF) for more features. 
+ +Some PDF features are not supported by PyPDF2, but other libraries can be used +for them: + +* [pyHanko](https://pyhanko.readthedocs.io/en/latest/index.html): Cryptographically sign a PDF From 85ca871b007c23e0b5dcb8ab5915b63b1d9ac7e7 Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Fri, 29 Jul 2022 18:54:31 +0200 Subject: [PATCH 3/5] DOC: Table extraction (#1179) --- docs/user/pdf-version-support.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/user/pdf-version-support.md b/docs/user/pdf-version-support.md index e8312c752..4ac0adf13 100644 --- a/docs/user/pdf-version-support.md +++ b/docs/user/pdf-version-support.md @@ -34,4 +34,5 @@ features. Some PDF features are not supported by PyPDF2, but other libraries can be used for them: -* [pyHanko](https://pyhanko.readthedocs.io/en/latest/index.html): Cryptographically sign a PDF +* [pyHanko](https://pyhanko.readthedocs.io/en/latest/index.html): Cryptographically sign a PDF ([#302](https://github.com/py-pdf/PyPDF2/issues/302)) +* [camelot-py](https://pypi.org/project/camelot-py/): Table Extraction ([#231](https://github.com/py-pdf/PyPDF2/issues/231)) From 2d480685a72d665826dbd53f973173b34cf4c872 Mon Sep 17 00:00:00 2001 From: Mathieu Kniewallner Date: Fri, 29 Jul 2022 19:43:02 +0200 Subject: [PATCH 4/5] DOC: Update changelog url in package metadata (#1180) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 62d77a818..b43b87b97 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ url = https://pypdf2.readthedocs.io/en/latest/ project_urls = Source = https://github.com/py-pdf/PyPDF2 Bug Reports = https://github.com/py-pdf/PyPDF2/issues - Changelog = https://raw.githubusercontent.com/py-pdf/PyPDF2/main/CHANGELOG + Changelog = https://pypdf2.readthedocs.io/en/latest/meta/CHANGELOG.html classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers From 8c532a0ff13395b706d0ae1f183dd24bab577bfc Mon Sep 17 
00:00:00 2001 From: mtd91429 Date: Sat, 30 Jul 2022 00:09:41 -0500 Subject: [PATCH 5/5] MAINT: Consistent terminology for outline items (#1156) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR makes sure PyPDF2 uses a consistent nomenclature for the outline: * **Outline**: A document has exactly one outline (also called "table of contents", in short toc). That outline might be empty. * **Outline Item**: An element within an outline. This is also called a "bookmark" by some PDF viewers. This means that some names will be deprecated to ensure consistency: ## PdfReader * `outlines` ➔ `outline` * `_build_outline()` ➔ `_build_outline_item()` ## PdfWriter * Keep `get_outline_root()` * `add_bookmark_dict()` ➔ `add_outline_item_dict()` * `add_bookmark()` ➔ `add_outline_item()` ## PdfMerger * `find_bookmark()` ➔ `find_outline_item()` * `_write_bookmarks()` ➔ `_write_outline()` * `_write_bookmark_on_page()` ➔ `_write_outline_item_on_page()` * `_associate_bookmarks_to_pages()` ➔ `_associate_outline_items_to_pages()` * Keep `_trim_outline()` ## generic.py * `Bookmark` ➔ `OutlineItem` Closes #1048 Closes #1098 --- PyPDF2/_merger.py | 255 ++++++++++++++++++++++---------------- PyPDF2/_reader.py | 103 ++++++++------- PyPDF2/_utils.py | 44 ++++++- PyPDF2/_writer.py | 137 ++++++++++++++------ PyPDF2/generic.py | 20 +-- PyPDF2/types.py | 11 +- docs/user/extract-text.md | 4 +- tests/bench.py | 12 +- tests/test_generic.py | 10 +- tests/test_merger.py | 87 ++++++++----- tests/test_reader.py | 66 +++++----- tests/test_workflows.py | 2 +- tests/test_writer.py | 65 +++++++--- 13 files changed, 519 insertions(+), 297 deletions(-) diff --git a/PyPDF2/_merger.py b/PyPDF2/_merger.py index 6db5df4d7..78ee8c6bf 100644 --- a/PyPDF2/_merger.py +++ b/PyPDF2/_merger.py @@ -31,14 +31,13 @@ from ._encryption import Encryption from ._page import PageObject from ._reader import PdfReader -from ._utils import StrByteType, deprecate_with_replacement, str_ +from 
._utils import StrByteType, deprecate_with_replacement, deprecate_bookmark, str_ from ._writer import PdfWriter from .constants import GoToActionArguments from .constants import PagesAttributes as PA from .constants import TypArguments, TypFitArguments from .generic import ( ArrayObject, - Bookmark, Destination, DictionaryObject, FloatObject, @@ -46,11 +45,12 @@ NameObject, NullObject, NumberObject, + OutlineItem, TextStringObject, TreeObject, ) from .pagerange import PageRange, PageRangeSpec -from .types import FitType, LayoutType, OutlinesType, PagemodeType, ZoomArgType +from .types import FitType, LayoutType, OutlineType, PagemodeType, ZoomArgType ERR_CLOSED_WRITER = "close() was called and thus the writer cannot be used anymore" @@ -80,22 +80,24 @@ class PdfMerger: Defaults to ``False``. """ + @deprecate_bookmark(bookmarks="outline") def __init__(self, strict: bool = False) -> None: self.inputs: List[Tuple[Any, PdfReader, bool]] = [] self.pages: List[Any] = [] self.output: Optional[PdfWriter] = PdfWriter() - self.bookmarks: OutlinesType = [] + self.outline: OutlineType = [] self.named_dests: List[Any] = [] self.id_count = 0 self.strict = strict + @deprecate_bookmark(bookmark="outline_item", import_bookmarks="import_outline") def merge( self, position: int, fileobj: Union[StrByteType, PdfReader], - bookmark: Optional[str] = None, + outline_item: Optional[str] = None, pages: Optional[PageRangeSpec] = None, - import_bookmarks: bool = True, + import_outline: bool = True, ) -> None: """ Merge the pages from the given file into the output file at the @@ -108,17 +110,18 @@ def merge( read and seek methods similar to a File Object. Could also be a string representing a path to a PDF file. - :param str bookmark: Optionally, you may specify a bookmark to be - applied at the beginning of the included file by supplying the text - of the bookmark. 
+ :param str outline_item: Optionally, you may specify an outline item + (previously referred to as a 'bookmark') to be applied at the + beginning of the included file by supplying the text of the outline item. :param pages: can be a :class:`PageRange` or a ``(start, stop[, step])`` tuple to merge only the specified range of pages from the source document into the output document. - :param bool import_bookmarks: You may prevent the source document's - bookmarks from being imported by specifying this as ``False``. + :param bool import_outline: You may prevent the source document's + outline (collection of outline items, previously referred to as + 'bookmarks') from being imported by specifying this as ``False``. """ stream, my_file, encryption_obj = self._create_stream(fileobj) @@ -140,19 +143,19 @@ def merge( srcpages = [] outline = [] - if import_bookmarks: - outline = reader.outlines + if import_outline: + outline = reader.outline outline = self._trim_outline(reader, outline, pages) - if bookmark: - bookmark_typ = Bookmark( - TextStringObject(bookmark), + if outline_item: + outline_item_typ = OutlineItem( + TextStringObject(outline_item), NumberObject(self.id_count), NameObject(TypFitArguments.FIT), ) - self.bookmarks += [bookmark_typ, outline] # type: ignore + self.outline += [outline_item_typ, outline] # type: ignore else: - self.bookmarks += outline + self.outline += outline dests = reader.named_destinations trimmed_dests = self._trim_dests(reader, dests, pages) @@ -170,7 +173,7 @@ def merge( srcpages.append(mp) self._associate_dests_to_pages(srcpages) - self._associate_bookmarks_to_pages(srcpages) + self._associate_outline_items_to_pages(srcpages) # Slice to insert the pages at the specified position self.pages[position:position] = srcpages @@ -213,12 +216,13 @@ def _create_stream( stream = fileobj return stream, my_file, encryption_obj + @deprecate_bookmark(bookmark="outline_item", import_bookmarks="import_outline") def append( self, fileobj: 
Union[StrByteType, PdfReader], - bookmark: Optional[str] = None, + outline_item: Optional[str] = None, pages: Union[None, PageRange, Tuple[int, int], Tuple[int, int, int]] = None, - import_bookmarks: bool = True, + import_outline: bool = True, ) -> None: """ Identical to the :meth:`merge()` method, but assumes you want to @@ -229,19 +233,20 @@ def append( read and seek methods similar to a File Object. Could also be a string representing a path to a PDF file. - :param str bookmark: Optionally, you may specify a bookmark to be - applied at the beginning of the included file by supplying the text - of the bookmark. + :param str outline_item: Optionally, you may specify an outline item + (previously referred to as a 'bookmark') to be applied at the + beginning of the included file by supplying the text of the outline item. :param pages: can be a :class:`PageRange` or a ``(start, stop[, step])`` tuple to merge only the specified range of pages from the source document into the output document. - :param bool import_bookmarks: You may prevent the source document's - bookmarks from being imported by specifying this as ``False``. + :param bool import_outline: You may prevent the source document's + outline (collection of outline items, previously referred to as + 'bookmarks') from being imported by specifying this as ``False``. 
""" - self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks) + self.merge(len(self.pages), fileobj, outline_item, pages, import_outline) def write(self, fileobj: StrByteType) -> None: """ @@ -269,9 +274,9 @@ def write(self, fileobj: StrByteType) -> None: # idnum = self.output._objects.index(self.output._pages.get_object()[PA.KIDS][-1].get_object()) + 1 # page.out_pagedata = IndirectObject(idnum, 0, self.output) - # Once all pages are added, create bookmarks to point at those pages + # Once all pages are added, create outline items to point at those pages self._write_dests() - self._write_bookmarks() + self._write_outline() # Write the output to the file self.output.write(fileobj) @@ -366,9 +371,9 @@ def set_page_mode(self, mode: PagemodeType) -> None: :widths: 50 200 * - /UseNone - - Do not show outlines or thumbnails panels + - Do not show outline or thumbnails panels * - /UseOutlines - - Show outlines (aka bookmarks) panel + - Show outline (aka bookmarks) panel * - /UseThumbs - Show page thumbnails panel * - /FullScreen @@ -402,15 +407,15 @@ def _trim_dests( def _trim_outline( self, pdf: PdfReader, - outline: OutlinesType, + outline: OutlineType, pages: Union[Tuple[int, int], Tuple[int, int, int]], - ) -> OutlinesType: - """Remove outline/bookmark entries that are not a part of the specified page set.""" + ) -> OutlineType: + """Remove outline item entries that are not a part of the specified page set.""" new_outline = [] prev_header_added = True - for i, o in enumerate(outline): - if isinstance(o, list): - sub = self._trim_outline(pdf, o, pages) # type: ignore + for i, outline_item in enumerate(outline): + if isinstance(outline_item, list): + sub = self._trim_outline(pdf, outline_item, pages) # type: ignore if sub: if not prev_header_added: new_outline.append(outline[i - 1]) @@ -418,11 +423,11 @@ def _trim_outline( else: prev_header_added = False for j in range(*pages): - if o["/Page"] is None: + if outline_item["/Page"] is None: continue - if 
pdf.pages[j].get_object() == o["/Page"].get_object(): - o[NameObject("/Page")] = o["/Page"].get_object() - new_outline.append(o) + if pdf.pages[j].get_object() == outline_item["/Page"].get_object(): + outline_item[NameObject("/Page")] = outline_item["/Page"].get_object() + new_outline.append(outline_item) prev_header_added = True break return new_outline @@ -441,38 +446,40 @@ def _write_dests(self) -> None: if pageno is not None: self.output.add_named_destination_object(named_dest) - def _write_bookmarks( + @deprecate_bookmark(bookmarks="outline") + def _write_outline( self, - bookmarks: Optional[Iterable[Bookmark]] = None, + outline: Optional[Iterable[OutlineItem]] = None, parent: Optional[TreeObject] = None, ) -> None: if self.output is None: raise RuntimeError(ERR_CLOSED_WRITER) - if bookmarks is None: - bookmarks = self.bookmarks # type: ignore - assert bookmarks is not None, "hint for mypy" # TODO: is that true? + if outline is None: + outline = self.outline # type: ignore + assert outline is not None, "hint for mypy" # TODO: is that true? 
last_added = None - for bookmark in bookmarks: - if isinstance(bookmark, list): - self._write_bookmarks(bookmark, last_added) + for outline_item in outline: + if isinstance(outline_item, list): + self._write_outline(outline_item, last_added) continue page_no = None - if "/Page" in bookmark: + if "/Page" in outline_item: for page_no, page in enumerate(self.pages): # noqa: B007 - if page.id == bookmark["/Page"]: - self._write_bookmark_on_page(bookmark, page) + if page.id == outline_item["/Page"]: + self._write_outline_item_on_page(outline_item, page) break if page_no is not None: - del bookmark["/Page"], bookmark["/Type"] - last_added = self.output.add_bookmark_dict(bookmark, parent) + del outline_item["/Page"], outline_item["/Type"] + last_added = self.output.add_outline_item_dict(outline_item, parent) - def _write_bookmark_on_page( - self, bookmark: Union[Bookmark, Destination], page: _MergedPage + @deprecate_bookmark(bookmark="outline_item") + def _write_outline_item_on_page( + self, outline_item: Union[OutlineItem, Destination], page: _MergedPage ) -> None: - bm_type = cast(str, bookmark["/Type"]) - args = [NumberObject(page.id), NameObject(bm_type)] + oi_type = cast(str, outline_item["/Type"]) + args = [NumberObject(page.id), NameObject(oi_type)] fit2arg_keys: Dict[str, Tuple[str, ...]] = { TypFitArguments.FIT_H: (TypArguments.TOP,), TypFitArguments.FIT_BH: (TypArguments.TOP,), @@ -486,14 +493,16 @@ def _write_bookmark_on_page( TypArguments.TOP, ), } - for arg_key in fit2arg_keys.get(bm_type, tuple()): - if arg_key in bookmark and not isinstance(bookmark[arg_key], NullObject): - args.append(FloatObject(bookmark[arg_key])) + for arg_key in fit2arg_keys.get(oi_type, tuple()): + if arg_key in outline_item and not isinstance( + outline_item[arg_key], NullObject + ): + args.append(FloatObject(outline_item[arg_key])) else: args.append(FloatObject(0)) - del bookmark[arg_key] + del outline_item[arg_key] - bookmark[NameObject("/A")] = DictionaryObject( + 
outline_item[NameObject("/A")] = DictionaryObject( { NameObject(GoToActionArguments.S): NameObject("/GoTo"), NameObject(GoToActionArguments.D): ArrayObject(args), @@ -517,51 +526,101 @@ def _associate_dests_to_pages(self, pages: List[_MergedPage]) -> None: else: raise ValueError(f"Unresolved named destination '{nd['/Title']}'") - def _associate_bookmarks_to_pages( - self, pages: List[_MergedPage], bookmarks: Optional[Iterable[Bookmark]] = None + @deprecate_bookmark(bookmarks="outline") + def _associate_outline_items_to_pages( + self, pages: List[_MergedPage], outline: Optional[Iterable[OutlineItem]] = None ) -> None: - if bookmarks is None: - bookmarks = self.bookmarks # type: ignore # TODO: self.bookmarks can be None! - assert bookmarks is not None, "hint for mypy" - for b in bookmarks: - if isinstance(b, list): - self._associate_bookmarks_to_pages(pages, b) + if outline is None: + outline = self.outline # type: ignore # TODO: self.outline can be None! + assert outline is not None, "hint for mypy" + for outline_item in outline: + if isinstance(outline_item, list): + self._associate_outline_items_to_pages(pages, outline_item) continue pageno = None - bp = b["/Page"] + outline_item_page = outline_item["/Page"] - if isinstance(bp, NumberObject): + if isinstance(outline_item_page, NumberObject): continue for p in pages: - if bp.get_object() == p.pagedata.get_object(): + if outline_item_page.get_object() == p.pagedata.get_object(): pageno = p.id if pageno is not None: - b[NameObject("/Page")] = NumberObject(pageno) + outline_item[NameObject("/Page")] = NumberObject(pageno) - def find_bookmark( + @deprecate_bookmark(bookmark="outline_item") + def find_outline_item( self, - bookmark: Dict[str, Any], - root: Optional[OutlinesType] = None, + outline_item: Dict[str, Any], + root: Optional[OutlineType] = None, ) -> Optional[List[int]]: if root is None: - root = self.bookmarks + root = self.outline - for i, b in enumerate(root): - if isinstance(b, list): - # b is still an 
inner node - # (OutlinesType, if recursive types were supported by mypy) - res = self.find_bookmark(bookmark, b) # type: ignore + for i, oi_enum in enumerate(root): + if isinstance(oi_enum, list): + # oi_enum is still an inner node + # (OutlineType, if recursive types were supported by mypy) + res = self.find_outline_item(outline_item, oi_enum) # type: ignore if res: return [i] + res - elif b == bookmark or cast(Dict[Any, Any], b["/Title"]) == bookmark: + elif ( + oi_enum == outline_item + or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item + ): # we found a leaf node return [i] return None + @deprecate_bookmark(bookmark="outline_item") + def find_bookmark( + self, + outline_item: Dict[str, Any], + root: Optional[OutlineType] = None, + ) -> Optional[List[int]]: + """ + .. deprecated:: 2.9.0 + Use :meth:`find_outline_item` instead. + """ + + return self.find_outline_item(outline_item, root) + + def add_outline_item( + self, + title: str, + pagenum: int, + parent: Union[None, TreeObject, IndirectObject] = None, + color: Optional[Tuple[float, float, float]] = None, + bold: bool = False, + italic: bool = False, + fit: FitType = "/Fit", + *args: ZoomArgType, + ) -> IndirectObject: + """ + Add an outline item (commonly referred to as a "Bookmark") to this PDF file. + + :param str title: Title to use for this outline item. + :param int pagenum: Page number this outline item will point to. + :param parent: A reference to a parent outline item to create nested + outline items. + :param tuple color: Color of the outline item's font as a red, green, blue tuple + from 0.0 to 1.0 + :param bool bold: Outline item font is bold + :param bool italic: Outline item font is italic + :param str fit: The fit of the destination page. See + :meth:`add_link()` for details. 
+ """ + writer = self.output + if writer is None: + raise RuntimeError(ERR_CLOSED_WRITER) + return writer.add_outline_item( + title, pagenum, parent, color, bold, italic, fit, *args + ) + def addBookmark( self, title: str, @@ -575,10 +634,10 @@ def addBookmark( ) -> IndirectObject: # pragma: no cover """ .. deprecated:: 1.28.0 - Use :meth:`add_bookmark` instead. + Use :meth:`add_outline_item` instead. """ - deprecate_with_replacement("addBookmark", "add_bookmark") - return self.add_bookmark( + deprecate_with_replacement("addBookmark", "add_outline_item") + return self.add_outline_item( title, pagenum, parent, color, bold, italic, fit, *args ) @@ -594,23 +653,11 @@ def add_bookmark( *args: ZoomArgType, ) -> IndirectObject: """ - Add a bookmark to this PDF file. - - :param str title: Title to use for this bookmark. - :param int pagenum: Page number this bookmark will point to. - :param parent: A reference to a parent bookmark to create nested - bookmarks. - :param tuple color: Color of the bookmark as a red, green, blue tuple - from 0.0 to 1.0 - :param bool bold: Bookmark is bold - :param bool italic: Bookmark is italic - :param str fit: The fit of the destination page. See - :meth:`addLink()` for details. + .. deprecated:: 2.9.0 + Use :meth:`add_outline_item` instead. 
""" - writer = self.output - if writer is None: - raise RuntimeError(ERR_CLOSED_WRITER) - return writer.add_bookmark( + deprecate_with_replacement("addBookmark", "add_outline_item") + return self.add_outline_item( title, pagenum, parent, color, bold, italic, fit, *args ) diff --git a/PyPDF2/_reader.py b/PyPDF2/_reader.py index 98f1699d9..80125d9d0 100644 --- a/PyPDF2/_reader.py +++ b/PyPDF2/_reader.py @@ -89,7 +89,7 @@ TreeObject, read_object, ) -from .types import OutlinesType, PagemodeType +from .types import OutlineType, PagemodeType from .xmp import XmpInformation @@ -677,19 +677,30 @@ def getNamedDestinations( return self._get_named_destinations(tree, retval) @property - def outlines(self) -> OutlinesType: + def outline(self) -> OutlineType: """ - Read-only property for outlines present in the document. + Read-only property for the outline (i.e., a collection of 'outline items' + which are also known as 'bookmarks') present in the document. :return: a nested list of :class:`Destinations`. """ - return self._get_outlines() + return self._get_outline() - def _get_outlines( - self, node: Optional[DictionaryObject] = None, outlines: Optional[Any] = None - ) -> OutlinesType: - if outlines is None: - outlines = [] + @property + def outlines(self) -> OutlineType: + """ + .. deprecated:: 2.9.0 + + Use :py:attr:`outline` instead. 
+ """ + deprecate_with_replacement("outlines", "outline") + return self.outline + + def _get_outline( + self, node: Optional[DictionaryObject] = None, outline: Optional[Any] = None + ) -> OutlineType: + if outline is None: + outline = [] catalog = cast(DictionaryObject, self.trailer[TK.ROOT]) # get the outline dictionary and named destinations @@ -699,11 +710,11 @@ def _get_outlines( except PdfReadError: # this occurs if the /Outlines object reference is incorrect # for an example of such a file, see https://unglueit-files.s3.amazonaws.com/ebf/7552c42e9280b4476e59e77acc0bc812.pdf - # so continue to load the file without the Bookmarks - return outlines + # so continue to load the file without the Outlines + return outline if isinstance(lines, NullObject): - return outlines + return outline # TABLE 8.3 Entries in the outline dictionary if lines is not None and "/First" in lines: @@ -711,37 +722,37 @@ def _get_outlines( self._namedDests = self._get_named_destinations() if node is None: - return outlines + return outline - # see if there are any more outlines + # see if there are any more outline items while True: - outline = self._build_outline(node) - if outline: - outlines.append(outline) + outline_obj = self._build_outline_item(node) + if outline_obj: + outline.append(outline_obj) - # check for sub-outlines + # check for sub-outline if "/First" in node: - sub_outlines: List[Any] = [] - self._get_outlines(cast(DictionaryObject, node["/First"]), sub_outlines) - if sub_outlines: - outlines.append(sub_outlines) + sub_outline: List[Any] = [] + self._get_outline(cast(DictionaryObject, node["/First"]), sub_outline) + if sub_outline: + outline.append(sub_outline) if "/Next" not in node: break node = cast(DictionaryObject, node["/Next"]) - return outlines + return outline def getOutlines( - self, node: Optional[DictionaryObject] = None, outlines: Optional[Any] = None - ) -> OutlinesType: # pragma: no cover + self, node: Optional[DictionaryObject] = None, outline: 
Optional[Any] = None + ) -> OutlineType: # pragma: no cover """ .. deprecated:: 1.28.0 - Use :py:attr:`outlines` instead. + Use :py:attr:`outline` instead. """ - deprecate_with_replacement("getOutlines", "outlines") - return self._get_outlines(node, outlines) + deprecate_with_replacement("getOutlines", "outline") + return self._get_outline(node, outline) def _get_page_number_by_indirect( self, indirect_ref: Union[None, int, NullObject, IndirectObject] @@ -809,7 +820,7 @@ def _build_destination( array: List[Union[NumberObject, IndirectObject, NullObject, DictionaryObject]], ) -> Destination: page, typ = None, None - # handle outlines with missing or invalid destination + # handle outline items with missing or invalid destination if ( isinstance(array, (type(None), NullObject)) or (isinstance(array, ArrayObject) and len(array) == 0) @@ -835,8 +846,8 @@ def _build_destination( title, indirect_ref, TextStringObject("/Fit") # type: ignore ) - def _build_outline(self, node: DictionaryObject) -> Optional[Destination]: - dest, title, outline = None, None, None + def _build_outline_item(self, node: DictionaryObject) -> Optional[Destination]: + dest, title, outline_item = None, None, None # title required for valid outline # PDF Reference 1.7: TABLE 8.4 Entries in an outline item dictionary @@ -861,40 +872,40 @@ def _build_outline(self, node: DictionaryObject) -> Optional[Destination]: dest = dest["/D"] if isinstance(dest, ArrayObject): - outline = self._build_destination(title, dest) # type: ignore + outline_item = self._build_destination(title, dest) # type: ignore elif isinstance(dest, str): # named destination, addresses NameObject Issue #193 try: - outline = self._build_destination( + outline_item = self._build_destination( title, self._namedDests[dest].dest_array ) except KeyError: # named destination not found in Name Dict - outline = self._build_destination(title, None) + outline_item = self._build_destination(title, None) elif isinstance(dest, type(None)): - # 
outline not required to have destination or action + # outline item not required to have destination or action # PDFv1.7 Table 153 - outline = self._build_destination(title, dest) # type: ignore + outline_item = self._build_destination(title, dest) # type: ignore else: if self.strict: raise PdfReadError(f"Unexpected destination {dest!r}") - outline = self._build_destination(title, None) # type: ignore + outline_item = self._build_destination(title, None) # type: ignore - # if outline created, add color, format, and child count if present - if outline: + # if outline item created, add color, format, and child count if present + if outline_item: if "/C" in node: - # Color of outline in (R, G, B) with values ranging 0.0-1.0 - outline[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore + # Color of outline item font in (R, G, B) with values ranging 0.0-1.0 + outline_item[NameObject("/C")] = ArrayObject(FloatObject(c) for c in node["/C"]) # type: ignore if "/F" in node: # specifies style characteristics bold and/or italic # 1=italic, 2=bold, 3=both - outline[NameObject("/F")] = node["/F"] + outline_item[NameObject("/F")] = node["/F"] if "/Count" in node: # absolute value = num. 
visible children # positive = open/unfolded, negative = closed/folded - outline[NameObject("/Count")] = node["/Count"] + outline_item[NameObject("/Count")] = node["/Count"] - return outline + return outline_item @property def pages(self) -> _VirtualList: @@ -961,9 +972,9 @@ def page_mode(self) -> Optional[PagemodeType]: :widths: 50 200 * - /UseNone - - Do not show outlines or thumbnails panels + - Do not show outline or thumbnails panels * - /UseOutlines - - Show outlines (aka bookmarks) panel + - Show outline (aka bookmarks) panel * - /UseThumbs - Show page thumbnails panel * - /FullScreen diff --git a/PyPDF2/_utils.py b/PyPDF2/_utils.py index 78dc0f9f0..6d80832e0 100644 --- a/PyPDF2/_utils.py +++ b/PyPDF2/_utils.py @@ -29,6 +29,7 @@ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" +import functools import logging import warnings from codecs import getencoder @@ -40,7 +41,7 @@ FileIO, ) from os import SEEK_CUR -from typing import Dict, Optional, Pattern, Tuple, Union, overload +from typing import Any, Callable, Dict, Optional, Pattern, Tuple, Union, overload try: # Python 3.10+: https://www.python.org/dev/peps/pep-0484/ @@ -362,3 +363,44 @@ def logger_warning(msg: str, src: str) -> None: to strict=False mode. """ logging.getLogger(src).warning(msg) + + +def deprecate_bookmark(**aliases: str) -> Callable: + """ + Decorator for deprecated term "bookmark" + To be used for methods and function arguments + outline_item = a bookmark + outline = a collection of outline items + """ + + def decoration(func: Callable): # type: ignore + @functools.wraps(func) + def wrapper(*args, **kwargs): # type: ignore + rename_kwargs(func.__name__, kwargs, aliases) + return func(*args, **kwargs) + + return wrapper + + return decoration + + +def rename_kwargs( # type: ignore + func_name: str, kwargs: Dict[str, Any], aliases: Dict[str, str] +): + """ + Helper function to deprecate arguments. 
+ """ + + for old_term, new_term in aliases.items(): + if old_term in kwargs: + if new_term in kwargs: + raise TypeError( + f"{func_name} received both {old_term} and {new_term} as an argument." + f"{old_term} is deprecated. Use {new_term} instead." + ) + kwargs[new_term] = kwargs.pop(old_term) + warnings.warn( + message=( + f"{old_term} is deprecated as an argument. Use {new_term} instead" + ) + ) diff --git a/PyPDF2/_writer.py b/PyPDF2/_writer.py index a3b8ed645..579812554 100644 --- a/PyPDF2/_writer.py +++ b/PyPDF2/_writer.py @@ -59,6 +59,7 @@ _get_max_pdf_version_header, b_, deprecate_with_replacement, + deprecate_bookmark, ) from .constants import AnnotationDictionaryAttributes from .constants import CatalogAttributes as CA @@ -82,6 +83,7 @@ BooleanObject, ByteStringObject, ContentStream, + _create_outline_item, DecodedStreamObject, Destination, DictionaryObject, @@ -95,15 +97,14 @@ StreamObject, TextStringObject, TreeObject, - _create_bookmark, create_string_object, ) from .types import ( - BookmarkTypes, BorderArrayType, FitType, LayoutType, PagemodeType, + OutlineItemType, ZoomArgsType, ZoomArgType, ) @@ -1073,7 +1074,7 @@ def getNamedDestRoot(self) -> ArrayObject: # pragma: no cover deprecate_with_replacement("getNamedDestRoot", "get_named_dest_root") return self.get_named_dest_root() - def add_bookmark_destination( + def add_outline_item_destination( self, dest: Union[PageObject, TreeObject], parent: Union[None, TreeObject, IndirectObject] = None, @@ -1087,47 +1088,78 @@ def add_bookmark_destination( return dest_ref + def add_bookmark_destination( + self, + dest: Union[PageObject, TreeObject], + parent: Union[None, TreeObject, IndirectObject] = None, + ) -> IndirectObject: + """ + .. deprecated:: 2.9.0 + + Use :meth:`add_outline_item_destination` instead. 
+ """ + deprecate_with_replacement( + "add_bookmark_destination", "add_outline_item_destination" + ) + return self.add_outline_item_destination(dest, parent) + def addBookmarkDestination( self, dest: PageObject, parent: Optional[TreeObject] = None ) -> IndirectObject: # pragma: no cover """ .. deprecated:: 1.28.0 - Use :meth:`add_bookmark_destination` instead. + Use :meth:`add_outline_item_destination` instead. """ - deprecate_with_replacement("addBookmarkDestination", "add_bookmark_destination") - return self.add_bookmark_destination(dest, parent) + deprecate_with_replacement( + "addBookmarkDestination", "add_outline_item_destination" + ) + return self.add_outline_item_destination(dest, parent) - def add_bookmark_dict( - self, bookmark: BookmarkTypes, parent: Optional[TreeObject] = None + @deprecate_bookmark(bookmark="outline_item") + def add_outline_item_dict( + self, outline_item: OutlineItemType, parent: Optional[TreeObject] = None ) -> IndirectObject: - bookmark_obj = TreeObject() - for k, v in list(bookmark.items()): - bookmark_obj[NameObject(str(k))] = v - bookmark_obj.update(bookmark) + outline_item_object = TreeObject() + for k, v in list(outline_item.items()): + outline_item_object[NameObject(str(k))] = v + outline_item_object.update(outline_item) - if "/A" in bookmark: + if "/A" in outline_item: action = DictionaryObject() - a_dict = cast(DictionaryObject, bookmark["/A"]) + a_dict = cast(DictionaryObject, outline_item["/A"]) for k, v in list(a_dict.items()): action[NameObject(str(k))] = v action_ref = self._add_object(action) - bookmark_obj[NameObject("/A")] = action_ref + outline_item_object[NameObject("/A")] = action_ref + + return self.add_outline_item_destination(outline_item_object, parent) - return self.add_bookmark_destination(bookmark_obj, parent) + @deprecate_bookmark(bookmark="outline_item") + def add_bookmark_dict( + self, outline_item: OutlineItemType, parent: Optional[TreeObject] = None + ) -> IndirectObject: + """ + .. 
deprecated:: 2.9.0 + + Use :meth:`add_outline_item_dict` instead. + """ + deprecate_with_replacement("add_bookmark_dict", "add_outline_item_dict") + return self.add_outline_item_dict(outline_item, parent) + @deprecate_bookmark(bookmark="outline_item") def addBookmarkDict( - self, bookmark: BookmarkTypes, parent: Optional[TreeObject] = None + self, outline_item: OutlineItemType, parent: Optional[TreeObject] = None ) -> IndirectObject: # pragma: no cover """ .. deprecated:: 1.28.0 - Use :meth:`add_bookmark_dict` instead. + Use :meth:`add_outline_item_dict` instead. """ - deprecate_with_replacement("addBookmarkDict", "add_bookmark_dict") - return self.add_bookmark_dict(bookmark, parent) + deprecate_with_replacement("addBookmarkDict", "add_outline_item_dict") + return self.add_outline_item_dict(outline_item, parent) - def add_bookmark( + def add_outline_item( self, title: str, pagenum: int, @@ -1139,25 +1171,28 @@ def add_bookmark( *args: ZoomArgType, ) -> IndirectObject: """ - Add a bookmark to this PDF file. + Add an outline item (commonly referred to as a "Bookmark") to this PDF file. - :param str title: Title to use for this bookmark. - :param int pagenum: Page number this bookmark will point to. - :param parent: A reference to a parent bookmark to create nested - bookmarks. - :param tuple color: Color of the bookmark as a red, green, blue tuple + :param str title: Title to use for this outline item. + :param int pagenum: Page number this outline item will point to. + :param parent: A reference to a parent outline item to create nested + outline items. + :param tuple color: Color of the outline item's font as a red, green, blue tuple from 0.0 to 1.0 - :param bool bold: Bookmark is bold - :param bool italic: Bookmark is italic + :param bool bold: Outline item font is bold + :param bool italic: Outline item font is italic :param str fit: The fit of the destination page. See - :meth:`addLink()` for details. + :meth:`add_link()` for details. 
""" page_ref = NumberObject(pagenum) zoom_args: ZoomArgsType = [ NullObject() if a is None else NumberObject(a) for a in args ] dest = Destination( - NameObject("/" + title + " bookmark"), page_ref, NameObject(fit), *zoom_args + NameObject("/" + title + " outline item"), + page_ref, + NameObject(fit), + *zoom_args, ) action_ref = self._add_object( @@ -1168,11 +1203,32 @@ def add_bookmark( } ) ) - bookmark = _create_bookmark(action_ref, title, color, italic, bold) + outline_item = _create_outline_item(action_ref, title, color, italic, bold) if parent is None: parent = self.get_outline_root() - return self.add_bookmark_destination(bookmark, parent) + return self.add_outline_item_destination(outline_item, parent) + + def add_bookmark( + self, + title: str, + pagenum: int, + parent: Union[None, TreeObject, IndirectObject] = None, + color: Optional[Tuple[float, float, float]] = None, + bold: bool = False, + italic: bool = False, + fit: FitType = "/Fit", + *args: ZoomArgType, + ) -> IndirectObject: + """ + .. deprecated:: 2.9.0 + + Use :meth:`add_outline_item` instead. + """ + deprecate_with_replacement("add_bookmark", "add_outline_item") + return self.add_outline_item( + title, pagenum, parent, color, bold, italic, fit, *args + ) def addBookmark( self, @@ -1188,13 +1244,18 @@ def addBookmark( """ .. deprecated:: 1.28.0 - Use :meth:`add_bookmark` instead. + Use :meth:`add_outline_item` instead. """ - deprecate_with_replacement("addBookmark", "add_bookmark") - return self.add_bookmark( + deprecate_with_replacement("addBookmark", "add_outline_item") + return self.add_outline_item( title, pagenum, parent, color, bold, italic, fit, *args ) + def add_outline(self) -> None: + raise NotImplementedError( + "This method is not yet implemented. Use :meth:`add_outline_item` instead." 
+ ) + def add_named_destination_object(self, dest: PdfObject) -> IndirectObject: dest_ref = self._add_object(dest) @@ -1764,9 +1825,9 @@ def page_mode(self) -> Optional[PagemodeType]: :widths: 50 200 * - /UseNone - - Do not show outlines or thumbnails panels + - Do not show outline or thumbnails panels * - /UseOutlines - - Show outlines (aka bookmarks) panel + - Show outline (aka bookmarks) panel * - /UseThumbs - Show page thumbnails panel * - /FullScreen diff --git a/PyPDF2/generic.py b/PyPDF2/generic.py index 350bf8c16..bfdeff6ef 100644 --- a/PyPDF2/generic.py +++ b/PyPDF2/generic.py @@ -1933,7 +1933,7 @@ def outline_count(self) -> Optional[int]: return self.get("/Count", None) -class Bookmark(Destination): +class OutlineItem(Destination): def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] ) -> None: @@ -1957,6 +1957,12 @@ def write_to_stream( stream.write(b">>") +class Bookmark(OutlineItem): # pragma: no cover + def __init__(self, *args: Any, **kwargs: Any) -> None: + deprecate_with_replacement("Bookmark", "OutlineItem") + super().__init__(*args, **kwargs) + + def createStringObject( string: Union[str, bytes], forced_encoding: Union[None, str, List[str], Dict[int, str]] = None, @@ -2011,16 +2017,16 @@ def create_string_object( raise TypeError("create_string_object should have str or unicode arg") -def _create_bookmark( +def _create_outline_item( action_ref: IndirectObject, title: str, color: Optional[Tuple[float, float, float]], italic: bool, bold: bool, ) -> TreeObject: - bookmark = TreeObject() + outline_item = TreeObject() - bookmark.update( + outline_item.update( { NameObject("/A"): action_ref, NameObject("/Title"): create_string_object(title), @@ -2028,7 +2034,7 @@ def _create_bookmark( ) if color is not None: - bookmark.update( + outline_item.update( {NameObject("/C"): ArrayObject([FloatObject(c) for c in color])} ) @@ -2038,8 +2044,8 @@ def _create_bookmark( if bold: format_flag += 2 if format_flag: - 
bookmark.update({NameObject("/F"): NumberObject(format_flag)}) - return bookmark + outline_item.update({NameObject("/F"): NumberObject(format_flag)}) + return outline_item def encode_pdfdocencoding(unicode_string: str) -> bytes: diff --git a/PyPDF2/types.py b/PyPDF2/types.py index cb5358647..f17e5aa21 100644 --- a/PyPDF2/types.py +++ b/PyPDF2/types.py @@ -16,15 +16,17 @@ from .generic import ( ArrayObject, - Bookmark, Destination, NameObject, NullObject, NumberObject, + OutlineItem, ) BorderArrayType: TypeAlias = List[Union[NameObject, NumberObject, ArrayObject]] -BookmarkTypes: TypeAlias = Union[Bookmark, Destination] +OutlineItemType: TypeAlias = Union[OutlineItem, Destination] +# BookmarkTypes is deprecated. Use OutlineItemType instead +BookmarkTypes: TypeAlias = OutlineItemType # TODO: remove in version 3.0.0 FitType: TypeAlias = Literal[ "/Fit", "/XYZ", "/FitH", "/FitV", "/FitR", "/FitB", "/FitBH", "/FitBV" ] @@ -36,8 +38,9 @@ # OutlinesType = List[Union[Destination, "OutlinesType"]] # See https://github.com/python/mypy/issues/731 # Hence use this for the moment: -OutlinesType = List[Union[Destination, List[Union[Destination, List[Destination]]]]] - +OutlineType = List[Union[Destination, List[Union[Destination, List[Destination]]]]] +# OutlinesType is deprecated. Use OutlineType instead +OutlinesType: TypeAlias = OutlineType # TODO: remove in version 3.0.0 LayoutType: TypeAlias = Literal[ "/NoLayout", diff --git a/docs/user/extract-text.md b/docs/user/extract-text.md index 8415ee31c..30eaaf2a8 100644 --- a/docs/user/extract-text.md +++ b/docs/user/extract-text.md @@ -85,11 +85,11 @@ PyPDF2 might make mistakes parsing that. Hence I would distinguish three types of PDF documents: * **Digitally-born PDF files**: The file was created digitally on the computer. - It can contain images, texts, links, bookmarks, JavaScript, ... + It can contain images, texts, links, outline items (a.k.a., bookmarks), JavaScript, ... If you Zoom in a lot, the text still looks sharp. 
* **Scanned PDF files**: Any number of pages was scanned. The images were then stored in a PDF file. Hence the file is just a container for those images. - You cannot copy the text, you don't have links, bookmarks, JavaScript. + You cannot copy the text, you don't have links, outline items, JavaScript. * **OCRed PDF files**: The scanner ran OCR software and put the recognized text in the background of the image. Hence you can copy the text, but it still looks like a scan. If you zoom in enough, you can recognize pixels. diff --git a/tests/bench.py b/tests/bench.py index d8f526ed9..b2856b909 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -71,14 +71,14 @@ def merge(): file_merger.append(reader) # PdfReader object: - file_merger.append(PyPDF2.PdfReader(pdf_path, "rb"), bookmark=True) + file_merger.append(PyPDF2.PdfReader(pdf_path, "rb"), outline_item=True) # File handle with open(pdf_path, "rb") as fh: file_merger.append(fh) - bookmark = file_merger.add_bookmark("A bookmark", 0) - file_merger.add_bookmark("deeper", 0, parent=bookmark) + outline_item = file_merger.add_outline_item("An outline item", 0) + file_merger.add_outline_item("deeper", 0, parent=outline_item) file_merger.add_metadata({"author": "Martin Thoma"}) file_merger.add_named_destination("title", 0) file_merger.set_page_layout("/SinglePage") @@ -88,12 +88,12 @@ def merge(): file_merger.write(tmp_path) file_merger.close() - # Check if bookmarks are correct + # Check if outline is correct reader = PyPDF2.PdfReader(tmp_path) assert [ - el.title for el in reader._get_outlines() if isinstance(el, Destination) + el.title for el in reader._get_outline() if isinstance(el, Destination) ] == [ - "A bookmark", + "An outline item", "Foo", "Bar", "Baz", diff --git a/tests/test_generic.py b/tests/test_generic.py index 1014c5d7c..234a4ee88 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -9,7 +9,6 @@ from PyPDF2.generic import ( AnnotationBuilder, ArrayObject, - Bookmark, BooleanObject, 
ByteStringObject, CheckboxRadioButtonAttributes, @@ -20,6 +19,7 @@ NameObject, NullObject, NumberObject, + OutlineItem, RectangleObject, TextStringObject, TreeObject, @@ -197,12 +197,12 @@ def test_destination_exception(): ) -def test_bookmark_write_to_stream(): +def test_outline_item_write_to_stream(): stream = BytesIO() - bm = Bookmark( + oi = OutlineItem( NameObject("title"), NullObject(), NameObject(TF.FIT_V), FloatObject(0) ) - bm.write_to_stream(stream, None) + oi.write_to_stream(stream, None) stream.seek(0, 0) assert stream.read() == b"<<\n/Title title\n/Dest [ null /FitV 0 ]\n>>" @@ -400,7 +400,7 @@ def test_remove_child_in_tree(): reader = PdfReader(pdf) writer = PdfWriter() writer.add_page(reader.pages[0]) - writer.add_bookmark("foo", pagenum=0) + writer.add_outline_item("foo", pagenum=0) obj = writer._objects[-1] tree.add_child(obj, writer) tree.remove_child(obj) diff --git a/tests/test_merger.py b/tests/test_merger.py index 2af7a0251..67d3f655e 100644 --- a/tests/test_merger.py +++ b/tests/test_merger.py @@ -30,7 +30,7 @@ def test_merge(): merger.append(outline) merger.append(pdf_path, pages=PyPDF2.pagerange.PageRange(slice(0, 0))) merger.append(pdf_forms) - merger.merge(0, pdf_path, import_bookmarks=False) + merger.merge(0, pdf_path, import_outline=False) # Merging an encrypted file reader = PyPDF2.PdfReader(pdf_pw) @@ -38,40 +38,46 @@ def test_merge(): merger.append(reader) # PdfReader object: - merger.append(PyPDF2.PdfReader(pdf_path), bookmark="foo") + merger.append(PyPDF2.PdfReader(pdf_path), outline_item="foo") # File handle with open(pdf_path, "rb") as fh: merger.append(fh) - bookmark = merger.add_bookmark("A bookmark", 0) - bm2 = merger.add_bookmark("deeper", 0, parent=bookmark, italic=True, bold=True) - merger.add_bookmark("Let's see", 2, bm2, (255, 255, 0), True, True, "/FitBV", 12) - merger.add_bookmark( - "The XYZ fit", 0, bookmark, (255, 0, 15), True, True, "/XYZ", 10, 20, 3 + outline_item = merger.add_outline_item("An outline item", 0) + 
oi2 = merger.add_outline_item( + "deeper", 0, parent=outline_item, italic=True, bold=True ) - merger.add_bookmark( - "The FitH fit", 0, bookmark, (255, 0, 15), True, True, "/FitH", 10 + merger.add_outline_item( + "Let's see", 2, oi2, (255, 255, 0), True, True, "/FitBV", 12 ) - merger.add_bookmark( - "The FitV fit", 0, bookmark, (255, 0, 15), True, True, "/FitV", 10 + merger.add_outline_item( + "The XYZ fit", 0, outline_item, (255, 0, 15), True, True, "/XYZ", 10, 20, 3 ) - merger.add_bookmark( - "The FitR fit", 0, bookmark, (255, 0, 15), True, True, "/FitR", 10, 20, 30, 40 + merger.add_outline_item( + "The FitH fit", 0, outline_item, (255, 0, 15), True, True, "/FitH", 10 ) - merger.add_bookmark("The FitB fit", 0, bookmark, (255, 0, 15), True, True, "/FitB") - merger.add_bookmark( - "The FitBH fit", 0, bookmark, (255, 0, 15), True, True, "/FitBH", 10 + merger.add_outline_item( + "The FitV fit", 0, outline_item, (255, 0, 15), True, True, "/FitV", 10 ) - merger.add_bookmark( - "The FitBV fit", 0, bookmark, (255, 0, 15), True, True, "/FitBV", 10 + merger.add_outline_item( + "The FitR fit", 0, outline_item, (255, 0, 15), True, True, "/FitR", 10, 20, 30, 40, + ) + merger.add_outline_item( + "The FitB fit", 0, outline_item, (255, 0, 15), True, True, "/FitB" + ) + merger.add_outline_item( + "The FitBH fit", 0, outline_item, (255, 0, 15), True, True, "/FitBH", 10 + ) + merger.add_outline_item( + "The FitBV fit", 0, outline_item, (255, 0, 15), True, True, "/FitBV", 10 ) - found_bm = merger.find_bookmark("nothing here") - assert found_bm is None + found_oi = merger.find_outline_item("nothing here") + assert found_oi is None - found_bm = merger.find_bookmark("foo") - assert found_bm == [9] + found_oi = merger.find_outline_item("foo") + assert found_oi == [9] merger.add_metadata({"author": "Martin Thoma"}) merger.add_named_destination("title", 0) @@ -82,12 +88,10 @@ def test_merge(): merger.write(tmp_path) merger.close() - # Check if bookmarks are correct + # Check if outline is 
correct reader = PyPDF2.PdfReader(tmp_path) - assert [ - el.title for el in reader._get_outlines() if isinstance(el, Destination) - ] == [ - "A bookmark", + assert [el.title for el in reader.outline if isinstance(el, Destination)] == [ + "An outline item", "Foo", "Bar", "Baz", @@ -147,11 +151,11 @@ def test_merge_write_closed_fh(): assert exc.value.args[0] == err_closed with pytest.raises(RuntimeError) as exc: - merger._write_bookmarks() + merger._write_outline() assert exc.value.args[0] == err_closed with pytest.raises(RuntimeError) as exc: - merger.add_bookmark("A bookmark", 0) + merger.add_outline_item("An outline item", 0) assert exc.value.args[0] == err_closed with pytest.raises(RuntimeError) as exc: @@ -195,7 +199,7 @@ def test_zoom_xyz_no_left(): os.remove("tmp-merger-do-not-commit.pdf") -def test_bookmark(): +def test_outline_item(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/997/997511.pdf" name = "tika-997511.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -296,3 +300,26 @@ def test_iss1145(): name = "iss1145.pdf" merger = PdfMerger() merger.append(PdfReader(BytesIO(get_pdf_from_url(url, name=name)))) + + +def test_deprecate_bookmark_decorator_warning(): + reader = PdfReader( + os.path.join(RESOURCE_ROOT, "outlines-with-invalid-destinations.pdf") + ) + merger = PdfMerger() + with pytest.warns( + UserWarning, + match="import_bookmarks is deprecated as an argument. 
Use import_outline instead", + ): + merger.merge(0, reader, import_bookmarks=True) + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_deprecate_bookmark_decorator_output(): + reader = PdfReader( + os.path.join(RESOURCE_ROOT, "outlines-with-invalid-destinations.pdf") + ) + merger = PdfMerger() + merger.merge(0, reader, import_bookmarks=True) + first_oi_title = 'Valid Destination: Action /GoTo Named Destination "section.1"' + assert merger.outline[0].title == first_oi_title diff --git a/tests/test_reader.py b/tests/test_reader.py index 419b1558c..84260a490 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -140,10 +140,10 @@ def test_get_attachments(src): (os.path.join(RESOURCE_ROOT, "crazyones.pdf"), 0), ], ) -def test_get_outlines(src, outline_elements): +def test_get_outline(src, outline_elements): reader = PdfReader(src) - outlines = reader._get_outlines() - assert len(outlines) == outline_elements + outline = reader.outline + assert len(outline) == outline_elements @pytest.mark.parametrize( @@ -524,10 +524,10 @@ def test_read_encrypted_without_decryption(): def test_get_destination_page_number(): src = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf") reader = PdfReader(src) - outlines = reader._get_outlines() - for outline in outlines: - if not isinstance(outline, list): - reader.get_destination_page_number(outline) + outline = reader.outline + for outline_item in outline: + if not isinstance(outline_item, list): + reader.get_destination_page_number(outline_item) def test_do_not_get_stuck_on_large_files_without_start_xref(): @@ -560,7 +560,7 @@ def test_decrypt_when_no_id(): def test_reader_properties(): reader = PdfReader(os.path.join(RESOURCE_ROOT, "crazyones.pdf")) - assert reader.outlines == [] + assert reader.outline == [] assert len(reader.pages) == 1 assert reader.page_layout is None assert reader.page_mode is None @@ -575,18 +575,18 @@ def test_issue604(strict): """Test with invalid destinations""" # todo with 
open(os.path.join(RESOURCE_ROOT, "issue-604.pdf"), "rb") as f: pdf = None - bookmarks = None + outline = None if strict: pdf = PdfReader(f, strict=strict) with pytest.raises(PdfReadError) as exc, pytest.warns(PdfReadWarning): - bookmarks = pdf._get_outlines() + outline = pdf.outline if "Unknown Destination" not in exc.value.args[0]: raise Exception("Expected exception not raised") - return # bookmarks not correct + return # outline is not correct else: pdf = PdfReader(f, strict=strict) with pytest.warns(PdfReadWarning): - bookmarks = pdf._get_outlines() + outline = pdf.outline def get_dest_pages(x): if isinstance(x, list): @@ -596,10 +596,8 @@ def get_dest_pages(x): return pdf.get_destination_page_number(x) + 1 out = [] - for ( - b - ) in bookmarks: # b can be destination or a list:preferred to just print them - out.append(get_dest_pages(b)) + for oi in outline: # oi can be destination or a list:preferred to just print them + out.append(get_dest_pages(oi)) def test_decode_permissions(): @@ -853,33 +851,33 @@ def test_outline_color(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - assert reader.outlines[0].color == [0, 0, 1] + assert reader.outline[0].color == [0, 0, 1] def test_outline_font_format(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - assert reader.outlines[0].font_format == 2 + assert reader.outline[0].font_format == 2 -def get_outlines_property(outlines, attribute_name: str): +def get_outline_property(outline, attribute_name: str): results = [] - if isinstance(outlines, list): - for outline in outlines: - if isinstance(outline, Destination): - results.append(getattr(outline, attribute_name)) + if isinstance(outline, list): + for outline_item in outline: + if isinstance(outline_item, Destination): + 
results.append(getattr(outline_item, attribute_name)) else: - results.append(get_outlines_property(outline, attribute_name)) + results.append(get_outline_property(outline_item, attribute_name)) else: - raise ValueError(f"got {type(outlines)}") + raise ValueError(f"got {type(outline)}") return results def test_outline_title_issue_1121(): reader = PdfReader(EXTERNAL_ROOT / "014-outlines/mistitled_outlines_example.pdf") - assert get_outlines_property(reader.outlines, "title") == [ + assert get_outline_property(reader.outline, "title") == [ "First", [ "Second", @@ -925,7 +923,7 @@ def test_outline_title_issue_1121(): def test_outline_count(): reader = PdfReader(EXTERNAL_ROOT / "014-outlines/mistitled_outlines_example.pdf") - assert get_outlines_property(reader.outlines, "outline_count") == [ + assert get_outline_property(reader.outline, "outline_count") == [ 5, [ None, @@ -973,7 +971,7 @@ def test_outline_missing_title(): os.path.join(RESOURCE_ROOT, "outline-without-title.pdf"), strict=True ) with pytest.raises(PdfReadError) as exc: - reader.outlines + reader.outline assert exc.value.args[0].startswith("Outline Entry Missing /Title attribute:") @@ -997,24 +995,24 @@ def test_outline_with_missing_named_destination(): name = "tika-913678.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) # outline items in document reference a named destination that is not defined - assert reader.outlines[1][0].title.startswith("Report for 2002AZ3B: Microbial") + assert reader.outline[1][0].title.startswith("Report for 2002AZ3B: Microbial") def test_outline_with_empty_action(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/924/924546.pdf" name = "tika-924546.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - # outline (entitled Tables and Figures) utilize an empty action (/A) + # outline items (entitled Tables and Figures) utilize an empty action (/A) # that has no type or destination - assert reader.outlines[-4].title == "Tables" + assert 
reader.outline[-4].title == "Tables" -def test_outlines_with_invalid_destinations(): +def test_outline_with_invalid_destinations(): reader = PdfReader( os.path.join(RESOURCE_ROOT, "outlines-with-invalid-destinations.pdf") ) - # contains 9 outlines, 6 with invalid destinations caused by different malformations - assert len(reader.outlines) == 9 + # contains 9 outline items, 6 with invalid destinations caused by different malformations + assert len(reader.outline) == 9 def test_PdfReaderMultipleDefinitions(): diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 9a9571dd7..868018a40 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -548,7 +548,7 @@ def test_image_extraction2(url, name): def test_get_outline(url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) - reader.outlines + reader.outline @pytest.mark.parametrize( diff --git a/tests/test_writer.py b/tests/test_writer.py index 96a186cd1..67b4188de 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -44,23 +44,33 @@ def test_writer_operations(): assert exc.value.args == () writer.insert_page(page, 1) writer.insert_page(reader_outline.pages[0], 0) - writer.add_bookmark_destination(page) + writer.add_outline_item_destination(page) writer.remove_links() - writer.add_bookmark_destination(page) - bm = writer.add_bookmark( - "A bookmark", 0, None, (255, 0, 15), True, True, "/FitBV", 10 + writer.add_outline_item_destination(page) + oi = writer.add_outline_item( + "An outline item", 0, None, (255, 0, 15), True, True, "/FitBV", 10 ) - writer.add_bookmark( - "The XYZ fit", 0, bm, (255, 0, 15), True, True, "/XYZ", 10, 20, 3 + writer.add_outline_item( + "The XYZ fit", 0, oi, (255, 0, 15), True, True, "/XYZ", 10, 20, 3 ) - writer.add_bookmark("The FitH fit", 0, bm, (255, 0, 15), True, True, "/FitH", 10) - writer.add_bookmark("The FitV fit", 0, bm, (255, 0, 15), True, True, "/FitV", 10) - writer.add_bookmark( - "The FitR fit", 0, bm, (255, 0, 15), 
True, True, "/FitR", 10, 20, 30, 40 + writer.add_outline_item( + "The FitH fit", 0, oi, (255, 0, 15), True, True, "/FitH", 10 + ) + writer.add_outline_item( + "The FitV fit", 0, oi, (255, 0, 15), True, True, "/FitV", 10 + ) + writer.add_outline_item( + "The FitR fit", 0, oi, (255, 0, 15), True, True, "/FitR", 10, 20, 30, 40 + ) + writer.add_outline_item( + "The FitB fit", 0, oi, (255, 0, 15), True, True, "/FitB" + ) + writer.add_outline_item( + "The FitBH fit", 0, oi, (255, 0, 15), True, True, "/FitBH", 10 + ) + writer.add_outline_item( + "The FitBV fit", 0, oi, (255, 0, 15), True, True, "/FitBV", 10 ) - writer.add_bookmark("The FitB fit", 0, bm, (255, 0, 15), True, True, "/FitB") - writer.add_bookmark("The FitBH fit", 0, bm, (255, 0, 15), True, True, "/FitBH", 10) - writer.add_bookmark("The FitBV fit", 0, bm, (255, 0, 15), True, True, "/FitBV", 10) writer.add_blank_page() writer.add_uri(2, "https://example.com", RectangleObject([0, 0, 100, 100])) writer.add_link(2, 1, RectangleObject([0, 0, 100, 100])) @@ -305,20 +315,22 @@ def test_encrypt(use_128bit): os.remove(tmp_filename) -def test_add_bookmark(): +def test_add_outline_item(): reader = PdfReader(os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")) writer = PdfWriter() for page in reader.pages: writer.add_page(page) - bookmark = writer.add_bookmark( - "A bookmark", 1, None, (255, 0, 15), True, True, "/Fit", 200, 0, None + outline_item = writer.add_outline_item( + "An outline item", 1, None, (255, 0, 15), True, True, "/Fit", 200, 0, None + ) + writer.add_outline_item( + "Another", 2, outline_item, None, False, False, "/Fit", 0, 0, None ) - writer.add_bookmark("Another", 2, bookmark, None, False, False, "/Fit", 0, 0, None) # write "output" to PyPDF2-output.pdf - tmp_filename = "dont_commit_bookmark.pdf" + tmp_filename = "dont_commit_outline_item.pdf" with open(tmp_filename, "wb") as output_stream: writer.write(output_stream) @@ -494,7 +506,7 @@ def test_sweep_indirect_references_nullobject_exception(): 
os.remove("tmp-merger-do-not-commit.pdf") -def test_write_bookmark_on_page_fitv(): +def test_write_outline_item_on_page_fitv(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/922/922840.pdf" name = "tika-922840.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) @@ -602,3 +614,18 @@ def test_add_single_annotation(): # Cleanup os.remove(target) # remove for testing + + +def test_deprecate_bookmark_decorator(): + reader = PdfReader( + os.path.join(RESOURCE_ROOT, "outlines-with-invalid-destinations.pdf") + ) + page = reader.pages[0] + outline_item = reader.outline[0] + writer = PdfWriter() + writer.add_page(page) + with pytest.warns( + UserWarning, + match="bookmark is deprecated as an argument. Use outline_item instead", + ): + writer.add_outline_item_dict(bookmark=outline_item)