Introduce start_page argument to partitioning functions that assign…

… `element.metadata.page_number` (#2884). This small change will be useful for users who partition only fragments of their PDF documents. It's a small step towards addressing issue #2461. Related PRs: #2842, #2673.
Unstructured-IO · Apr 15, 2024 · cb1e910 · cb1e910
1 parent ba3f374
commit cb1e910
Show file tree

Hide file tree

Showing 14 changed files with 126 additions and 31 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,11 @@
-## 0.13.3-dev4
+## 0.13.3-dev5
 
 ### Enhancements
 
 * **Add support for `start_index` in `html` links extraction**
 * **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
 * **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
+* **Introduce `starting_page_number` parameter to partitioning functions.** It applies to those partitioners that support `page_number` in element metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.
 
 ### Features
 

diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -377,11 +377,13 @@ def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppr
 
 
 def test_partition_docx_includes_page_break_elements_when_so_instructed():
-    elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
+    elements = partition_docx(
+        example_doc_path("handbook-1p.docx"), include_page_breaks=True, starting_page_number=3
+    )
 
     assert "PageBreak" in [type(e).__name__ for e in elements]
-    assert elements[1].metadata.page_number == 1
-    assert elements[-2].metadata.page_number == 2
+    assert elements[1].metadata.page_number == 3
+    assert elements[-2].metadata.page_number == 4
 
 
 # ------------------------------------------------------------------------------------------------

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -148,19 +148,21 @@ def test_partition_pdf_local_raises_with_no_filename():
 
 @pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
 @pytest.mark.parametrize(
-    ("strategy", "expected", "origin"),
+    ("strategy", "starting_page_number", "expected_page_numbers", "origin"),
     # fast: can't capture the "intentionally left blank page" page
     # others: will ignore the actual blank page
     [
-        (PartitionStrategy.FAST, {1, 4}, {"pdfminer"}),
-        (PartitionStrategy.HI_RES, {1, 3, 4}, {"yolox", "pdfminer"}),
-        (PartitionStrategy.OCR_ONLY, {1, 3, 4}, {"ocr_tesseract"}),
+        (PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
+        (PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
+        (PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
+        (PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
     ],
 )
-def test_partition_pdf(
+def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
     file_mode,
     strategy,
-    expected,
+    starting_page_number,
+    expected_page_numbers,
     origin,
     filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
 ):
@@ -169,23 +171,29 @@ def _test(result):
         # validate that the result is a non-empty list of dicts
         assert len(result) > 10
         # check that the pdf has multiple different page numbers
-        assert {element.metadata.page_number for element in result} == expected
+        assert {element.metadata.page_number for element in result} == expected_page_numbers
         if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
             assert {element.metadata.detection_origin for element in result} == origin
 
     if file_mode == "filename":
-        result = pdf.partition_pdf(filename=filename, strategy=strategy)
+        result = pdf.partition_pdf(
+            filename=filename, strategy=strategy, starting_page_number=starting_page_number
+        )
         _test(result)
     elif file_mode == "rb":
         with open(filename, "rb") as f:
-            result = pdf.partition_pdf(file=f, strategy=strategy)
+            result = pdf.partition_pdf(
+                file=f, strategy=strategy, starting_page_number=starting_page_number
+            )
             _test(result)
     else:
         with open(filename, "rb") as test_file:
             spooled_temp_file = SpooledTemporaryFile()
             spooled_temp_file.write(test_file.read())
             spooled_temp_file.seek(0)
-            result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
+            result = pdf.partition_pdf(
+                file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
+            )
             _test(result)
 
 
@@ -298,10 +306,12 @@ def test_partition_pdf_with_no_page_breaks(
 def test_partition_pdf_with_fast_strategy(
     filename=example_doc_path("layout-parser-paper-fast.pdf"),
 ):
-    elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
+    elements = pdf.partition_pdf(
+        filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
+    )
     assert len(elements) > 10
     # check that the pdf has multiple different page numbers
-    assert {element.metadata.page_number for element in elements} == {1, 2}
+    assert {element.metadata.page_number for element in elements} == {3, 4}
     for element in elements:
         assert element.metadata.filename == "layout-parser-paper-fast.pdf"
 

diff --git a/test_unstructured/partition/pptx/test_pptx.py b/test_unstructured/partition/pptx/test_pptx.py
@@ -703,6 +703,21 @@ def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
         list(opts.increment_page_number())
         assert opts.page_number == 2
 
+    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
+        self, opts_args: dict[str, Any]
+    ):
+        opts = _PptxPartitionerOptions(**opts_args, starting_page_number=3)
+        # -- move to the "first" slide --
+        list(opts.increment_page_number())
+
+        table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
+        text_metadata = opts.text_metadata()
+
+        assert isinstance(table_metadata, ElementMetadata)
+        assert isinstance(text_metadata, ElementMetadata)
+        assert text_metadata.page_number == 3
+        assert table_metadata.page_number == 3
+
     # -- .pptx_file ------------------------------
 
     def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -363,6 +363,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         extract_image_block_to_payload=False,
         hi_res_model_name=None,
         date_from_file_object=False,
+        starting_page_number=1,
     )
 
 
@@ -840,6 +841,11 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
     assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE
 
 
+def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
+    elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
+    assert elements[1].metadata.page_number == 3
+
+
 EXPECTED_XLS_TEXT_LEN = 550
 
 

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.3-dev4"  # pragma: no cover
+__version__ = "0.13.3-dev5"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -156,6 +156,7 @@ def partition(
     hi_res_model_name: Optional[str] = None,
     model_name: Optional[str] = None,  # to be deprecated
     date_from_file_object: bool = False,
+    starting_page_number: int = 1,
     **kwargs,
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
@@ -243,6 +244,10 @@ def partition(
         Applies only when providing file via `file` parameter. If this option is True and inference
         from message header failed, attempt to infer last_modified metadata from bytes,
         otherwise set it to None.
+    starting_page_number
+        Indicates what page number should be assigned to the first page in the document.
+        This information will be reflected in elements' metadata and can be especially
+        useful when partitioning a document that is part of a larger document.
     """
     exactly_one(file=file, filename=filename, url=url)
 
@@ -308,6 +313,7 @@ def partition(
             infer_table_structure=infer_table_structure,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            starting_page_number=starting_page_number,
             **kwargs,
         )
     elif filetype == FileType.DOCX:
@@ -318,6 +324,7 @@ def partition(
             infer_table_structure=infer_table_structure,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            starting_page_number=starting_page_number,
             **kwargs,
         )
     elif filetype == FileType.ODT:
@@ -426,6 +433,7 @@ def partition(
             extract_image_block_types=extract_image_block_types,
             extract_image_block_output_dir=extract_image_block_output_dir,
             extract_image_block_to_payload=extract_image_block_to_payload,
+            starting_page_number=starting_page_number,
             **kwargs,
         )
     elif filetype in IMAGE_FILETYPES:
@@ -485,6 +493,7 @@ def partition(
             infer_table_structure=infer_table_structure,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            starting_page_number=starting_page_number,
             **kwargs,
         )
     elif filetype == FileType.JSON:
@@ -502,6 +511,7 @@ def partition(
             infer_table_structure=infer_table_structure,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            starting_page_number=starting_page_number,
             **kwargs,
         )
     elif filetype == FileType.CSV:

diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
@@ -540,13 +540,14 @@ def document_to_element_list(
     detection_origin: Optional[str] = None,
     sort_mode: str = SORT_MODE_XY_CUT,
     languages: Optional[List[str]] = None,
+    starting_page_number: int = 1,
     **kwargs: Any,
 ) -> List[Element]:
     """Converts a DocumentLayout object to a list of unstructured elements."""
     elements: List[Element] = []
 
     num_pages = len(document.pages)
-    for i, page in enumerate(document.pages):
+    for page_number, page in enumerate(document.pages, start=starting_page_number):
         page_elements: List[Element] = []
 
         page_image_metadata = _get_page_image_metadata(page)
@@ -571,7 +572,7 @@ def document_to_element_list(
                 for el in element:
                     if last_modification_date:
                         el.metadata.last_modified = last_modification_date
-                    el.metadata.page_number = i + 1
+                    el.metadata.page_number = page_number
                 page_elements.extend(element)
                 translation_mapping.extend([(layout_element, el) for el in element])
                 continue
@@ -601,7 +602,7 @@ def document_to_element_list(
 
             add_element_metadata(
                 element,
-                page_number=i + 1,
+                page_number=page_number,
                 filetype=image_format,
                 coordinates=coordinates,
                 coordinate_system=coordinate_system,
@@ -622,7 +623,7 @@ def document_to_element_list(
         if sortable and sort_mode != SORT_MODE_DONT:
             sorted_page_elements = sort_page_elements(page_elements, sort_mode)
 
-        if include_page_breaks and i < num_pages - 1:
+        if include_page_breaks and page_number < num_pages + starting_page_number:
             sorted_page_elements.append(PageBreak(text=""))
         elements.extend(sorted_page_elements)
 

diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py
@@ -29,6 +29,7 @@ def partition_doc(
     languages: Optional[List[str]] = ["auto"],
     detect_language_per_element: bool = False,
     date_from_file_object: bool = False,
+    starting_page_number: int = 1,
     **kwargs: Any,
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .doc format into its document elements.
@@ -55,6 +56,10 @@ def partition_doc(
     date_from_file_object
         Applies only when providing file via `file` parameter. If this option is True, attempt
         infer last_modified metadata from bytes, otherwise set it to None.
+    starting_page_number
+        Indicates what page number should be assigned to the first page in the document.
+        This information will be reflected in elements' metadata and can be especially
+        useful when partitioning a document that is part of a larger document.
     """
     # Verify that only one of the arguments was provided
     if filename is None:
@@ -97,6 +102,7 @@ def partition_doc(
             metadata_last_modified=metadata_last_modified or last_modification_date,
             languages=languages,
             detect_language_per_element=detect_language_per_element,
+            starting_page_number=starting_page_number,
         )
         # remove tmp.name from filename if parsing file
         if file:

diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -181,6 +181,7 @@ def partition_docx(
     languages: Optional[List[str]] = ["auto"],
     detect_language_per_element: bool = False,
     date_from_file_object: bool = False,
+    starting_page_number: int = 1,
     **kwargs: Any,  # used by decorator
 ) -> List[Element]:
     """Partitions Microsoft Word Documents in .docx format into its document elements.
@@ -212,6 +213,10 @@ def partition_docx(
     date_from_file_object
         Applies only when providing file via `file` parameter. If this option is True, attempt
         infer last_modified metadata from bytes, otherwise set it to None.
+    starting_page_number
+        Indicates what page number should be assigned to the first page in the document.
+        This information will be reflected in elements' metadata and can be especially
+        useful when partitioning a document that is part of a larger document.
     """
     # -- verify that only one file-specifier argument was provided --
     exactly_one(filename=filename, file=file)
@@ -224,6 +229,7 @@ def partition_docx(
         infer_table_structure,
         metadata_last_modified,
         date_from_file_object,
+        starting_page_number=starting_page_number,
     )
     elements = apply_lang_metadata(
         elements=elements,
@@ -249,14 +255,15 @@ def __init__(
         infer_table_structure: bool = True,
         metadata_last_modified: Optional[str] = None,
         date_from_file_object: bool = False,
+        starting_page_number: int = 1,
     ) -> None:
         self._filename = filename
         self._file = file
         self._metadata_filename = metadata_filename
         self._include_page_breaks = include_page_breaks
         self._infer_table_structure = infer_table_structure
         self._metadata_last_modified = metadata_last_modified
-        self._page_counter: int = 1
+        self._page_counter = starting_page_number
         self._date_from_file_object = date_from_file_object
 
     @classmethod
@@ -269,6 +276,7 @@ def iter_document_elements(
         infer_table_structure: bool = True,
         metadata_last_modified: Optional[str] = None,
         date_from_file_object: bool = False,
+        starting_page_number: int = 1,
     ) -> Iterator[Element]:
         """Partition MS Word documents (.docx format) into its document elements."""
         self = cls(
@@ -279,6 +287,7 @@ def iter_document_elements(
             infer_table_structure=infer_table_structure,
             metadata_last_modified=metadata_last_modified,
             date_from_file_object=date_from_file_object,
+            starting_page_number=starting_page_number,
         )
         # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
         # Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.13.3-dev4" # pragma: no cover
		__version__ = "0.13.3-dev5" # pragma: no cover