Skip to content

Commit

Permalink
Introduce `starting_page_number` argument to partitioning functions that assign…
Browse files Browse the repository at this point in the history
… `element.metadata.page_number` (#2884)

This small change will be useful for users who partition only fragments
of their PDF documents.
It's a small step towards addressing this issue:
#2461

Related PRs:
* #2842
* #2673
  • Loading branch information
micmarty-deepsense authored Apr 15, 2024
1 parent ba3f374 commit cb1e910
Show file tree
Hide file tree
Showing 14 changed files with 126 additions and 31 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
## 0.13.3-dev4
## 0.13.3-dev5

### Enhancements

* **Add support for `start_index` in `html` links extraction**
* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.
* **Support pluggable sub-partitioner for PPTX Picture shapes.** Use a distinct sub-partitioner for partitioning PPTX Picture (image) shapes and allow the default picture sub-partitioner to be replaced at run-time by one of the user's choosing.
* **Introduce `starting_page_number` parameter to partitioning functions.** It applies to those partitioners which support `page_number` in element metadata: PDF, TIFF, XLSX, DOC, DOCX, PPT, PPTX.

### Features

Expand Down
8 changes: 5 additions & 3 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,11 +377,13 @@ def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppr


def test_partition_docx_includes_page_break_elements_when_so_instructed():
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
elements = partition_docx(
example_doc_path("handbook-1p.docx"), include_page_breaks=True, starting_page_number=3
)

assert "PageBreak" in [type(e).__name__ for e in elements]
assert elements[1].metadata.page_number == 1
assert elements[-2].metadata.page_number == 2
assert elements[1].metadata.page_number == 3
assert elements[-2].metadata.page_number == 4


# ------------------------------------------------------------------------------------------------
Expand Down
34 changes: 22 additions & 12 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,21 @@ def test_partition_pdf_local_raises_with_no_filename():

@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
("strategy", "expected", "origin"),
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
# fast: can't capture the "intentionally left blank page" page
# others: will ignore the actual blank page
[
(PartitionStrategy.FAST, {1, 4}, {"pdfminer"}),
(PartitionStrategy.HI_RES, {1, 3, 4}, {"yolox", "pdfminer"}),
(PartitionStrategy.OCR_ONLY, {1, 3, 4}, {"ocr_tesseract"}),
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
],
)
def test_partition_pdf(
def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
file_mode,
strategy,
expected,
starting_page_number,
expected_page_numbers,
origin,
filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
):
Expand All @@ -169,23 +171,29 @@ def _test(result):
# validate that the result contains more than 10 elements
assert len(result) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in result} == expected
assert {element.metadata.page_number for element in result} == expected_page_numbers
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
assert {element.metadata.detection_origin for element in result} == origin

if file_mode == "filename":
result = pdf.partition_pdf(filename=filename, strategy=strategy)
result = pdf.partition_pdf(
filename=filename, strategy=strategy, starting_page_number=starting_page_number
)
_test(result)
elif file_mode == "rb":
with open(filename, "rb") as f:
result = pdf.partition_pdf(file=f, strategy=strategy)
result = pdf.partition_pdf(
file=f, strategy=strategy, starting_page_number=starting_page_number
)
_test(result)
else:
with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
result = pdf.partition_pdf(file=spooled_temp_file, strategy=strategy)
result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
)
_test(result)


Expand Down Expand Up @@ -298,10 +306,12 @@ def test_partition_pdf_with_no_page_breaks(
def test_partition_pdf_with_fast_strategy(
filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
elements = pdf.partition_pdf(
filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
)
assert len(elements) > 10
# check that the pdf has multiple different page numbers
assert {element.metadata.page_number for element in elements} == {1, 2}
assert {element.metadata.page_number for element in elements} == {3, 4}
for element in elements:
assert element.metadata.filename == "layout-parser-paper-fast.pdf"

Expand Down
15 changes: 15 additions & 0 deletions test_unstructured/partition/pptx/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,21 @@ def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
list(opts.increment_page_number())
assert opts.page_number == 2

def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
self, opts_args: dict[str, Any]
):
opts = _PptxPartitionerOptions(**opts_args, starting_page_number=3)
# -- move to the "first" slide --
list(opts.increment_page_number())

table_metadata = opts.table_metadata(text_as_html="<table><tr/></table>")
text_metadata = opts.text_metadata()

assert isinstance(table_metadata, ElementMetadata)
assert isinstance(text_metadata, ElementMetadata)
assert text_metadata.page_number == 3
assert table_metadata.page_number == 3

# -- .pptx_file ------------------------------

def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
Expand Down
6 changes: 6 additions & 0 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
extract_image_block_to_payload=False,
hi_res_model_name=None,
date_from_file_object=False,
starting_page_number=1,
)


Expand Down Expand Up @@ -840,6 +841,11 @@ def test_auto_partition_xlsx_from_file(filename="example-docs/stanley-cups.xlsx"
assert elements[1].metadata.filetype == EXPECTED_XLSX_FILETYPE


def test_auto_partition_respects_starting_page_number_argument_for_xlsx():
elements = partition("example-docs/stanley-cups.xlsx", starting_page_number=3)
assert elements[1].metadata.page_number == 3


EXPECTED_XLS_TEXT_LEN = 550


Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.3-dev4" # pragma: no cover
__version__ = "0.13.3-dev5" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ def partition(
hi_res_model_name: Optional[str] = None,
model_name: Optional[str] = None, # to be deprecated
date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs,
):
"""Partitions a document into its constituent elements. Will use libmagic to determine
Expand Down Expand Up @@ -243,6 +244,10 @@ def partition(
Applies only when providing file via `file` parameter. If this option is True and inference
from message header failed, attempt to infer last_modified metadata from bytes,
otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first page in the document.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
"""
exactly_one(file=file, filename=filename, url=url)

Expand Down Expand Up @@ -308,6 +313,7 @@ def partition(
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.DOCX:
Expand All @@ -318,6 +324,7 @@ def partition(
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.ODT:
Expand Down Expand Up @@ -426,6 +433,7 @@ def partition(
extract_image_block_types=extract_image_block_types,
extract_image_block_output_dir=extract_image_block_output_dir,
extract_image_block_to_payload=extract_image_block_to_payload,
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype in IMAGE_FILETYPES:
Expand Down Expand Up @@ -485,6 +493,7 @@ def partition(
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.JSON:
Expand All @@ -502,6 +511,7 @@ def partition(
infer_table_structure=infer_table_structure,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
**kwargs,
)
elif filetype == FileType.CSV:
Expand Down
9 changes: 5 additions & 4 deletions unstructured/partition/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,13 +540,14 @@ def document_to_element_list(
detection_origin: Optional[str] = None,
sort_mode: str = SORT_MODE_XY_CUT,
languages: Optional[List[str]] = None,
starting_page_number: int = 1,
**kwargs: Any,
) -> List[Element]:
"""Converts a DocumentLayout object to a list of unstructured elements."""
elements: List[Element] = []

num_pages = len(document.pages)
for i, page in enumerate(document.pages):
for page_number, page in enumerate(document.pages, start=starting_page_number):
page_elements: List[Element] = []

page_image_metadata = _get_page_image_metadata(page)
Expand All @@ -571,7 +572,7 @@ def document_to_element_list(
for el in element:
if last_modification_date:
el.metadata.last_modified = last_modification_date
el.metadata.page_number = i + 1
el.metadata.page_number = page_number
page_elements.extend(element)
translation_mapping.extend([(layout_element, el) for el in element])
continue
Expand Down Expand Up @@ -601,7 +602,7 @@ def document_to_element_list(

add_element_metadata(
element,
page_number=i + 1,
page_number=page_number,
filetype=image_format,
coordinates=coordinates,
coordinate_system=coordinate_system,
Expand All @@ -622,7 +623,7 @@ def document_to_element_list(
if sortable and sort_mode != SORT_MODE_DONT:
sorted_page_elements = sort_page_elements(page_elements, sort_mode)

if include_page_breaks and i < num_pages - 1:
if include_page_breaks and page_number < num_pages + starting_page_number:
sorted_page_elements.append(PageBreak(text=""))
elements.extend(sorted_page_elements)

Expand Down
6 changes: 6 additions & 0 deletions unstructured/partition/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def partition_doc(
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs: Any,
) -> List[Element]:
"""Partitions Microsoft Word Documents in .doc format into its document elements.
Expand All @@ -55,6 +56,10 @@ def partition_doc(
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first page in the document.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
"""
# Verify that only one of the arguments was provided
if filename is None:
Expand Down Expand Up @@ -97,6 +102,7 @@ def partition_doc(
metadata_last_modified=metadata_last_modified or last_modification_date,
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
)
# remove tmp.name from filename if parsing file
if file:
Expand Down
11 changes: 10 additions & 1 deletion unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def partition_docx(
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
**kwargs: Any, # used by decorator
) -> List[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.
Expand Down Expand Up @@ -212,6 +213,10 @@ def partition_docx(
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt
infer last_modified metadata from bytes, otherwise set it to None.
starting_page_number
Indicates what page number should be assigned to the first page in the document.
This information will be reflected in elements' metadata and can be especially
useful when partitioning a document that is part of a larger document.
"""
# -- verify that only one file-specifier argument was provided --
exactly_one(filename=filename, file=file)
Expand All @@ -224,6 +229,7 @@ def partition_docx(
infer_table_structure,
metadata_last_modified,
date_from_file_object,
starting_page_number=starting_page_number,
)
elements = apply_lang_metadata(
elements=elements,
Expand All @@ -249,14 +255,15 @@ def __init__(
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False,
starting_page_number: int = 1,
) -> None:
self._filename = filename
self._file = file
self._metadata_filename = metadata_filename
self._include_page_breaks = include_page_breaks
self._infer_table_structure = infer_table_structure
self._metadata_last_modified = metadata_last_modified
self._page_counter: int = 1
self._page_counter = starting_page_number
self._date_from_file_object = date_from_file_object

@classmethod
Expand All @@ -269,6 +276,7 @@ def iter_document_elements(
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False,
starting_page_number: int = 1,
) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
self = cls(
Expand All @@ -279,6 +287,7 @@ def iter_document_elements(
infer_table_structure=infer_table_structure,
metadata_last_modified=metadata_last_modified,
date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number,
)
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
Expand Down
Loading

0 comments on commit cb1e910

Please sign in to comment.