From 12b30d28109434de825445a860cd4bf2f2436a58 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 14 May 2024 17:50:31 -0700 Subject: [PATCH] rfctr(docx): extract DocxPartitionerOptions (#3018) **Reviewers:** It is probably easier to review the first and second commits separately, as the first one adds all the new code and tests (without installing it), and the second one installs it into the partitioner along with the required changes to code and tests. **Summary** Enable communication of partitioning options to sub-partitioners, in particular to the pluggable `PicturePartitioner` coming in a closely following PR to implement image-extraction and OCR for DOCX, DOC, and ODT formats. **Additional Context** In general, validation of partitioning options as well as assigning default values and computing derived partitioning settings can be extracted from partitioners into a neatly encapsulated separate object. This simplifies the core partitioning code by removing the noise associated with computing metadata values and deciding how to access the source document, etc. However, better factoring aside, having the partition-time "settings" available in a single object allows partitioning of certain document features, for example images, to be readily _delegated_ to a sub-partitioner while still giving it access to all the relevant partitioning settings for the current document. This is particularly important when a sub-partitioner is "pluggable" at runtime and must rely on a clearly-defined (and as simple as possible) interface to operate smoothly. 
--- CHANGELOG.md | 4 +- test_unstructured/partition/docx/test_docx.py | 425 +++++++++++++++--- unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 312 +++++++------ 4 files changed, 543 insertions(+), 200 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89caeaa410..0496479af1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.13.8-dev7 +## 0.13.8-dev8 ### Enhancements -**Faster evaluation** Support for concurrent processing of documents during evaluation +* **Faster evaluation** Support for concurrent processing of documents during evaluation ### Features diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 59e90540a3..9e89d99737 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -4,16 +4,26 @@ from __future__ import annotations +import io import pathlib import re import tempfile +from typing import Any import docx import pytest from docx.document import Document from pytest_mock import MockFixture -from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path +from test_unstructured.unit_utils import ( + FixtureRequest, + Mock, + assert_round_trips_through_JSON, + example_doc_path, + function_mock, + instance_mock, + property_mock, +) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( Address, @@ -29,7 +39,7 @@ Text, Title, ) -from unstructured.partition.docx import _DocxPartitioner, partition_docx +from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA # -- docx-file loading behaviors ----------------------------------------------------------------- @@ -89,14 +99,16 @@ def test_partition_docx_from_file_with_metadata_filename( assert element.metadata.filename == "test" -def 
test_partition_docx_raises_with_both_specified(mock_document_file_path: str): - with open(mock_document_file_path, "rb") as f: - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"): - partition_docx(filename=mock_document_file_path, file=f) +def test_partition_docx_uses_file_path_when_both_are_specified( + mock_document_file_path: str, expected_elements: list[Text] +): + f = io.BytesIO(b"abcde") + elements = partition_docx(filename=mock_document_file_path, file=f) + assert elements == expected_elements def test_partition_docx_raises_with_neither(): - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"): + with pytest.raises(ValueError, match="either `filename` or `file` argument must be provided"): partition_docx() @@ -292,15 +304,13 @@ def test_partition_docx_from_file_without_metadata_date(): assert elements[0].metadata.last_modified is None -def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dict[str, str]]): - partitioner = _DocxPartitioner( - example_doc_path("fake-doc-emphasized-text.docx"), - None, - None, - False, - True, - None, - ) +def test_get_emphasized_texts_from_paragraph( + opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]] +): + opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) + paragraph = partitioner._document.paragraphs[1] emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "I am a bold italic bold-italic text." 
@@ -317,34 +327,31 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dic assert emphasized_texts == [] -def test_iter_table_emphasis(expected_emphasized_texts: list[dict[str, str]]): - partitioner = _DocxPartitioner( - example_doc_path("fake-doc-emphasized-text.docx"), - None, - None, - False, - True, - None, - ) +def test_iter_table_emphasis( + opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]] +): + opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) table = partitioner._document.tables[0] + emphasized_texts = list(partitioner._iter_table_emphasis(table)) + assert emphasized_texts == expected_emphasized_texts def test_table_emphasis( + opts_args: dict[str, Any], expected_emphasized_text_contents: list[str], expected_emphasized_text_tags: list[str], ): - partitioner = _DocxPartitioner( - example_doc_path("fake-doc-emphasized-text.docx"), - None, - None, - False, - True, - None, - ) + opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) table = partitioner._document.tables[0] + emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table) + assert emphasized_text_contents == expected_emphasized_text_contents assert emphasized_text_tags == expected_emphasized_text_tags @@ -373,15 +380,10 @@ def test_partition_docx_with_json(mock_document_file_path: str): assert_round_trips_through_JSON(elements) -def test_parse_category_depth_by_style(): - partitioner = _DocxPartitioner( - example_doc_path("category-level.docx"), - None, - None, - False, - True, - None, - ) +def test_parse_category_depth_by_style(opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("category-level.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) # Category 
depths are 0-indexed and relative to the category type # Title, list item, bullet, narrative text, etc. @@ -411,9 +413,9 @@ def test_parse_category_depth_by_style(): ), f"expected paragraph[{idx}] to have depth=={depth}, got {actual_depth}" -def test_parse_category_depth_by_style_name(): - partitioner = _DocxPartitioner(None, None, None, False, True, None) - +def test_parse_category_depth_by_style_name(opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) test_cases = [ (0, "Heading 1"), (1, "Heading 2"), @@ -436,8 +438,9 @@ def test_parse_category_depth_by_style_name(): ), f"test case {test_cases[idx]} failed" -def test_parse_category_depth_by_style_ilvl(): - partitioner = _DocxPartitioner(None, None, None, False, True, None) +def test_parse_category_depth_by_style_ilvl(opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) assert partitioner._parse_category_depth_by_style_ilvl() == 0 @@ -683,6 +686,24 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> return filename +@pytest.fixture() +def opts_args() -> dict[str, Any]: + """All default arguments for `DocxPartitionerOptions`. + + Individual argument values can be changed to suit each test. Makes construction of opts more + compact for testing purposes. 
+ """ + return { + "date_from_file_object": False, + "file": None, + "file_path": None, + "include_page_breaks": True, + "infer_table_structure": True, + "metadata_file_path": None, + "metadata_last_modified": None, + } + + # ================================================================================================ # ISOLATED UNIT TESTS # ================================================================================================ @@ -691,14 +712,280 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> # ================================================================================================ +class DescribeDocxPartitionerOptions: + """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.""" + + # -- .document ------------------------------- + + def it_loads_the_docx_document( + self, + request: FixtureRequest, + opts_args: dict[str, Any], + ): + document_ = instance_mock(request, Document) + docx_Document_ = function_mock( + request, "unstructured.partition.docx.docx.Document", return_value=document_ + ) + _docx_file_prop_ = property_mock( + request, DocxPartitionerOptions, "_docx_file", return_value="abcde.docx" + ) + opts = DocxPartitionerOptions(**opts_args) + + document = opts.document + + _docx_file_prop_.assert_called_once_with() + docx_Document_.assert_called_once_with("abcde.docx") + assert document is document_ + + # -- .include_page_breaks -------------------- + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["include_page_breaks"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.include_page_breaks is arg_value + + # -- .infer_table_structure ------------------ + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_include_text_as_html_in_Table_metadata( + self, 
arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["infer_table_structure"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.infer_table_structure is arg_value + + # -- .increment_page_number() ---------------- + + def it_generates_a_PageBreak_element_when_the_page_number_is_incremented( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args) + + page_break_iter = opts.increment_page_number() + + assert isinstance(next(page_break_iter, None), PageBreak) + assert opts.page_number == 2 + with pytest.raises(StopIteration): + next(page_break_iter) + + def but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off( + self, opts_args: dict[str, Any] + ): + opts_args["include_page_breaks"] = False + opts = DocxPartitionerOptions(**opts_args) + + page_break_iter = opts.increment_page_number() + + with pytest.raises(StopIteration): + next(page_break_iter) + assert opts.page_number == 2 + + # -- .last_modified -------------------------- + + def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.last_modified == "2024-03-05T17:02:53" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_: Mock + ): + opts_args["file_path"] = "a/b/document.docx" + get_last_modified_date_.return_value = "2024-04-02T20:32:35" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_.assert_called_once_with("a/b/document.docx") + assert last_modified == "2024-04-02T20:32:35" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = 
io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = True + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_called_once_with(file) + assert last_modified == "2024-04-02T20:42:07" + + def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = False + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_not_called() + assert last_modified is None + + # -- .metadata_file_path --------------------- + + def it_uses_the_user_provided_file_path_in_the_metadata_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "x/y/z.docx" + opts_args["metadata_file_path"] = "a/b/c.docx" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == "a/b/c.docx" + + @pytest.mark.parametrize("file_path", ["u/v/w.docx", None]) + def and_it_falls_back_to_the_document_file_path_otherwise( + self, file_path: str | None, opts_args: dict[str, Any] + ): + opts_args["file_path"] = file_path + opts_args["metadata_file_path"] = None + opts = DocxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == file_path + + # -- ._metadata_page_number ------------------ + + @pytest.mark.parametrize( + ("page_count", "document_contains_pagebreaks", "expected_value"), + [(7, True, 7), (1, False, None)], + ) + def it_reports_None_when_no_rendered_page_breaks_are_found_in_document( + self, + request: FixtureRequest, + opts_args: dict[str, Any], + page_count: int, + document_contains_pagebreaks: bool, + 
expected_value: int | None, + ): + _document_contains_pagebreaks_prop_ = property_mock( + request, + DocxPartitionerOptions, + "_document_contains_pagebreaks", + return_value=document_contains_pagebreaks, + ) + opts = DocxPartitionerOptions(**opts_args) + opts._page_counter = page_count + + metadata_page_number = opts.metadata_page_number + + _document_contains_pagebreaks_prop_.assert_called_once_with() + assert metadata_page_number is expected_value + + # -- .page_number ---------------------------- + + def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]): + """In DOCX, page-number is the slide number.""" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.page_number == 1 + list(opts.increment_page_number()) + assert opts.page_number == 2 + list(opts.increment_page_number()) + assert opts.page_number == 3 + + def it_assigns_the_correct_page_number_when_starting_page_number_is_given( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args, starting_page_number=3) + + assert opts.page_number == 3 + list(opts.increment_page_number()) + assert opts.page_number == 4 + + # -- ._document_contains_pagebreaks ---------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), [("page-breaks.docx", True), ("teams_chat.docx", False)] + ) + def it_knows_whether_the_document_contains_page_breaks( + self, opts_args: dict[str, Any], file_name: str, expected_value: bool + ): + opts_args["file_path"] = example_doc_path(file_name) + opts = DocxPartitionerOptions(**opts_args) + + assert opts._document_contains_pagebreaks is expected_value + + # -- ._docx_file ----------------------------- + + def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "l/m/n.docx" + opts = DocxPartitionerOptions(**opts_args) + + assert opts._docx_file == "l/m/n.docx" + + def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( + self, 
opts_args: dict[str, Any] + ): + spooled_temp_file = tempfile.SpooledTemporaryFile() + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = DocxPartitionerOptions(**opts_args) + + docx_file = opts._docx_file + + assert docx_file is not spooled_temp_file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" + + def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( + self, opts_args: dict[str, Any] + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts = DocxPartitionerOptions(**opts_args) + + docx_file = opts._docx_file + + assert docx_file is file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" + + def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args) + + with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "): + opts._docx_file + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def get_last_modified_date_(self, request: FixtureRequest) -> Mock: + return function_mock(request, "unstructured.partition.docx.get_last_modified_date") + + @pytest.fixture() + def get_last_modified_date_from_file_(self, request: FixtureRequest): + return function_mock( + request, "unstructured.partition.docx.get_last_modified_date_from_file" + ) + + class Describe_DocxPartitioner: """Unit-test suite for `unstructured.partition.docx._DocxPartitioner`.""" # -- table behaviors ------------------------------------------------------------------------- - def it_can_convert_a_table_to_html(self): + def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] - assert _DocxPartitioner()._convert_table_to_html(table) == ( + + 
assert _DocxPartitioner(opts)._convert_table_to_html(table) == ( "\n" "\n" "\n" @@ -709,7 +996,7 @@ def it_can_convert_a_table_to_html(self): "
Header Col 1 Header Col 2
" ) - def and_it_can_convert_a_nested_table_to_html(self): + def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]): """ Fixture table is: @@ -725,10 +1012,11 @@ def and_it_can_convert_a_nested_table_to_html(self): | j | k | l | +---+-------------+---+ """ + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] # -- re.sub() strips out the extra padding inserted by tabulate -- - html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table)) + html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table)) expected_lines = [ "", @@ -750,13 +1038,15 @@ def and_it_can_convert_a_nested_table_to_html(self): for expected, actual in zip(expected_lines, actual_lines): assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" - def it_can_convert_a_table_to_plain_text(self): + def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == ( + + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == ( "Header Col 1 Header Col 2 Lorem ipsum A link example" ) - def and_it_can_convert_a_nested_table_to_plain_text(self): + def and_it_can_convert_a_nested_table_to_plain_text(self, opts_args: dict[str, Any]): """ Fixture table is: @@ -772,12 +1062,14 @@ def and_it_can_convert_a_nested_table_to_plain_text(self): | j | k | l | +---+-------------+---+ """ + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == ( + + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == ( "a >b< c d e f g&t h i j k l" ) - def but_the_text_of_a_merged_cell_appears_only_once(self): + def 
but_the_text_of_a_merged_cell_appears_only_once(self, opts_args: dict[str, Any]): """ Fixture table is: @@ -789,8 +1081,9 @@ def but_the_text_of_a_merged_cell_appears_only_once(self): | e | | +-------+---+ """ + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[2] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e" + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == "a b c d e" def it_can_partition_tables_with_incomplete_rows(self): """DOCX permits table rows to start late and end early. @@ -921,7 +1214,7 @@ def it_can_partition_tables_with_incomplete_rows(self): # -- page-break behaviors -------------------------------------------------------------------- - def it_places_page_breaks_precisely_where_they_occur(self): + def it_places_page_breaks_precisely_where_they_occur(self, opts_args: dict[str, Any]): """Page-break behavior has some subtleties. * A hard page-break does not generate a PageBreak element (because that would double-count @@ -940,6 +1233,8 @@ def str_repr(e: Element) -> str: """A more detailed `repr()` to aid debugging when assertion fails.""" return f"{e.__class__.__name__}('{e}')" + opts_args["file_path"] = example_doc_path("page-breaks.docx") + opts = DocxPartitionerOptions(**opts_args) expected = [ # NOTE(scanny) - -- page 1 -- NarrativeText( @@ -975,7 +1270,7 @@ def str_repr(e: Element) -> str: Title("< str: # -- header/footer behaviors ----------------------------------------------------------------- - def it_includes_table_cell_text_in_Header_text(self): - partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx")) + def it_includes_table_cell_text_in_Header_text(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("docx-hdrftr.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) section = partitioner._document.sections[0] header_iter = 
partitioner._iter_section_headers(section) @@ -995,9 +1292,11 @@ def it_includes_table_cell_text_in_Header_text(self): element = next(header_iter) assert element.text == "First header para\nTable cell1 Table cell2\nLast header para" - def it_includes_table_cell_text_in_Footer_text(self): + def it_includes_table_cell_text_in_Footer_text(self, opts_args: dict[str, Any]): """This case also verifies nested-table and merged-cell behaviors.""" - partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx")) + opts_args["file_path"] = example_doc_path("docx-hdrftr.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) section = partitioner._document.sections[0] footer_iter = partitioner._iter_section_footers(section) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4c908fc3d0..83c1597480 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev7" # pragma: no cover +__version__ = "0.13.8-dev8" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 97aa88e7ed..430f87a301 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -217,19 +217,19 @@ def partition_docx( Assign this number to the first page of this document and increment the page number from there. 
""" - # -- verify that only one file-specifier argument was provided -- - exactly_one(filename=filename, file=file) - - elements = _DocxPartitioner.iter_document_elements( - filename, - file, - metadata_filename, - include_page_breaks, - infer_table_structure, - metadata_last_modified, - date_from_file_object, + opts = DocxPartitionerOptions( + date_from_file_object=date_from_file_object, + file=file, + file_path=filename, + include_page_breaks=include_page_breaks, + infer_table_structure=infer_table_structure, + metadata_file_path=metadata_filename, + metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, ) + + elements = _DocxPartitioner.iter_document_elements(opts) + elements = apply_lang_metadata( elements=elements, languages=languages, @@ -238,56 +238,169 @@ def partition_docx( return list(elements) -class _DocxPartitioner: - """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" +class DocxPartitionerOptions: + """Encapsulates partitioning option validation, computation, and application of defaults.""" def __init__( self, - # -- NOTE(scanny): default values here are unnecessary for production use because - # -- `.iter_document_elements()` is the only interface method and always calls with all - # -- args. However, providing defaults eases unit-testing and decouples unit-tests from - # -- future changes to args. 
- filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - include_page_breaks: bool = True, - infer_table_structure: bool = True, - metadata_last_modified: Optional[str] = None, - date_from_file_object: bool = False, + *, + date_from_file_object: bool, + file: IO[bytes] | None, + file_path: str | None, + include_page_breaks: bool, + infer_table_structure: bool, + metadata_file_path: Optional[str], + metadata_last_modified: Optional[str], starting_page_number: int = 1, - ) -> None: - self._filename = filename + ): + self._date_from_file_object = date_from_file_object self._file = file - self._metadata_filename = metadata_filename + self._file_path = file_path self._include_page_breaks = include_page_breaks self._infer_table_structure = infer_table_structure + self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified + # -- options object maintains page-number state -- self._page_counter = starting_page_number - self._date_from_file_object = date_from_file_object + + @lazyproperty + def document(self) -> Document: + """The python-docx `Document` object loaded from file or filename.""" + return docx.Document(self._docx_file) + + @lazyproperty + def include_page_breaks(self) -> bool: + """When True, include `PageBreak` elements in element-stream. + + Note that regardless of this setting, page-breaks are detected, and page-number is tracked + and included in element metadata. Only the presence of distinct `PageBreak` elements (which + contain no text) in the element stream is affected. 
+ """ + return self._include_page_breaks + + def increment_page_number(self) -> Iterator[PageBreak]: + """Increment page-number by 1 and generate a PageBreak element if enabled.""" + self._page_counter += 1 + # -- only emit page-breaks when enabled -- + if self._include_page_breaks: + yield PageBreak("", detection_origin=DETECTION_ORIGIN) + + @lazyproperty + def infer_table_structure(self) -> bool: + """True when partitioner should compute and apply `text_as_html` metadata for tables.""" + return self._infer_table_structure + + @lazyproperty + def last_modified(self) -> Optional[str]: + """The best last-modified date available, None if no sources are available.""" + # -- Value explicitly specified by caller takes precedence. This is used for example when + # -- this file was converted from another format, and any last-modified date for the file + # -- would be just now. + if self._metadata_last_modified: + return self._metadata_last_modified + + if self._file_path: + return ( + None + if is_temp_file_path(self._file_path) + else get_last_modified_date(self._file_path) + ) + + if self._file: + return ( + get_last_modified_date_from_file(self._file) + if self._date_from_file_object + else None + ) + + return None + + @lazyproperty + def metadata_file_path(self) -> str | None: + """The best available file-path for this document or `None` if unavailable.""" + return self._metadata_file_path or self._file_path + + @property + def metadata_page_number(self) -> Optional[int]: + """The current page number to report in metadata, or None if we can't really tell. + + Page numbers are not added to element metadata if we can't find any page-breaks in the + document (which may be a common case). + + In the DOCX format, determining page numbers is strictly a best-efforts attempt since + actual page-breaks are determined at rendering time (e.g. printing) based on the + font-metrics of the target device. 
Explicit (hard) page-breaks are always recorded in the + docx file but the rendered page-breaks are only added optionally. + """ + return self._page_counter if self._document_contains_pagebreaks else None + + @property + def page_number(self) -> int: + """The current page number. + + Note this value may not represent the actual rendered page number when rendered page-break + indicators are not present in the document (not uncommon). Use `.metadata_page_number` for + metadata purposes, which is `None` when rendered page-breaks are not present in this + document. + """ + return self._page_counter + + @lazyproperty + def _document_contains_pagebreaks(self) -> bool: + """True when there is at least one page-break detected in the document. + + Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably + inserted by Microsoft Word, but probably don't appear in documents converted into .docx + format from for example .odt format. + """ + xpath = ( + # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can + # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which + # is w:p inner-content and both of these can occur inside a table-cell as well as the + # document body + "./w:body/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + ) + + return bool(self.document.element.xpath(xpath)) + + @lazyproperty + def _docx_file(self) -> str | IO[bytes]: + """The Word 2007+ document file to be partitioned. + + This is either a `str` path or a file-like object. `python-docx` accepts either for opening + a document file. + """ + if self._file_path: + return self._file_path + + # -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an + # -- exception when Zipfile tries to open it. 
The docx format is a zip archive so we need + # -- to work around that bug here. + if isinstance(self._file, tempfile.SpooledTemporaryFile): + self._file.seek(0) + return io.BytesIO(self._file.read()) + + if self._file: + return self._file + + raise ValueError( + "No DOCX document specified, either `filename` or `file` argument must be provided" + ) + + +class _DocxPartitioner: + """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" + + def __init__(self, opts: DocxPartitionerOptions) -> None: + self._opts = opts @classmethod - def iter_document_elements( - cls, - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - include_page_breaks: bool = True, - infer_table_structure: bool = True, - metadata_last_modified: Optional[str] = None, - date_from_file_object: bool = False, - starting_page_number: int = 1, - ) -> Iterator[Element]: + def iter_document_elements(cls, opts: DocxPartitionerOptions) -> Iterator[Element]: """Partition MS Word documents (.docx format) into its document elements.""" - self = cls( - filename=filename, - file=file, - metadata_filename=metadata_filename, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - metadata_last_modified=metadata_last_modified, - date_from_file_object=date_from_file_object, - starting_page_number=starting_page_number, - ) + self = cls(opts) # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a # Microsoft Teams chat transcript exported to DOCX contains no sections. 
Such a # "section-less" document has to be interated differently and has no headers or footers and @@ -452,37 +565,7 @@ def iter_row_cells_as_text(row: _Row) -> Iterator[str]: @lazyproperty def _document(self) -> Document: """The python-docx `Document` object loaded from file or filename.""" - filename, file = self._filename, self._file - - if filename is not None: - return docx.Document(filename) - - assert file is not None - if isinstance(file, tempfile.SpooledTemporaryFile): - file.seek(0) - file = io.BytesIO(file.read()) - return docx.Document(file) - - @lazyproperty - def _document_contains_pagebreaks(self) -> bool: - """True when there is at least one page-break detected in the document. - - Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably - inserted by Microsoft Word, but probably don't appear in documents converted into .docx - format from for example .odt format. - """ - xpath = ( - # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can - # appear in a paragraph (w:p). 
w:r can also appear in a hyperlink (w:hyperlink), which - # is w:p inner-content and both of these can occur inside a table-cell as well as the - # document body - "./w:body/w:p/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" - ) - - return bool(self._document.element.xpath(xpath)) + return self._opts.document @lazyproperty def _document_contains_sections(self) -> bool: @@ -524,12 +607,6 @@ def iter_hdrftr_texts(hdrftr: _Header | _Footer) -> Iterator[str]: return "\n".join(text for text in iter_hdrftr_texts(hdrftr) if text) - def _increment_page_number(self) -> Iterator[PageBreak]: - """Increment page-number by 1 and generate a PageBreak element if enabled.""" - self._page_counter += 1 - if self._include_page_breaks: - yield PageBreak("", detection_origin=DETECTION_ORIGIN) - def _is_list_item(self, paragraph: Paragraph) -> bool: """True when `paragraph` can be identified as a list-item.""" if is_bulleted_text(paragraph.text): @@ -581,7 +658,7 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP if isinstance(item, Paragraph): yield from self._classify_paragraph_to_element(item) else: - yield from self._increment_page_number() + yield from self._opts.increment_page_number() def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, str]]: """Generate e.g. 
{"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.""" @@ -616,7 +693,7 @@ def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]: text=text, detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( - filename=self._metadata_filename, + filename=self._opts.metadata_file_path, header_footer_type=header_footer_type, category_depth=0, ), @@ -645,7 +722,7 @@ def maybe_iter_header(header: _Header, header_footer_type: str) -> Iterator[Head text=text, detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( - filename=self._metadata_filename, + filename=self._opts.metadata_file_path, header_footer_type=header_footer_type, category_depth=0, # -- headers are always at the root level} ), @@ -668,7 +745,7 @@ def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Itera """ def page_is_odd() -> bool: - return self._page_counter % 2 == 1 + return self._opts.page_number % 2 == 1 start_type = section.start_type @@ -682,14 +759,14 @@ def page_is_odd() -> bool: # -- on an even page we need two total, add one to supplement the rendered page break # -- to follow. There is no "first-document-page" special case because 1 is odd. if not page_is_odd(): - yield from self._increment_page_number() + yield from self._opts.increment_page_number() elif start_type == WD_SECTION_START.ODD_PAGE: # -- the first page of the document is an implicit "new" odd-page, so no page-break -- if section_idx == 0: return if page_is_odd(): - yield from self._increment_page_number() + yield from self._opts.increment_page_number() # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of # -- which need our help to get the page-breaks right. @@ -699,7 +776,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: """Generate zero-or-one Table element for a DOCX `w:tbl` XML element.""" # -- at present, we always generate exactly one Table element, but we might want # -- to skip, for example, an empty table. 
- html_table = self._convert_table_to_html(table) if self._infer_table_structure else None + html_table = ( + self._convert_table_to_html(table) if self._opts.infer_table_structure else None + ) text_table = " ".join(self._iter_table_texts(table)) emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table) @@ -708,9 +787,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( text_as_html=html_table, - filename=self._metadata_filename, - page_number=self._page_number, - last_modified=self._last_modified, + filename=self._opts.metadata_file_path, + page_number=self._opts.metadata_page_number, + last_modified=self._opts.last_modified, emphasized_text_contents=emphasized_text_contents or None, emphasized_text_tags=emphasized_text_tags or None, ), @@ -753,41 +832,6 @@ def iter_cell_texts(cell: _Cell) -> Iterator[str]: # -- do not generate empty strings -- yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text) - @lazyproperty - def _last_modified(self) -> Optional[str]: - """Last-modified date suitable for use in element metadata.""" - # -- if this file was converted from another format, any last-modified date for the file - # -- will be today, so we get it from the conversion step in `._metadata_last_modified`. - if self._metadata_last_modified: - return self._metadata_last_modified - - file_path, file = self._filename, self._file - - # -- if the file is on the filesystem, get its date from there -- - if file_path is not None: - return None if is_temp_file_path(file_path) else get_last_modified_date(file_path) - - # -- otherwise, as long as user explicitly requested it, try getting it from the file-like - # -- object (unlikely since BytesIO and its brethren have no such metadata). 
- assert file is not None - if self._date_from_file_object: - return get_last_modified_date_from_file(file) - return None - - @property - def _page_number(self) -> Optional[int]: - """The current page number, or None if we can't really tell. - - Page numbers are not added to element metadata if we can't find any page-breaks in the - document (which may be a common case). - - In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual - page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the - target device. Explicit (hard) page-breaks are always recorded in the docx file but the - rendered page-breaks are only added optionally. - """ - return self._page_counter if self._document_contains_pagebreaks else None - def _paragraph_emphasis(self, paragraph: Paragraph) -> tuple[list[str], list[str]]: """[contents, tags] pair describing emphasized text in `paragraph`.""" iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph)) @@ -842,12 +886,12 @@ def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata: category_depth=category_depth, emphasized_text_contents=emphasized_text_contents or None, emphasized_text_tags=emphasized_text_tags or None, - filename=self._metadata_filename, - last_modified=self._last_modified, + filename=self._opts.metadata_file_path, + last_modified=self._opts.last_modified, link_texts=link_texts or None, link_urls=link_urls or None, links=links or None, - page_number=self._page_number, + page_number=self._opts.metadata_page_number, ) element_metadata.detection_origin = "docx" return element_metadata