diff --git a/CHANGELOG.md b/CHANGELOG.md index 89caeaa410..0496479af1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.13.8-dev7 +## 0.13.8-dev8 ### Enhancements -**Faster evaluation** Support for concurrent processing of documents during evaluation +* **Faster evaluation** Support for concurrent processing of documents during evaluation ### Features diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 59e90540a3..9e89d99737 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -4,16 +4,26 @@ from __future__ import annotations +import io import pathlib import re import tempfile +from typing import Any import docx import pytest from docx.document import Document from pytest_mock import MockFixture -from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path +from test_unstructured.unit_utils import ( + FixtureRequest, + Mock, + assert_round_trips_through_JSON, + example_doc_path, + function_mock, + instance_mock, + property_mock, +) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( Address, @@ -29,7 +39,7 @@ Text, Title, ) -from unstructured.partition.docx import _DocxPartitioner, partition_docx +from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA # -- docx-file loading behaviors ----------------------------------------------------------------- @@ -89,14 +99,16 @@ def test_partition_docx_from_file_with_metadata_filename( assert element.metadata.filename == "test" -def test_partition_docx_raises_with_both_specified(mock_document_file_path: str): - with open(mock_document_file_path, "rb") as f: - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"): - 
partition_docx(filename=mock_document_file_path, file=f) +def test_partition_docx_uses_file_path_when_both_are_specified( + mock_document_file_path: str, expected_elements: list[Text] +): + f = io.BytesIO(b"abcde") + elements = partition_docx(filename=mock_document_file_path, file=f) + assert elements == expected_elements def test_partition_docx_raises_with_neither(): - with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"): + with pytest.raises(ValueError, match="either `filename` or `file` argument must be provided"): partition_docx() @@ -292,15 +304,13 @@ def test_partition_docx_from_file_without_metadata_date(): assert elements[0].metadata.last_modified is None -def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dict[str, str]]): - partitioner = _DocxPartitioner( - example_doc_path("fake-doc-emphasized-text.docx"), - None, - None, - False, - True, - None, - ) +def test_get_emphasized_texts_from_paragraph( + opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]] +): + opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) + paragraph = partitioner._document.paragraphs[1] emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph)) assert paragraph.text == "I am a bold italic bold-italic text." 
@@ -317,34 +327,31 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dic assert emphasized_texts == [] -def test_iter_table_emphasis(expected_emphasized_texts: list[dict[str, str]]): - partitioner = _DocxPartitioner( - example_doc_path("fake-doc-emphasized-text.docx"), - None, - None, - False, - True, - None, - ) +def test_iter_table_emphasis( + opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]] +): + opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) table = partitioner._document.tables[0] + emphasized_texts = list(partitioner._iter_table_emphasis(table)) + assert emphasized_texts == expected_emphasized_texts def test_table_emphasis( + opts_args: dict[str, Any], expected_emphasized_text_contents: list[str], expected_emphasized_text_tags: list[str], ): - partitioner = _DocxPartitioner( - example_doc_path("fake-doc-emphasized-text.docx"), - None, - None, - False, - True, - None, - ) + opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) table = partitioner._document.tables[0] + emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table) + assert emphasized_text_contents == expected_emphasized_text_contents assert emphasized_text_tags == expected_emphasized_text_tags @@ -373,15 +380,10 @@ def test_partition_docx_with_json(mock_document_file_path: str): assert_round_trips_through_JSON(elements) -def test_parse_category_depth_by_style(): - partitioner = _DocxPartitioner( - example_doc_path("category-level.docx"), - None, - None, - False, - True, - None, - ) +def test_parse_category_depth_by_style(opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("category-level.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) # Category 
depths are 0-indexed and relative to the category type # Title, list item, bullet, narrative text, etc. @@ -411,9 +413,9 @@ def test_parse_category_depth_by_style(): ), f"expected paragraph[{idx}] to have depth=={depth}, got {actual_depth}" -def test_parse_category_depth_by_style_name(): - partitioner = _DocxPartitioner(None, None, None, False, True, None) - +def test_parse_category_depth_by_style_name(opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) test_cases = [ (0, "Heading 1"), (1, "Heading 2"), @@ -436,8 +438,9 @@ def test_parse_category_depth_by_style_name(): ), f"test case {test_cases[idx]} failed" -def test_parse_category_depth_by_style_ilvl(): - partitioner = _DocxPartitioner(None, None, None, False, True, None) +def test_parse_category_depth_by_style_ilvl(opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) assert partitioner._parse_category_depth_by_style_ilvl() == 0 @@ -683,6 +686,24 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> return filename +@pytest.fixture() +def opts_args() -> dict[str, Any]: + """All default arguments for `DocxPartitionerOptions`. + + Individual argument values can be changed to suit each test. Makes construction of opts more + compact for testing purposes. 
+ """ + return { + "date_from_file_object": False, + "file": None, + "file_path": None, + "include_page_breaks": True, + "infer_table_structure": True, + "metadata_file_path": None, + "metadata_last_modified": None, + } + + # ================================================================================================ # ISOLATED UNIT TESTS # ================================================================================================ @@ -691,14 +712,280 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> # ================================================================================================ +class DescribeDocxPartitionerOptions: + """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects.""" + + # -- .document ------------------------------- + + def it_loads_the_docx_document( + self, + request: FixtureRequest, + opts_args: dict[str, Any], + ): + document_ = instance_mock(request, Document) + docx_Document_ = function_mock( + request, "unstructured.partition.docx.docx.Document", return_value=document_ + ) + _docx_file_prop_ = property_mock( + request, DocxPartitionerOptions, "_docx_file", return_value="abcde.docx" + ) + opts = DocxPartitionerOptions(**opts_args) + + document = opts.document + + _docx_file_prop_.assert_called_once_with() + docx_Document_.assert_called_once_with("abcde.docx") + assert document is document_ + + # -- .include_page_breaks -------------------- + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream( + self, arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["include_page_breaks"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.include_page_breaks is arg_value + + # -- .infer_table_structure ------------------ + + @pytest.mark.parametrize("arg_value", [True, False]) + def it_knows_whether_to_include_text_as_html_in_Table_metadata( + self, 
arg_value: bool, opts_args: dict[str, Any] + ): + opts_args["infer_table_structure"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.infer_table_structure is arg_value + + # -- .increment_page_number() ---------------- + + def it_generates_a_PageBreak_element_when_the_page_number_is_incremented( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args) + + page_break_iter = opts.increment_page_number() + + assert isinstance(next(page_break_iter, None), PageBreak) + assert opts.page_number == 2 + with pytest.raises(StopIteration): + next(page_break_iter) + + def but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off( + self, opts_args: dict[str, Any] + ): + opts_args["include_page_breaks"] = False + opts = DocxPartitionerOptions(**opts_args) + + page_break_iter = opts.increment_page_number() + + with pytest.raises(StopIteration): + next(page_break_iter) + assert opts.page_number == 2 + + # -- .last_modified -------------------------- + + def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["metadata_last_modified"] = "2024-03-05T17:02:53" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.last_modified == "2024-03-05T17:02:53" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_: Mock + ): + opts_args["file_path"] = "a/b/document.docx" + get_last_modified_date_.return_value = "2024-04-02T20:32:35" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_.assert_called_once_with("a/b/document.docx") + assert last_modified == "2024-04-02T20:32:35" + + def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = 
io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = True + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_called_once_with(file) + assert last_modified == "2024-04-02T20:42:07" + + def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False( + self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts_args["date_from_file_object"] = False + get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07" + opts = DocxPartitionerOptions(**opts_args) + + last_modified = opts.last_modified + + get_last_modified_date_from_file_.assert_not_called() + assert last_modified is None + + # -- .metadata_file_path --------------------- + + def it_uses_the_user_provided_file_path_in_the_metadata_when_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "x/y/z.docx" + opts_args["metadata_file_path"] = "a/b/c.docx" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == "a/b/c.docx" + + @pytest.mark.parametrize("file_path", ["u/v/w.docx", None]) + def and_it_falls_back_to_the_document_file_path_otherwise( + self, file_path: str | None, opts_args: dict[str, Any] + ): + opts_args["file_path"] = file_path + opts_args["metadata_file_path"] = None + opts = DocxPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == file_path + + # -- ._metadata_page_number ------------------ + + @pytest.mark.parametrize( + ("page_count", "document_contains_pagebreaks", "expected_value"), + [(7, True, 7), (1, False, None)], + ) + def it_reports_None_when_no_rendered_page_breaks_are_found_in_document( + self, + request: FixtureRequest, + opts_args: dict[str, Any], + page_count: int, + document_contains_pagebreaks: bool, + 
expected_value: int | None, + ): + _document_contains_pagebreaks_prop_ = property_mock( + request, + DocxPartitionerOptions, + "_document_contains_pagebreaks", + return_value=document_contains_pagebreaks, + ) + opts = DocxPartitionerOptions(**opts_args) + opts._page_counter = page_count + + metadata_page_number = opts.metadata_page_number + + _document_contains_pagebreaks_prop_.assert_called_once_with() + assert metadata_page_number is expected_value + + # -- .page_number ---------------------------- + + def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]): + """Page-number starts at 1 and advances by one each time it is incremented.""" + opts = DocxPartitionerOptions(**opts_args) + + assert opts.page_number == 1 + list(opts.increment_page_number()) + assert opts.page_number == 2 + list(opts.increment_page_number()) + assert opts.page_number == 3 + + def it_assigns_the_correct_page_number_when_starting_page_number_is_given( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args, starting_page_number=3) + + assert opts.page_number == 3 + list(opts.increment_page_number()) + assert opts.page_number == 4 + + # -- ._document_contains_pagebreaks ---------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), [("page-breaks.docx", True), ("teams_chat.docx", False)] + ) + def it_knows_whether_the_document_contains_page_breaks( + self, opts_args: dict[str, Any], file_name: str, expected_value: bool + ): + opts_args["file_path"] = example_doc_path(file_name) + opts = DocxPartitionerOptions(**opts_args) + + assert opts._document_contains_pagebreaks is expected_value + + # -- ._docx_file ----------------------------- + + def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = "l/m/n.docx" + opts = DocxPartitionerOptions(**opts_args) + + assert opts._docx_file == "l/m/n.docx" + + def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( + self, 
opts_args: dict[str, Any] + ): + spooled_temp_file = tempfile.SpooledTemporaryFile() + spooled_temp_file.write(b"abcdefg") + opts_args["file"] = spooled_temp_file + opts = DocxPartitionerOptions(**opts_args) + + docx_file = opts._docx_file + + assert docx_file is not spooled_temp_file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" + + def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( + self, opts_args: dict[str, Any] + ): + file = io.BytesIO(b"abcdefg") + opts_args["file"] = file + opts = DocxPartitionerOptions(**opts_args) + + docx_file = opts._docx_file + + assert docx_file is file + assert isinstance(docx_file, io.BytesIO) + assert docx_file.getvalue() == b"abcdefg" + + def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided( + self, opts_args: dict[str, Any] + ): + opts = DocxPartitionerOptions(**opts_args) + + with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "): + opts._docx_file + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture() + def get_last_modified_date_(self, request: FixtureRequest) -> Mock: + return function_mock(request, "unstructured.partition.docx.get_last_modified_date") + + @pytest.fixture() + def get_last_modified_date_from_file_(self, request: FixtureRequest): + return function_mock( + request, "unstructured.partition.docx.get_last_modified_date_from_file" + ) + + class Describe_DocxPartitioner: """Unit-test suite for `unstructured.partition.docx._DocxPartitioner`.""" # -- table behaviors ------------------------------------------------------------------------- - def it_can_convert_a_table_to_html(self): + def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] - assert _DocxPartitioner()._convert_table_to_html(table) == ( + + 
assert _DocxPartitioner(opts)._convert_table_to_html(table) == ( "\n" "\n" "\n" @@ -709,7 +996,7 @@ def it_can_convert_a_table_to_html(self): "
Header Col 1 Header Col 2
" ) - def and_it_can_convert_a_nested_table_to_html(self): + def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]): """ Fixture table is: @@ -725,10 +1012,11 @@ def and_it_can_convert_a_nested_table_to_html(self): | j | k | l | +---+-------------+---+ """ + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] # -- re.sub() strips out the extra padding inserted by tabulate -- - html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table)) + html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table)) expected_lines = [ "", @@ -750,13 +1038,15 @@ def and_it_can_convert_a_nested_table_to_html(self): for expected, actual in zip(expected_lines, actual_lines): assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}" - def it_can_convert_a_table_to_plain_text(self): + def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]): + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[0] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == ( + + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == ( "Header Col 1 Header Col 2 Lorem ipsum A link example" ) - def and_it_can_convert_a_nested_table_to_plain_text(self): + def and_it_can_convert_a_nested_table_to_plain_text(self, opts_args: dict[str, Any]): """ Fixture table is: @@ -772,12 +1062,14 @@ def and_it_can_convert_a_nested_table_to_plain_text(self): | j | k | l | +---+-------------+---+ """ + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[1] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == ( + + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == ( "a >b< c d e f g&t h i j k l" ) - def but_the_text_of_a_merged_cell_appears_only_once(self): + def 
but_the_text_of_a_merged_cell_appears_only_once(self, opts_args: dict[str, Any]): """ Fixture table is: @@ -789,8 +1081,9 @@ def but_the_text_of_a_merged_cell_appears_only_once(self): | e | | +-------+---+ """ + opts = DocxPartitionerOptions(**opts_args) table = docx.Document(example_doc_path("docx-tables.docx")).tables[2] - assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e" + assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == "a b c d e" def it_can_partition_tables_with_incomplete_rows(self): """DOCX permits table rows to start late and end early. @@ -921,7 +1214,7 @@ def it_can_partition_tables_with_incomplete_rows(self): # -- page-break behaviors -------------------------------------------------------------------- - def it_places_page_breaks_precisely_where_they_occur(self): + def it_places_page_breaks_precisely_where_they_occur(self, opts_args: dict[str, Any]): """Page-break behavior has some subtleties. * A hard page-break does not generate a PageBreak element (because that would double-count @@ -940,6 +1233,8 @@ def str_repr(e: Element) -> str: """A more detailed `repr()` to aid debugging when assertion fails.""" return f"{e.__class__.__name__}('{e}')" + opts_args["file_path"] = example_doc_path("page-breaks.docx") + opts = DocxPartitionerOptions(**opts_args) expected = [ # NOTE(scanny) - -- page 1 -- NarrativeText( @@ -975,7 +1270,7 @@ def str_repr(e: Element) -> str: Title("< str: # -- header/footer behaviors ----------------------------------------------------------------- - def it_includes_table_cell_text_in_Header_text(self): - partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx")) + def it_includes_table_cell_text_in_Header_text(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("docx-hdrftr.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) section = partitioner._document.sections[0] header_iter = 
partitioner._iter_section_headers(section) @@ -995,9 +1292,11 @@ def it_includes_table_cell_text_in_Header_text(self): element = next(header_iter) assert element.text == "First header para\nTable cell1 Table cell2\nLast header para" - def it_includes_table_cell_text_in_Footer_text(self): + def it_includes_table_cell_text_in_Footer_text(self, opts_args: dict[str, Any]): """This case also verifies nested-table and merged-cell behaviors.""" - partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx")) + opts_args["file_path"] = example_doc_path("docx-hdrftr.docx") + opts = DocxPartitionerOptions(**opts_args) + partitioner = _DocxPartitioner(opts) section = partitioner._document.sections[0] footer_iter = partitioner._iter_section_footers(section) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4c908fc3d0..83c1597480 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev7" # pragma: no cover +__version__ = "0.13.8-dev8" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 97aa88e7ed..430f87a301 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -217,19 +217,19 @@ def partition_docx( Assign this number to the first page of this document and increment the page number from there. 
""" - # -- verify that only one file-specifier argument was provided -- - exactly_one(filename=filename, file=file) - - elements = _DocxPartitioner.iter_document_elements( - filename, - file, - metadata_filename, - include_page_breaks, - infer_table_structure, - metadata_last_modified, - date_from_file_object, + opts = DocxPartitionerOptions( + date_from_file_object=date_from_file_object, + file=file, + file_path=filename, + include_page_breaks=include_page_breaks, + infer_table_structure=infer_table_structure, + metadata_file_path=metadata_filename, + metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, ) + + elements = _DocxPartitioner.iter_document_elements(opts) + elements = apply_lang_metadata( elements=elements, languages=languages, @@ -238,56 +238,169 @@ def partition_docx( return list(elements) -class _DocxPartitioner: - """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" +class DocxPartitionerOptions: + """Encapsulates partitioning option validation, computation, and application of defaults.""" def __init__( self, - # -- NOTE(scanny): default values here are unnecessary for production use because - # -- `.iter_document_elements()` is the only interface method and always calls with all - # -- args. However, providing defaults eases unit-testing and decouples unit-tests from - # -- future changes to args. 
- filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - include_page_breaks: bool = True, - infer_table_structure: bool = True, - metadata_last_modified: Optional[str] = None, - date_from_file_object: bool = False, + *, + date_from_file_object: bool, + file: IO[bytes] | None, + file_path: str | None, + include_page_breaks: bool, + infer_table_structure: bool, + metadata_file_path: Optional[str], + metadata_last_modified: Optional[str], starting_page_number: int = 1, - ) -> None: - self._filename = filename + ): + self._date_from_file_object = date_from_file_object self._file = file - self._metadata_filename = metadata_filename + self._file_path = file_path self._include_page_breaks = include_page_breaks self._infer_table_structure = infer_table_structure + self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified + # -- options object maintains page-number state -- self._page_counter = starting_page_number - self._date_from_file_object = date_from_file_object + + @lazyproperty + def document(self) -> Document: + """The python-docx `Document` object loaded from file or filename.""" + return docx.Document(self._docx_file) + + @lazyproperty + def include_page_breaks(self) -> bool: + """When True, include `PageBreak` elements in element-stream. + + Note that regardless of this setting, page-breaks are detected, and page-number is tracked + and included in element metadata. Only the presence of distinct `PageBreak` elements (which + contain no text) in the element stream is affected. 
+ """ + return self._include_page_breaks + + def increment_page_number(self) -> Iterator[PageBreak]: + """Increment page-number by 1 and generate a PageBreak element if enabled.""" + self._page_counter += 1 + # -- only emit page-breaks when enabled -- + if self._include_page_breaks: + yield PageBreak("", detection_origin=DETECTION_ORIGIN) + + @lazyproperty + def infer_table_structure(self) -> bool: + """True when partitioner should compute and apply `text_as_html` metadata for tables.""" + return self._infer_table_structure + + @lazyproperty + def last_modified(self) -> Optional[str]: + """The best last-modified date available, None if no sources are available.""" + # -- Value explicitly specified by caller takes precedence. This is used for example when + # -- this file was converted from another format, and any last-modified date for the file + # -- would be just now. + if self._metadata_last_modified: + return self._metadata_last_modified + + if self._file_path: + return ( + None + if is_temp_file_path(self._file_path) + else get_last_modified_date(self._file_path) + ) + + if self._file: + return ( + get_last_modified_date_from_file(self._file) + if self._date_from_file_object + else None + ) + + return None + + @lazyproperty + def metadata_file_path(self) -> str | None: + """The best available file-path for this document or `None` if unavailable.""" + return self._metadata_file_path or self._file_path + + @property + def metadata_page_number(self) -> Optional[int]: + """The current page number to report in metadata, or None if we can't really tell. + + Page numbers are not added to element metadata if we can't find any page-breaks in the + document (which may be a common case). + + In the DOCX format, determining page numbers is strictly a best-efforts attempt since + actual page-breaks are determined at rendering time (e.g. printing) based on the + font-metrics of the target device. 
Explicit (hard) page-breaks are always recorded in the + docx file but the rendered page-breaks are only added optionally. + """ + return self._page_counter if self._document_contains_pagebreaks else None + + @property + def page_number(self) -> int: + """The current page number. + + Note this value may not represent the actual rendered page number when rendered page-break + indicators are not present in the document (not uncommon). Use `.metadata_page_number` for + metadata purposes, which is `None` when rendered page-breaks are not present in this + document. + """ + return self._page_counter + + @lazyproperty + def _document_contains_pagebreaks(self) -> bool: + """True when there is at least one page-break detected in the document. + + Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably + inserted by Microsoft Word, but probably don't appear in documents converted into .docx + format from for example .odt format. + """ + xpath = ( + # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can + # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which + # is w:p inner-content and both of these can occur inside a table-cell as well as the + # document body + "./w:body/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" + " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" + ) + + return bool(self.document.element.xpath(xpath)) + + @lazyproperty + def _docx_file(self) -> str | IO[bytes]: + """The Word 2007+ document file to be partitioned. + + This is either a `str` path or a file-like object. `python-docx` accepts either for opening + a document file. + """ + if self._file_path: + return self._file_path + + # -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an + # -- exception when Zipfile tries to open it. 
The docx format is a zip archive so we need + # -- to work around that bug here. + if isinstance(self._file, tempfile.SpooledTemporaryFile): + self._file.seek(0) + return io.BytesIO(self._file.read()) + + if self._file: + return self._file + + raise ValueError( + "No DOCX document specified, either `filename` or `file` argument must be provided" + ) + + +class _DocxPartitioner: + """Provides `.partition()` for MS-Word 2007+ (.docx) files.""" + + def __init__(self, opts: DocxPartitionerOptions) -> None: + self._opts = opts @classmethod - def iter_document_elements( - cls, - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - metadata_filename: Optional[str] = None, - include_page_breaks: bool = True, - infer_table_structure: bool = True, - metadata_last_modified: Optional[str] = None, - date_from_file_object: bool = False, - starting_page_number: int = 1, - ) -> Iterator[Element]: + def iter_document_elements(cls, opts: DocxPartitionerOptions) -> Iterator[Element]: """Partition MS Word documents (.docx format) into its document elements.""" - self = cls( - filename=filename, - file=file, - metadata_filename=metadata_filename, - include_page_breaks=include_page_breaks, - infer_table_structure=infer_table_structure, - metadata_last_modified=metadata_last_modified, - date_from_file_object=date_from_file_object, - starting_page_number=starting_page_number, - ) + self = cls(opts) # NOTE(scanny): It's possible for a Word document to have no sections. In particular, a # Microsoft Teams chat transcript exported to DOCX contains no sections. 
Such a # "section-less" document has to be iterated differently and has no headers or footers and @@ -452,37 +565,7 @@ def iter_row_cells_as_text(row: _Row) -> Iterator[str]: @lazyproperty def _document(self) -> Document: """The python-docx `Document` object loaded from file or filename.""" - filename, file = self._filename, self._file - - if filename is not None: - return docx.Document(filename) - - assert file is not None - if isinstance(file, tempfile.SpooledTemporaryFile): - file.seek(0) - file = io.BytesIO(file.read()) - return docx.Document(file) - - @lazyproperty - def _document_contains_pagebreaks(self) -> bool: - """True when there is at least one page-break detected in the document. - - Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably - inserted by Microsoft Word, but probably don't appear in documents converted into .docx - format from for example .odt format. - """ - xpath = ( - # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can - # appear in a paragraph (w:p). 
w:r can also appear in a hyperlink (w:hyperlink), which - # is w:p inner-content and both of these can occur inside a table-cell as well as the - # document body - "./w:body/w:p/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak" - " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak" - ) - - return bool(self._document.element.xpath(xpath)) + return self._opts.document @lazyproperty def _document_contains_sections(self) -> bool: @@ -524,12 +607,6 @@ def iter_hdrftr_texts(hdrftr: _Header | _Footer) -> Iterator[str]: return "\n".join(text for text in iter_hdrftr_texts(hdrftr) if text) - def _increment_page_number(self) -> Iterator[PageBreak]: - """Increment page-number by 1 and generate a PageBreak element if enabled.""" - self._page_counter += 1 - if self._include_page_breaks: - yield PageBreak("", detection_origin=DETECTION_ORIGIN) - def _is_list_item(self, paragraph: Paragraph) -> bool: """True when `paragraph` can be identified as a list-item.""" if is_bulleted_text(paragraph.text): @@ -581,7 +658,7 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP if isinstance(item, Paragraph): yield from self._classify_paragraph_to_element(item) else: - yield from self._increment_page_number() + yield from self._opts.increment_page_number() def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, str]]: """Generate e.g. 
{"text": "MUST", "tag": "b"} for each emphasis in `paragraph`.""" @@ -616,7 +693,7 @@ def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]: text=text, detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( - filename=self._metadata_filename, + filename=self._opts.metadata_file_path, header_footer_type=header_footer_type, category_depth=0, ), @@ -645,7 +722,7 @@ def maybe_iter_header(header: _Header, header_footer_type: str) -> Iterator[Head text=text, detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( - filename=self._metadata_filename, + filename=self._opts.metadata_file_path, header_footer_type=header_footer_type, category_depth=0, # -- headers are always at the root level} ), @@ -668,7 +745,7 @@ def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Itera """ def page_is_odd() -> bool: - return self._page_counter % 2 == 1 + return self._opts.page_number % 2 == 1 start_type = section.start_type @@ -682,14 +759,14 @@ def page_is_odd() -> bool: # -- on an even page we need two total, add one to supplement the rendered page break # -- to follow. There is no "first-document-page" special case because 1 is odd. if not page_is_odd(): - yield from self._increment_page_number() + yield from self._opts.increment_page_number() elif start_type == WD_SECTION_START.ODD_PAGE: # -- the first page of the document is an implicit "new" odd-page, so no page-break -- if section_idx == 0: return if page_is_odd(): - yield from self._increment_page_number() + yield from self._opts.increment_page_number() # -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of # -- which need our help to get the page-breaks right. @@ -699,7 +776,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: """Generate zero-or-one Table element for a DOCX `w:tbl` XML element.""" # -- at present, we always generate exactly one Table element, but we might want # -- to skip, for example, an empty table. 
- html_table = self._convert_table_to_html(table) if self._infer_table_structure else None + html_table = ( + self._convert_table_to_html(table) if self._opts.infer_table_structure else None + ) text_table = " ".join(self._iter_table_texts(table)) emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table) @@ -708,9 +787,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]: detection_origin=DETECTION_ORIGIN, metadata=ElementMetadata( text_as_html=html_table, - filename=self._metadata_filename, - page_number=self._page_number, - last_modified=self._last_modified, + filename=self._opts.metadata_file_path, + page_number=self._opts.metadata_page_number, + last_modified=self._opts.last_modified, emphasized_text_contents=emphasized_text_contents or None, emphasized_text_tags=emphasized_text_tags or None, ), @@ -753,41 +832,6 @@ def iter_cell_texts(cell: _Cell) -> Iterator[str]: # -- do not generate empty strings -- yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text) - @lazyproperty - def _last_modified(self) -> Optional[str]: - """Last-modified date suitable for use in element metadata.""" - # -- if this file was converted from another format, any last-modified date for the file - # -- will be today, so we get it from the conversion step in `._metadata_last_modified`. - if self._metadata_last_modified: - return self._metadata_last_modified - - file_path, file = self._filename, self._file - - # -- if the file is on the filesystem, get its date from there -- - if file_path is not None: - return None if is_temp_file_path(file_path) else get_last_modified_date(file_path) - - # -- otherwise, as long as user explicitly requested it, try getting it from the file-like - # -- object (unlikely since BytesIO and its brethren have no such metadata). 
- assert file is not None - if self._date_from_file_object: - return get_last_modified_date_from_file(file) - return None - - @property - def _page_number(self) -> Optional[int]: - """The current page number, or None if we can't really tell. - - Page numbers are not added to element metadata if we can't find any page-breaks in the - document (which may be a common case). - - In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual - page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the - target device. Explicit (hard) page-breaks are always recorded in the docx file but the - rendered page-breaks are only added optionally. - """ - return self._page_counter if self._document_contains_pagebreaks else None - def _paragraph_emphasis(self, paragraph: Paragraph) -> tuple[list[str], list[str]]: """[contents, tags] pair describing emphasized text in `paragraph`.""" iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph)) @@ -842,12 +886,12 @@ def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata: category_depth=category_depth, emphasized_text_contents=emphasized_text_contents or None, emphasized_text_tags=emphasized_text_tags or None, - filename=self._metadata_filename, - last_modified=self._last_modified, + filename=self._opts.metadata_file_path, + last_modified=self._opts.last_modified, link_texts=link_texts or None, link_urls=link_urls or None, links=links or None, - page_number=self._page_number, + page_number=self._opts.metadata_page_number, ) element_metadata.detection_origin = "docx" return element_metadata