diff --git a/CHANGELOG.md b/CHANGELOG.md
index 89caeaa410..0496479af1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,8 @@
-## 0.13.8-dev7
+## 0.13.8-dev8
### Enhancements
-**Faster evaluation** Support for concurrent processing of documents during evaluation
+* **Faster evaluation** Support for concurrent processing of documents during evaluation
### Features
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
index 59e90540a3..9e89d99737 100644
--- a/test_unstructured/partition/docx/test_docx.py
+++ b/test_unstructured/partition/docx/test_docx.py
@@ -4,16 +4,26 @@
from __future__ import annotations
+import io
import pathlib
import re
import tempfile
+from typing import Any
import docx
import pytest
from docx.document import Document
from pytest_mock import MockFixture
-from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
+from test_unstructured.unit_utils import (
+ FixtureRequest,
+ Mock,
+ assert_round_trips_through_JSON,
+ example_doc_path,
+ function_mock,
+ instance_mock,
+ property_mock,
+)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
Address,
@@ -29,7 +39,7 @@
Text,
Title,
)
-from unstructured.partition.docx import _DocxPartitioner, partition_docx
+from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
# -- docx-file loading behaviors -----------------------------------------------------------------
@@ -89,14 +99,16 @@ def test_partition_docx_from_file_with_metadata_filename(
assert element.metadata.filename == "test"
-def test_partition_docx_raises_with_both_specified(mock_document_file_path: str):
- with open(mock_document_file_path, "rb") as f:
- with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
- partition_docx(filename=mock_document_file_path, file=f)
+def test_partition_docx_uses_file_path_when_both_are_specified(
+ mock_document_file_path: str, expected_elements: list[Text]
+):
+ f = io.BytesIO(b"abcde")
+ elements = partition_docx(filename=mock_document_file_path, file=f)
+ assert elements == expected_elements
def test_partition_docx_raises_with_neither():
- with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
+ with pytest.raises(ValueError, match="either `filename` or `file` argument must be provided"):
partition_docx()
@@ -292,15 +304,13 @@ def test_partition_docx_from_file_without_metadata_date():
assert elements[0].metadata.last_modified is None
-def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dict[str, str]]):
- partitioner = _DocxPartitioner(
- example_doc_path("fake-doc-emphasized-text.docx"),
- None,
- None,
- False,
- True,
- None,
- )
+def test_get_emphasized_texts_from_paragraph(
+ opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
+):
+ opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
+
paragraph = partitioner._document.paragraphs[1]
emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph))
assert paragraph.text == "I am a bold italic bold-italic text."
@@ -317,34 +327,31 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dic
assert emphasized_texts == []
-def test_iter_table_emphasis(expected_emphasized_texts: list[dict[str, str]]):
- partitioner = _DocxPartitioner(
- example_doc_path("fake-doc-emphasized-text.docx"),
- None,
- None,
- False,
- True,
- None,
- )
+def test_iter_table_emphasis(
+ opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
+):
+ opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
table = partitioner._document.tables[0]
+
emphasized_texts = list(partitioner._iter_table_emphasis(table))
+
assert emphasized_texts == expected_emphasized_texts
def test_table_emphasis(
+ opts_args: dict[str, Any],
expected_emphasized_text_contents: list[str],
expected_emphasized_text_tags: list[str],
):
- partitioner = _DocxPartitioner(
- example_doc_path("fake-doc-emphasized-text.docx"),
- None,
- None,
- False,
- True,
- None,
- )
+ opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
table = partitioner._document.tables[0]
+
emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table)
+
assert emphasized_text_contents == expected_emphasized_text_contents
assert emphasized_text_tags == expected_emphasized_text_tags
@@ -373,15 +380,10 @@ def test_partition_docx_with_json(mock_document_file_path: str):
assert_round_trips_through_JSON(elements)
-def test_parse_category_depth_by_style():
- partitioner = _DocxPartitioner(
- example_doc_path("category-level.docx"),
- None,
- None,
- False,
- True,
- None,
- )
+def test_parse_category_depth_by_style(opts_args: dict[str, Any]):
+ opts_args["file_path"] = example_doc_path("category-level.docx")
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
# Category depths are 0-indexed and relative to the category type
# Title, list item, bullet, narrative text, etc.
@@ -411,9 +413,9 @@ def test_parse_category_depth_by_style():
), f"expected paragraph[{idx}] to have depth=={depth}, got {actual_depth}"
-def test_parse_category_depth_by_style_name():
- partitioner = _DocxPartitioner(None, None, None, False, True, None)
-
+def test_parse_category_depth_by_style_name(opts_args: dict[str, Any]):
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
test_cases = [
(0, "Heading 1"),
(1, "Heading 2"),
@@ -436,8 +438,9 @@ def test_parse_category_depth_by_style_name():
), f"test case {test_cases[idx]} failed"
-def test_parse_category_depth_by_style_ilvl():
- partitioner = _DocxPartitioner(None, None, None, False, True, None)
+def test_parse_category_depth_by_style_ilvl(opts_args: dict[str, Any]):
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
assert partitioner._parse_category_depth_by_style_ilvl() == 0
@@ -683,6 +686,24 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) ->
return filename
+@pytest.fixture()
+def opts_args() -> dict[str, Any]:
+ """All default arguments for `DocxPartitionerOptions`.
+
+ Individual argument values can be changed to suit each test. Makes construction of opts more
+ compact for testing purposes.
+ """
+ return {
+ "date_from_file_object": False,
+ "file": None,
+ "file_path": None,
+ "include_page_breaks": True,
+ "infer_table_structure": True,
+ "metadata_file_path": None,
+ "metadata_last_modified": None,
+ }
+
+
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
@@ -691,14 +712,280 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) ->
# ================================================================================================
+class DescribeDocxPartitionerOptions:
+ """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""
+
+ # -- .document -------------------------------
+
+ def it_loads_the_docx_document(
+ self,
+ request: FixtureRequest,
+ opts_args: dict[str, Any],
+ ):
+ document_ = instance_mock(request, Document)
+ docx_Document_ = function_mock(
+ request, "unstructured.partition.docx.docx.Document", return_value=document_
+ )
+ _docx_file_prop_ = property_mock(
+ request, DocxPartitionerOptions, "_docx_file", return_value="abcde.docx"
+ )
+ opts = DocxPartitionerOptions(**opts_args)
+
+ document = opts.document
+
+ _docx_file_prop_.assert_called_once_with()
+ docx_Document_.assert_called_once_with("abcde.docx")
+ assert document is document_
+
+ # -- .include_page_breaks --------------------
+
+ @pytest.mark.parametrize("arg_value", [True, False])
+ def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
+ self, arg_value: bool, opts_args: dict[str, Any]
+ ):
+ opts_args["include_page_breaks"] = arg_value
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts.include_page_breaks is arg_value
+
+ # -- .infer_table_structure ------------------
+
+ @pytest.mark.parametrize("arg_value", [True, False])
+ def it_knows_whether_to_include_text_as_html_in_Table_metadata(
+ self, arg_value: bool, opts_args: dict[str, Any]
+ ):
+ opts_args["infer_table_structure"] = arg_value
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts.infer_table_structure is arg_value
+
+ # -- .increment_page_number() ----------------
+
+ def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
+ self, opts_args: dict[str, Any]
+ ):
+ opts = DocxPartitionerOptions(**opts_args)
+
+ page_break_iter = opts.increment_page_number()
+
+ assert isinstance(next(page_break_iter, None), PageBreak)
+ assert opts.page_number == 2
+ with pytest.raises(StopIteration):
+ next(page_break_iter)
+
+ def but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
+ self, opts_args: dict[str, Any]
+ ):
+ opts_args["include_page_breaks"] = False
+ opts = DocxPartitionerOptions(**opts_args)
+
+ page_break_iter = opts.increment_page_number()
+
+ with pytest.raises(StopIteration):
+ next(page_break_iter)
+ assert opts.page_number == 2
+
+ # -- .last_modified --------------------------
+
+ def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
+ self, opts_args: dict[str, Any]
+ ):
+ opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts.last_modified == "2024-03-05T17:02:53"
+
+ def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
+ self, opts_args: dict[str, Any], get_last_modified_date_: Mock
+ ):
+ opts_args["file_path"] = "a/b/document.docx"
+ get_last_modified_date_.return_value = "2024-04-02T20:32:35"
+ opts = DocxPartitionerOptions(**opts_args)
+
+ last_modified = opts.last_modified
+
+ get_last_modified_date_.assert_called_once_with("a/b/document.docx")
+ assert last_modified == "2024-04-02T20:32:35"
+
+ def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
+ self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
+ ):
+ file = io.BytesIO(b"abcdefg")
+ opts_args["file"] = file
+ opts_args["date_from_file_object"] = True
+ get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
+ opts = DocxPartitionerOptions(**opts_args)
+
+ last_modified = opts.last_modified
+
+ get_last_modified_date_from_file_.assert_called_once_with(file)
+ assert last_modified == "2024-04-02T20:42:07"
+
+ def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
+ self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
+ ):
+ file = io.BytesIO(b"abcdefg")
+ opts_args["file"] = file
+ opts_args["date_from_file_object"] = False
+ get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
+ opts = DocxPartitionerOptions(**opts_args)
+
+ last_modified = opts.last_modified
+
+ get_last_modified_date_from_file_.assert_not_called()
+ assert last_modified is None
+
+ # -- .metadata_file_path ---------------------
+
+ def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
+ self, opts_args: dict[str, Any]
+ ):
+ opts_args["file_path"] = "x/y/z.docx"
+ opts_args["metadata_file_path"] = "a/b/c.docx"
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts.metadata_file_path == "a/b/c.docx"
+
+ @pytest.mark.parametrize("file_path", ["u/v/w.docx", None])
+ def and_it_falls_back_to_the_document_file_path_otherwise(
+ self, file_path: str | None, opts_args: dict[str, Any]
+ ):
+ opts_args["file_path"] = file_path
+ opts_args["metadata_file_path"] = None
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts.metadata_file_path == file_path
+
+ # -- .metadata_page_number -------------------
+
+    @pytest.mark.parametrize(
+        ("page_count", "document_contains_pagebreaks", "expected_value"),
+        [(7, True, 7), (1, False, None)],
+    )
+    def it_reports_None_when_no_rendered_page_breaks_are_found_in_document(
+        self,
+        request: FixtureRequest,
+        opts_args: dict[str, Any],
+        page_count: int,
+        document_contains_pagebreaks: bool,
+        expected_value: int | None,
+    ):
+        _document_contains_pagebreaks_prop_ = property_mock(
+            request,
+            DocxPartitionerOptions,
+            "_document_contains_pagebreaks",
+            return_value=document_contains_pagebreaks,
+        )
+        opts = DocxPartitionerOptions(**opts_args)
+        opts._page_counter = page_count
+
+        metadata_page_number = opts.metadata_page_number
+
+        _document_contains_pagebreaks_prop_.assert_called_once_with()
+        assert metadata_page_number == expected_value  # -- not `is`: ints compare by value --
+
+ # -- .page_number ----------------------------
+
+    def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
+        """The page-number starts at 1 and advances by one on each page-number increment."""
+        opts = DocxPartitionerOptions(**opts_args)
+
+        assert opts.page_number == 1
+        list(opts.increment_page_number())
+        assert opts.page_number == 2
+        list(opts.increment_page_number())
+        assert opts.page_number == 3
+
+ def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
+ self, opts_args: dict[str, Any]
+ ):
+ opts = DocxPartitionerOptions(**opts_args, starting_page_number=3)
+
+ assert opts.page_number == 3
+ list(opts.increment_page_number())
+ assert opts.page_number == 4
+
+ # -- ._document_contains_pagebreaks ----------
+
+ @pytest.mark.parametrize(
+ ("file_name", "expected_value"), [("page-breaks.docx", True), ("teams_chat.docx", False)]
+ )
+ def it_knows_whether_the_document_contains_page_breaks(
+ self, opts_args: dict[str, Any], file_name: str, expected_value: bool
+ ):
+ opts_args["file_path"] = example_doc_path(file_name)
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts._document_contains_pagebreaks is expected_value
+
+ # -- ._docx_file -----------------------------
+
+ def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
+ self, opts_args: dict[str, Any]
+ ):
+ opts_args["file_path"] = "l/m/n.docx"
+ opts = DocxPartitionerOptions(**opts_args)
+
+ assert opts._docx_file == "l/m/n.docx"
+
+    def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
+        self, opts_args: dict[str, Any]
+    ):
+        with tempfile.SpooledTemporaryFile() as spooled_temp_file:  # -- always closed on exit --
+            spooled_temp_file.write(b"abcdefg")
+            opts_args["file"] = spooled_temp_file
+            opts = DocxPartitionerOptions(**opts_args)
+
+            docx_file = opts._docx_file
+
+            assert docx_file is not spooled_temp_file
+            assert isinstance(docx_file, io.BytesIO)
+            assert docx_file.getvalue() == b"abcdefg"
+
+ def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
+ self, opts_args: dict[str, Any]
+ ):
+ file = io.BytesIO(b"abcdefg")
+ opts_args["file"] = file
+ opts = DocxPartitionerOptions(**opts_args)
+
+ docx_file = opts._docx_file
+
+ assert docx_file is file
+ assert isinstance(docx_file, io.BytesIO)
+ assert docx_file.getvalue() == b"abcdefg"
+
+ def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
+ self, opts_args: dict[str, Any]
+ ):
+ opts = DocxPartitionerOptions(**opts_args)
+
+ with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
+ opts._docx_file
+
+ # -- fixtures --------------------------------------------------------------------------------
+
+ @pytest.fixture()
+ def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
+ return function_mock(request, "unstructured.partition.docx.get_last_modified_date")
+
+ @pytest.fixture()
+ def get_last_modified_date_from_file_(self, request: FixtureRequest):
+ return function_mock(
+ request, "unstructured.partition.docx.get_last_modified_date_from_file"
+ )
+
+
class Describe_DocxPartitioner:
"""Unit-test suite for `unstructured.partition.docx._DocxPartitioner`."""
# -- table behaviors -------------------------------------------------------------------------
- def it_can_convert_a_table_to_html(self):
+ def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]):
+ opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
- assert _DocxPartitioner()._convert_table_to_html(table) == (
+
+ assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
"
\n"
"\n"
"Header Col 1 | Header Col 2 |
\n"
@@ -709,7 +996,7 @@ def it_can_convert_a_table_to_html(self):
"
"
)
- def and_it_can_convert_a_nested_table_to_html(self):
+ def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]):
"""
Fixture table is:
@@ -725,10 +1012,11 @@ def and_it_can_convert_a_nested_table_to_html(self):
| j | k | l |
+---+-------------+---+
"""
+ opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
# -- re.sub() strips out the extra padding inserted by tabulate --
- html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table))
+ html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))
expected_lines = [
"",
@@ -750,13 +1038,15 @@ def and_it_can_convert_a_nested_table_to_html(self):
for expected, actual in zip(expected_lines, actual_lines):
assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}"
- def it_can_convert_a_table_to_plain_text(self):
+ def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
+ opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
- assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == (
+
+ assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == (
"Header Col 1 Header Col 2 Lorem ipsum A link example"
)
- def and_it_can_convert_a_nested_table_to_plain_text(self):
+ def and_it_can_convert_a_nested_table_to_plain_text(self, opts_args: dict[str, Any]):
"""
Fixture table is:
@@ -772,12 +1062,14 @@ def and_it_can_convert_a_nested_table_to_plain_text(self):
| j | k | l |
+---+-------------+---+
"""
+ opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
- assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == (
+
+ assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == (
"a >b< c d e f g&t h i j k l"
)
- def but_the_text_of_a_merged_cell_appears_only_once(self):
+ def but_the_text_of_a_merged_cell_appears_only_once(self, opts_args: dict[str, Any]):
"""
Fixture table is:
@@ -789,8 +1081,9 @@ def but_the_text_of_a_merged_cell_appears_only_once(self):
| e | |
+-------+---+
"""
+ opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
- assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
+ assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == "a b c d e"
def it_can_partition_tables_with_incomplete_rows(self):
"""DOCX permits table rows to start late and end early.
@@ -921,7 +1214,7 @@ def it_can_partition_tables_with_incomplete_rows(self):
# -- page-break behaviors --------------------------------------------------------------------
- def it_places_page_breaks_precisely_where_they_occur(self):
+ def it_places_page_breaks_precisely_where_they_occur(self, opts_args: dict[str, Any]):
"""Page-break behavior has some subtleties.
* A hard page-break does not generate a PageBreak element (because that would double-count
@@ -940,6 +1233,8 @@ def str_repr(e: Element) -> str:
"""A more detailed `repr()` to aid debugging when assertion fails."""
return f"{e.__class__.__name__}('{e}')"
+ opts_args["file_path"] = example_doc_path("page-breaks.docx")
+ opts = DocxPartitionerOptions(**opts_args)
expected = [
# NOTE(scanny) - -- page 1 --
NarrativeText(
@@ -975,7 +1270,7 @@ def str_repr(e: Element) -> str:
Title("< str:
# -- header/footer behaviors -----------------------------------------------------------------
- def it_includes_table_cell_text_in_Header_text(self):
- partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx"))
+ def it_includes_table_cell_text_in_Header_text(self, opts_args: dict[str, Any]):
+ opts_args["file_path"] = example_doc_path("docx-hdrftr.docx")
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
section = partitioner._document.sections[0]
header_iter = partitioner._iter_section_headers(section)
@@ -995,9 +1292,11 @@ def it_includes_table_cell_text_in_Header_text(self):
element = next(header_iter)
assert element.text == "First header para\nTable cell1 Table cell2\nLast header para"
- def it_includes_table_cell_text_in_Footer_text(self):
+ def it_includes_table_cell_text_in_Footer_text(self, opts_args: dict[str, Any]):
"""This case also verifies nested-table and merged-cell behaviors."""
- partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx"))
+ opts_args["file_path"] = example_doc_path("docx-hdrftr.docx")
+ opts = DocxPartitionerOptions(**opts_args)
+ partitioner = _DocxPartitioner(opts)
section = partitioner._document.sections[0]
footer_iter = partitioner._iter_section_footers(section)
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 4c908fc3d0..83c1597480 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.8-dev7" # pragma: no cover
+__version__ = "0.13.8-dev8" # pragma: no cover
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
index 97aa88e7ed..430f87a301 100644
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@@ -217,19 +217,19 @@ def partition_docx(
Assign this number to the first page of this document and increment the page number from
there.
"""
- # -- verify that only one file-specifier argument was provided --
- exactly_one(filename=filename, file=file)
-
- elements = _DocxPartitioner.iter_document_elements(
- filename,
- file,
- metadata_filename,
- include_page_breaks,
- infer_table_structure,
- metadata_last_modified,
- date_from_file_object,
+ opts = DocxPartitionerOptions(
+ date_from_file_object=date_from_file_object,
+ file=file,
+ file_path=filename,
+ include_page_breaks=include_page_breaks,
+ infer_table_structure=infer_table_structure,
+ metadata_file_path=metadata_filename,
+ metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
)
+
+ elements = _DocxPartitioner.iter_document_elements(opts)
+
elements = apply_lang_metadata(
elements=elements,
languages=languages,
@@ -238,56 +238,169 @@ def partition_docx(
return list(elements)
-class _DocxPartitioner:
- """Provides `.partition()` for MS-Word 2007+ (.docx) files."""
+class DocxPartitionerOptions:
+ """Encapsulates partitioning option validation, computation, and application of defaults."""
def __init__(
self,
- # -- NOTE(scanny): default values here are unnecessary for production use because
- # -- `.iter_document_elements()` is the only interface method and always calls with all
- # -- args. However, providing defaults eases unit-testing and decouples unit-tests from
- # -- future changes to args.
- filename: Optional[str] = None,
- file: Optional[IO[bytes]] = None,
- metadata_filename: Optional[str] = None,
- include_page_breaks: bool = True,
- infer_table_structure: bool = True,
- metadata_last_modified: Optional[str] = None,
- date_from_file_object: bool = False,
+ *,
+ date_from_file_object: bool,
+ file: IO[bytes] | None,
+ file_path: str | None,
+ include_page_breaks: bool,
+ infer_table_structure: bool,
+ metadata_file_path: Optional[str],
+ metadata_last_modified: Optional[str],
starting_page_number: int = 1,
- ) -> None:
- self._filename = filename
+ ):
+ self._date_from_file_object = date_from_file_object
self._file = file
- self._metadata_filename = metadata_filename
+ self._file_path = file_path
self._include_page_breaks = include_page_breaks
self._infer_table_structure = infer_table_structure
+ self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
+ # -- options object maintains page-number state --
self._page_counter = starting_page_number
- self._date_from_file_object = date_from_file_object
+
+ @lazyproperty
+ def document(self) -> Document:
+ """The python-docx `Document` object loaded from file or filename."""
+ return docx.Document(self._docx_file)
+
+ @lazyproperty
+ def include_page_breaks(self) -> bool:
+ """When True, include `PageBreak` elements in element-stream.
+
+ Note that regardless of this setting, page-breaks are detected, and page-number is tracked
+ and included in element metadata. Only the presence of distinct `PageBreak` elements (which
+ contain no text) in the element stream is affected.
+ """
+ return self._include_page_breaks
+
+ def increment_page_number(self) -> Iterator[PageBreak]:
+ """Increment page-number by 1 and generate a PageBreak element if enabled."""
+ self._page_counter += 1
+ # -- only emit page-breaks when enabled --
+ if self._include_page_breaks:
+ yield PageBreak("", detection_origin=DETECTION_ORIGIN)
+
+ @lazyproperty
+ def infer_table_structure(self) -> bool:
+ """True when partitioner should compute and apply `text_as_html` metadata for tables."""
+ return self._infer_table_structure
+
+ @lazyproperty
+ def last_modified(self) -> Optional[str]:
+ """The best last-modified date available, None if no sources are available."""
+ # -- Value explicitly specified by caller takes precedence. This is used for example when
+ # -- this file was converted from another format, and any last-modified date for the file
+ # -- would be just now.
+ if self._metadata_last_modified:
+ return self._metadata_last_modified
+
+ if self._file_path:
+ return (
+ None
+ if is_temp_file_path(self._file_path)
+ else get_last_modified_date(self._file_path)
+ )
+
+ if self._file:
+ return (
+ get_last_modified_date_from_file(self._file)
+ if self._date_from_file_object
+ else None
+ )
+
+ return None
+
+ @lazyproperty
+ def metadata_file_path(self) -> str | None:
+ """The best available file-path for this document or `None` if unavailable."""
+ return self._metadata_file_path or self._file_path
+
+ @property
+ def metadata_page_number(self) -> Optional[int]:
+ """The current page number to report in metadata, or None if we can't really tell.
+
+ Page numbers are not added to element metadata if we can't find any page-breaks in the
+ document (which may be a common case).
+
+ In the DOCX format, determining page numbers is strictly a best-efforts attempt since
+ actual page-breaks are determined at rendering time (e.g. printing) based on the
+ font-metrics of the target device. Explicit (hard) page-breaks are always recorded in the
+ docx file but the rendered page-breaks are only added optionally.
+ """
+ return self._page_counter if self._document_contains_pagebreaks else None
+
+ @property
+ def page_number(self) -> int:
+ """The current page number.
+
+ Note this value may not represent the actual rendered page number when rendered page-break
+ indicators are not present in the document (not uncommon). Use `.metadata_page_number` for
+ metadata purposes, which is `None` when rendered page-breaks are not present in this
+ document.
+ """
+ return self._page_counter
+
+ @lazyproperty
+ def _document_contains_pagebreaks(self) -> bool:
+ """True when there is at least one page-break detected in the document.
+
+ Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
+ inserted by Microsoft Word, but probably don't appear in documents converted into .docx
+ format from for example .odt format.
+ """
+ xpath = (
+ # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
+ # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
+ # is w:p inner-content and both of these can occur inside a table-cell as well as the
+ # document body
+ "./w:body/w:p/w:r/w:lastRenderedPageBreak"
+ " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
+ " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
+ " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
+ )
+
+ return bool(self.document.element.xpath(xpath))
+
+ @lazyproperty
+ def _docx_file(self) -> str | IO[bytes]:
+ """The Word 2007+ document file to be partitioned.
+
+ This is either a `str` path or a file-like object. `python-docx` accepts either for opening
+ a document file.
+ """
+ if self._file_path:
+ return self._file_path
+
+ # -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
+ # -- exception when Zipfile tries to open it. The docx format is a zip archive so we need
+ # -- to work around that bug here.
+ if isinstance(self._file, tempfile.SpooledTemporaryFile):
+ self._file.seek(0)
+ return io.BytesIO(self._file.read())
+
+ if self._file:
+ return self._file
+
+ raise ValueError(
+ "No DOCX document specified, either `filename` or `file` argument must be provided"
+ )
+
+
+class _DocxPartitioner:
+ """Provides `.partition()` for MS-Word 2007+ (.docx) files."""
+
+ def __init__(self, opts: DocxPartitionerOptions) -> None:
+ self._opts = opts
@classmethod
- def iter_document_elements(
- cls,
- filename: Optional[str] = None,
- file: Optional[IO[bytes]] = None,
- metadata_filename: Optional[str] = None,
- include_page_breaks: bool = True,
- infer_table_structure: bool = True,
- metadata_last_modified: Optional[str] = None,
- date_from_file_object: bool = False,
- starting_page_number: int = 1,
- ) -> Iterator[Element]:
+ def iter_document_elements(cls, opts: DocxPartitionerOptions) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
- self = cls(
- filename=filename,
- file=file,
- metadata_filename=metadata_filename,
- include_page_breaks=include_page_breaks,
- infer_table_structure=infer_table_structure,
- metadata_last_modified=metadata_last_modified,
- date_from_file_object=date_from_file_object,
- starting_page_number=starting_page_number,
- )
+ self = cls(opts)
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
# "section-less" document has to be interated differently and has no headers or footers and
@@ -452,37 +565,7 @@ def iter_row_cells_as_text(row: _Row) -> Iterator[str]:
@lazyproperty
def _document(self) -> Document:
"""The python-docx `Document` object loaded from file or filename."""
- filename, file = self._filename, self._file
-
- if filename is not None:
- return docx.Document(filename)
-
- assert file is not None
- if isinstance(file, tempfile.SpooledTemporaryFile):
- file.seek(0)
- file = io.BytesIO(file.read())
- return docx.Document(file)
-
- @lazyproperty
- def _document_contains_pagebreaks(self) -> bool:
- """True when there is at least one page-break detected in the document.
-
- Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
- inserted by Microsoft Word, but probably don't appear in documents converted into .docx
- format from for example .odt format.
- """
- xpath = (
- # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
- # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
- # is w:p inner-content and both of these can occur inside a table-cell as well as the
- # document body
- "./w:body/w:p/w:r/w:lastRenderedPageBreak"
- " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
- " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
- " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
- )
-
- return bool(self._document.element.xpath(xpath))
+ return self._opts.document
@lazyproperty
def _document_contains_sections(self) -> bool:
@@ -524,12 +607,6 @@ def iter_hdrftr_texts(hdrftr: _Header | _Footer) -> Iterator[str]:
return "\n".join(text for text in iter_hdrftr_texts(hdrftr) if text)
- def _increment_page_number(self) -> Iterator[PageBreak]:
- """Increment page-number by 1 and generate a PageBreak element if enabled."""
- self._page_counter += 1
- if self._include_page_breaks:
- yield PageBreak("", detection_origin=DETECTION_ORIGIN)
-
def _is_list_item(self, paragraph: Paragraph) -> bool:
"""True when `paragraph` can be identified as a list-item."""
if is_bulleted_text(paragraph.text):
@@ -581,7 +658,7 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP
if isinstance(item, Paragraph):
yield from self._classify_paragraph_to_element(item)
else:
- yield from self._increment_page_number()
+ yield from self._opts.increment_page_number()
def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, str]]:
"""Generate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`."""
@@ -616,7 +693,7 @@ def iter_footer(footer: _Footer, header_footer_type: str) -> Iterator[Footer]:
text=text,
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
- filename=self._metadata_filename,
+ filename=self._opts.metadata_file_path,
header_footer_type=header_footer_type,
category_depth=0,
),
@@ -645,7 +722,7 @@ def maybe_iter_header(header: _Header, header_footer_type: str) -> Iterator[Head
text=text,
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
- filename=self._metadata_filename,
+ filename=self._opts.metadata_file_path,
header_footer_type=header_footer_type,
category_depth=0, # -- headers are always at the root level --
),
@@ -668,7 +745,7 @@ def _iter_section_page_breaks(self, section_idx: int, section: Section) -> Itera
"""
def page_is_odd() -> bool:
- return self._page_counter % 2 == 1
+ return self._opts.page_number % 2 == 1
start_type = section.start_type
@@ -682,14 +759,14 @@ def page_is_odd() -> bool:
# -- on an even page we need two total, add one to supplement the rendered page break
# -- to follow. There is no "first-document-page" special case because 1 is odd.
if not page_is_odd():
- yield from self._increment_page_number()
+ yield from self._opts.increment_page_number()
elif start_type == WD_SECTION_START.ODD_PAGE:
# -- the first page of the document is an implicit "new" odd-page, so no page-break --
if section_idx == 0:
return
if page_is_odd():
- yield from self._increment_page_number()
+ yield from self._opts.increment_page_number()
# -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of
# -- which need our help to get the page-breaks right.
@@ -699,7 +776,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]:
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
# -- at present, we always generate exactly one Table element, but we might want
# -- to skip, for example, an empty table.
- html_table = self._convert_table_to_html(table) if self._infer_table_structure else None
+ html_table = (
+ self._convert_table_to_html(table) if self._opts.infer_table_structure else None
+ )
text_table = " ".join(self._iter_table_texts(table))
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
@@ -708,9 +787,9 @@ def _iter_table_element(self, table: DocxTable) -> Iterator[Table]:
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
text_as_html=html_table,
- filename=self._metadata_filename,
- page_number=self._page_number,
- last_modified=self._last_modified,
+ filename=self._opts.metadata_file_path,
+ page_number=self._opts.metadata_page_number,
+ last_modified=self._opts.last_modified,
emphasized_text_contents=emphasized_text_contents or None,
emphasized_text_tags=emphasized_text_tags or None,
),
@@ -753,41 +832,6 @@ def iter_cell_texts(cell: _Cell) -> Iterator[str]:
# -- do not generate empty strings --
yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text)
- @lazyproperty
- def _last_modified(self) -> Optional[str]:
- """Last-modified date suitable for use in element metadata."""
- # -- if this file was converted from another format, any last-modified date for the file
- # -- will be today, so we get it from the conversion step in `._metadata_last_modified`.
- if self._metadata_last_modified:
- return self._metadata_last_modified
-
- file_path, file = self._filename, self._file
-
- # -- if the file is on the filesystem, get its date from there --
- if file_path is not None:
- return None if is_temp_file_path(file_path) else get_last_modified_date(file_path)
-
- # -- otherwise, as long as user explicitly requested it, try getting it from the file-like
- # -- object (unlikely since BytesIO and its brethren have no such metadata).
- assert file is not None
- if self._date_from_file_object:
- return get_last_modified_date_from_file(file)
- return None
-
- @property
- def _page_number(self) -> Optional[int]:
- """The current page number, or None if we can't really tell.
-
- Page numbers are not added to element metadata if we can't find any page-breaks in the
- document (which may be a common case).
-
- In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual
- page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the
- target device. Explicit (hard) page-breaks are always recorded in the docx file but the
- rendered page-breaks are only added optionally.
- """
- return self._page_counter if self._document_contains_pagebreaks else None
-
def _paragraph_emphasis(self, paragraph: Paragraph) -> tuple[list[str], list[str]]:
"""[contents, tags] pair describing emphasized text in `paragraph`."""
iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph))
@@ -842,12 +886,12 @@ def _paragraph_metadata(self, paragraph: Paragraph) -> ElementMetadata:
category_depth=category_depth,
emphasized_text_contents=emphasized_text_contents or None,
emphasized_text_tags=emphasized_text_tags or None,
- filename=self._metadata_filename,
- last_modified=self._last_modified,
+ filename=self._opts.metadata_file_path,
+ last_modified=self._opts.last_modified,
link_texts=link_texts or None,
link_urls=link_urls or None,
links=links or None,
- page_number=self._page_number,
+ page_number=self._opts.metadata_page_number,
)
element_metadata.detection_origin = "docx"
return element_metadata