From efe5785d78bed435d3e86c628bc5ea89122ccdc6 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 13 May 2024 15:17:57 -0700 Subject: [PATCH] spike: add strategy param to partition_docx() --- CHANGELOG.md | 3 ++- test_unstructured/partition/docx/test_docx.py | 20 +++++++++++++++++- unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 21 ++++++++++++++++--- 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82cedc761c..28b09f0c14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,9 @@ -## 0.13.8-dev10 +## 0.13.8-dev11 ### Enhancements * **Faster evaluation** Support for concurrent processing of documents during evaluation +* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy. ### Features diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py index 9e89d99737..3bdf235fd8 100644 --- a/test_unstructured/partition/docx/test_docx.py +++ b/test_unstructured/partition/docx/test_docx.py @@ -40,7 +40,10 @@ Title, ) from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx -from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA +from unstructured.partition.utils.constants import ( + UNSTRUCTURED_INCLUDE_DEBUG_METADATA, + PartitionStrategy, +) # -- docx-file loading behaviors ----------------------------------------------------------------- @@ -701,6 +704,7 @@ def opts_args() -> dict[str, Any]: "infer_table_structure": True, "metadata_file_path": None, "metadata_last_modified": None, + "strategy": None, } @@ -905,6 +909,20 @@ def it_assigns_the_correct_page_number_when_starting_page_number_is_given( list(opts.increment_page_number()) assert opts.page_number == 4 + # -- .strategy ------------------------------- + + @pytest.mark.parametrize( 
+ ("arg_value", "expected_value"), + [(None, "hi_res"), (PartitionStrategy.FAST, "fast"), (PartitionStrategy.HI_RES, "hi_res")], + ) + def it_knows_which_partitioning_strategy_to_use( + self, opts_args: dict[str, Any], arg_value: str, expected_value: str + ): + opts_args["strategy"] = arg_value + opts = DocxPartitionerOptions(**opts_args) + + assert opts.strategy == expected_value + # -- ._document_contains_pagebreaks ---------- @pytest.mark.parametrize( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b1a7af4fa6..d0f6aafe83 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.8-dev10" # pragma: no cover +__version__ = "0.13.8-dev11" # pragma: no cover diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 430f87a301..19f874a338 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -57,6 +57,7 @@ is_possible_title, is_us_city_state_zip, ) +from unstructured.partition.utils.constants import PartitionStrategy from unstructured.utils import ( dependency_exists, is_temp_file_path, @@ -170,15 +171,17 @@ def extract_docx_filename(file_path: str) -> str: @add_chunking_strategy def partition_docx( filename: Optional[str] = None, + *, + date_from_file_object: bool = False, + detect_language_per_element: bool = False, file: Optional[IO[bytes]] = None, include_page_breaks: bool = True, infer_table_structure: bool = True, + languages: Optional[list[str]] = ["auto"], metadata_filename: Optional[str] = None, metadata_last_modified: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - date_from_file_object: bool = False, starting_page_number: int = 1, + strategy: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions Microsoft Word Documents in .docx format into its document elements. 
@@ -226,6 +229,7 @@ def partition_docx( metadata_file_path=metadata_filename, metadata_last_modified=metadata_last_modified, starting_page_number=starting_page_number, + strategy=strategy, ) elements = _DocxPartitioner.iter_document_elements(opts) @@ -252,6 +256,7 @@ def __init__( metadata_file_path: Optional[str], metadata_last_modified: Optional[str], starting_page_number: int = 1, + strategy: str | None = None, ): self._date_from_file_object = date_from_file_object self._file = file @@ -260,6 +265,7 @@ def __init__( self._infer_table_structure = infer_table_structure self._metadata_file_path = metadata_file_path self._metadata_last_modified = metadata_last_modified + self._strategy = strategy # -- options object maintains page-number state -- self._page_counter = starting_page_number @@ -345,6 +351,15 @@ def page_number(self) -> int: """ return self._page_counter + @lazyproperty + def strategy(self) -> str: + """The partitioning strategy for this document. + + One of "hi_res", "fast", and a few others. These are available as class attributes on + `unstructured.partition.utils.constants.PartitionStrategy` but resolve to str values. + """ + return PartitionStrategy.HI_RES if self._strategy is None else self._strategy + @lazyproperty def _document_contains_pagebreaks(self) -> bool: """True when there is at least one page-break detected in the document.