Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(docx): add strategy parameter to partition_docx() #3026

Merged
merged 1 commit into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
## 0.13.8-dev10
## 0.13.8-dev11

### Enhancements

* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

### Features

Expand Down
20 changes: 19 additions & 1 deletion test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@
Title,
)
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.partition.utils.constants import (
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
PartitionStrategy,
)

# -- docx-file loading behaviors -----------------------------------------------------------------

Expand Down Expand Up @@ -701,6 +704,7 @@ def opts_args() -> dict[str, Any]:
"infer_table_structure": True,
"metadata_file_path": None,
"metadata_last_modified": None,
"strategy": None,
}


Expand Down Expand Up @@ -905,6 +909,20 @@ def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
list(opts.increment_page_number())
assert opts.page_number == 4

# -- .strategy -------------------------------

@pytest.mark.parametrize(
    ("arg_value", "expected_value"),
    [
        (None, "hi_res"),
        (PartitionStrategy.FAST, "fast"),
        (PartitionStrategy.HI_RES, "hi_res"),
    ],
)
def it_knows_which_partitioning_strategy_to_use(
    self, opts_args: dict[str, Any], arg_value: str, expected_value: str
):
    """`.strategy` echoes an explicit strategy and resolves `None` to "hi_res"."""
    # -- override only the `strategy` entry of the shared constructor kwargs --
    opts_args.update(strategy=arg_value)

    resolved_strategy = DocxPartitionerOptions(**opts_args).strategy

    assert resolved_strategy == expected_value

# -- ._document_contains_pagebreaks ----------

@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev10" # pragma: no cover
__version__ = "0.13.8-dev11" # pragma: no cover
21 changes: 18 additions & 3 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
is_possible_title,
is_us_city_state_zip,
)
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import (
dependency_exists,
is_temp_file_path,
Expand Down Expand Up @@ -170,15 +171,17 @@ def extract_docx_filename(file_path: str) -> str:
@add_chunking_strategy
def partition_docx(
filename: Optional[str] = None,
*,
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
file: Optional[IO[bytes]] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
strategy: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.
Expand Down Expand Up @@ -226,6 +229,7 @@ def partition_docx(
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
strategy=strategy,
)

elements = _DocxPartitioner.iter_document_elements(opts)
Expand All @@ -252,6 +256,7 @@ def __init__(
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
strategy: str | None = None,
):
self._date_from_file_object = date_from_file_object
self._file = file
Expand All @@ -260,6 +265,7 @@ def __init__(
self._infer_table_structure = infer_table_structure
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._strategy = strategy
# -- options object maintains page-number state --
self._page_counter = starting_page_number

Expand Down Expand Up @@ -345,6 +351,15 @@ def page_number(self) -> int:
"""
return self._page_counter

@lazyproperty
def strategy(self) -> str:
    """Partitioning strategy in effect for this document.

    Returns the `strategy` value this options object was constructed with, or
    `PartitionStrategy.HI_RES` when that value is `None`. The members of
    `unstructured.partition.utils.constants.PartitionStrategy` resolve to plain `str`
    values such as "hi_res" and "fast".
    """
    requested_strategy = self._strategy
    # -- an explicitly requested strategy always wins over the default --
    if requested_strategy is not None:
        return requested_strategy
    return PartitionStrategy.HI_RES
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just want to double-check that it should default to hi_res, otherwise looks good

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, I'll confirm with Matt :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k, yep, confirmed :)


@lazyproperty
def _document_contains_pagebreaks(self) -> bool:
"""True when there is at least one page-break detected in the document.
Expand Down
Loading