Skip to content

Commit

Permalink
feat(odt): add strategy param to partition_odt()
Browse files Browse the repository at this point in the history
To support pluggable DOCX image sub-partitioners we need
broker-partitioners that target `partition_docx()` to accept and pass
along the strategy argument value.
  • Loading branch information
scanny committed May 16, 2024
1 parent 4251b2c commit 037cc5b
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 1 deletion.
25 changes: 24 additions & 1 deletion test_unstructured/partition/odt/test_odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@
import pytest
from pytest_mock import MockFixture

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from test_unstructured.unit_utils import (
FixtureRequest,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
)
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import CompositeElement, Table, TableChunk, Title
from unstructured.partition.docx import partition_docx
Expand Down Expand Up @@ -220,6 +225,24 @@ def test_partition_odt_respects_detect_language_per_element_arg():
# -- miscellaneous -------------------------------------------------------------------------------


@pytest.mark.parametrize(
    ("kwargs", "expected_value"),
    [
        # -- `strategy` omitted by the caller --
        ({}, None),
        # -- `strategy` explicitly passed as None --
        ({"strategy": None}, None),
        # -- `strategy` given a concrete value --
        ({"strategy": "hi_res"}, "hi_res"),
    ],
)
def test_partition_odt_forwards_strategy_arg_to_partition_docx(
    request: FixtureRequest, kwargs: dict[str, Any], expected_value: str | None
):
    """`partition_odt()` hands its `strategy` value on to `partition_docx()` as a keyword arg."""
    # -- replace the `partition_docx` name looked up in the odt module with a mock --
    docx_mock = function_mock(request, "unstructured.partition.odt.partition_docx")
    odt_path = example_doc_path("simple.odt")

    partition_odt(odt_path, **kwargs)

    forwarded = docx_mock.call_args.kwargs
    # -- the call included a `strategy` keyword-argument at all --
    assert "strategy" in forwarded
    # -- and that keyword-argument carried the expected value --
    assert forwarded["strategy"] == expected_value


def test_partition_odt_round_trips_through_json():
    """Partitioned elements survive serialization to JSON and back without loss."""
    elements = partition_odt(example_doc_path("simple.odt"))
    assert_round_trips_through_JSON(elements)
Expand Down
2 changes: 2 additions & 0 deletions unstructured/partition/odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def partition_odt(
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
starting_page_number: int = 1,
strategy: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions Open Office Documents in .odt format into its document elements.
Expand Down Expand Up @@ -76,6 +77,7 @@ def partition_odt(
metadata_filename=metadata_filename,
metadata_last_modified=metadata_last_modified or last_modification_date,
starting_page_number=starting_page_number,
strategy=strategy,
)

return elements
Expand Down

0 comments on commit 037cc5b

Please sign in to comment.