Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(pptx): partition_pptx() accepts strategy arg #2879

Merged
merged 2 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 0.13.3-dev2
## 0.13.3-dev3

### Enhancements

* **Add `strategy` arg value to `_PptxPartitionerOptions`.** This makes this partitioning option available for sub-partitioners to come that may optionally use inference or other expensive operations to improve the partitioning.

### Features

### Fixes
Expand Down
16 changes: 14 additions & 2 deletions test_unstructured/partition/pptx/test_pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ def test_partition_pptx_malformed():
assert element.metadata.filename == "fake-power-point-malformed.pptx"


# == DescribePptxPartitionerMetadataBehaviors ====================================================
# == metadata behaviors ==========================================================================


def test_partition_pptx_metadata_date(mocker: MockFixture):
Expand Down Expand Up @@ -357,7 +357,7 @@ def test_partition_pptx_raises_TypeError_for_invalid_languages():
partition_pptx(example_doc_path("fake-power-point.pptx"), languages="eng") # type: ignore


# == DescribePptxPartitionerDownstreamBehaviors ==================================================
# == downstream behaviors ========================================================================


def test_partition_pptx_with_json():
Expand Down Expand Up @@ -714,6 +714,17 @@ def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
with pytest.raises(ValueError, match="No PPTX document specified, either `filename` or "):
opts.pptx_file

# -- .strategy -------------------------------

@pytest.mark.parametrize("arg_value", ["fast", "hi_res"])
def it_knows_which_partitioning_strategy_to_use(
    self, arg_value: str, opts_args: dict[str, Any]
):
    """`.strategy` reports the `strategy` value the options object was built with."""
    # -- override the fixture's default strategy with the parametrized one --
    opts_args.update(strategy=arg_value)

    options = _PptxPartitionerOptions(**opts_args)

    assert options.strategy == arg_value

# -- .table_metadata -------------------------

def it_can_create_table_metadata(self, opts_args: dict[str, Any]):
Expand Down Expand Up @@ -776,4 +787,5 @@ def opts_args(self) -> dict[str, Any]:
"infer_table_structure": True,
"metadata_file_path": None,
"metadata_last_modified": None,
"strategy": "fast",
}
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.3-dev2" # pragma: no cover
__version__ = "0.13.3-dev3" # pragma: no cover
28 changes: 24 additions & 4 deletions unstructured/partition/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
is_possible_narrative_text,
is_possible_title,
)
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import lazyproperty

DETECTION_ORIGIN = "pptx"
Expand All @@ -49,15 +50,17 @@
@add_chunking_strategy
def partition_pptx(
filename: Optional[str] = None,
*,
file: Optional[IO[bytes]] = None,
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
include_page_breaks: bool = True,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
include_slide_notes: bool = False,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
Comment on lines +53 to +62
Copy link
Collaborator Author

@scanny scanny Apr 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding the "*" just makes explicit that all arguments other than filename are keyword-only. This has always been the case because filename and file can't both be used, but explicit is better than implicit. The remaining line changes are just sorting the kwargs alphabetically for the sake of good order, making it easier to find the one you're looking for and tell whether one is missing.

strategy: str = PartitionStrategy.FAST,
**kwargs: Any,
) -> list[Element]:
"""Partition PowerPoint document in .pptx format into its document elements.
Expand Down Expand Up @@ -104,6 +107,7 @@ def partition_pptx(
infer_table_structure=infer_table_structure,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
strategy=strategy,
)

elements = _PptxPartitioner.iter_presentation_elements(opts)
Expand Down Expand Up @@ -314,6 +318,7 @@ def __init__(
infer_table_structure: bool,
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
strategy: str,
):
self._date_from_file_object = date_from_file_object
self._file = file
Expand All @@ -323,6 +328,8 @@ def __init__(
self._infer_table_structure = infer_table_structure
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._strategy = strategy
# -- options object maintains page-number state --
self._page_counter = 0

@lazyproperty
Expand Down Expand Up @@ -414,6 +421,19 @@ def pptx_file(self) -> str | IO[bytes]:
"No PPTX document specified, either `filename` or `file` argument must be provided"
)

@lazyproperty
def strategy(self) -> str:
    """The partitioning strategy requested by the caller.

    The value signals whether the partitioner should perform expensive operations such as
    inference and OCR in order to produce a more thorough and/or accurate partitioning of the
    document.

    Several values are possible, but for PPTX purposes the only distinction that matters is
    between "hi_res" and anything other than "hi_res". Depending on the picture-partitioner
    in use, images may only be OCR'ed and added to the element-stream when this value is
    "hi_res".
    """
    # -- value is stored verbatim from the `strategy` constructor argument --
    return self._strategy

def table_metadata(self, text_as_html: str | None):
"""ElementMetadata instance suitable for use with Table element."""
element_metadata = ElementMetadata(
Expand Down
Loading