From 7770838be1f322ad4c4464e80dfa01c3fe791bce Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Mon, 9 Oct 2023 10:45:48 -0400 Subject: [PATCH] Add new parameter to map to skip_infer_table_types partition arg --- CHANGELOG.md | 13 +++++++++++-- unstructured/__version__.py | 2 +- unstructured/ingest/cli/interfaces.py | 6 ++++++ unstructured/ingest/interfaces.py | 1 + unstructured/ingest/pipeline/partition.py | 15 +++++++++++---- 5 files changed, 30 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2b2d152dd..83999d408b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.10.21-dev0 + +### Enhancements +* **Expose skip_infer_table_types in ingest CLI** For each connector a new `--skip-infer-table-types` parameter was added to map to the `skip_infer_table_types` partition argument. + +### Features + +### Fixes + ## 0.10.20 ### Enhancements @@ -12,7 +21,7 @@ * **Refactor of the ingest cli workflow** The refactored approach uses a dynamically set pipeline with a snapshot along each step to save progress and accommodate continuation from a snapshot if an error occurs. This also allows the pipeline to dynamically assign any number of steps to modify the partitioned content before it gets written to a destination. * **Applies `max_characters=` argument to all element types in `add_chunking_strategy` decorator** Previously this argument was only utilized in chunking Table elements and now applies to all partitioned elements if `add_chunking_strategy` decorator is utilized, further preparing the elements for downstream processing. * **Add common retry strategy utilities for unstructured-ingest** Dynamic retry strategy with exponential backoff added to Notion source connector. -* + ### Features * **Adds `bag_of_words` and `percent_missing_text` functions** In order to count the word frequencies in two input texts and calculate the percentage of text missing relative to the source document. @@ -1462,4 +1471,4 @@ of an email. ## 0.2.0 -* Initial release of unstructured +* Initial release of unstructured \ No newline at end of file diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d21df6b84b..f611a4edf8 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.20" # pragma: no cover +__version__ = "0.10.21-dev0" # pragma: no cover diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 2e5ab72c8a..45aa945c97 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -139,6 +139,12 @@ class CliPartitionConfig(PartitionConfig, CliMixin): @staticmethod def add_cli_options(cmd: click.Command) -> None: options = [ + click.Option( + ["--skip-infer-table-types"], + type=DelimitedString(), + default=None, + help="Optional list of document types to skip table extraction on", + ), click.Option( ["--pdf-infer-table-structure"], default=False, diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 6c219a6d0f..457711ed1c 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -38,6 +38,7 @@ class BaseConfig(DataClassJsonMixin, ABC): class PartitionConfig(BaseConfig): # where to write structured data outputs pdf_infer_table_structure: bool = False + skip_infer_table_types: t.Optional[t.List[str]] = None strategy: str = "auto" ocr_languages: str = "eng" encoding: t.Optional[str] = None diff --git a/unstructured/ingest/pipeline/partition.py b/unstructured/ingest/pipeline/partition.py index b3f39ccb90..1efa18d461 100644 --- a/unstructured/ingest/pipeline/partition.py +++ b/unstructured/ingest/pipeline/partition.py @@ -30,12 +30,19 @@ def run(self, ingest_doc_json) -> str: if self.partition_config.ocr_languages else [] ) + partition_kwargs = { + "strategy": self.partition_config.strategy, + "languages": languages, + "encoding": self.partition_config.encoding, + "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure, + } + if self.partition_config.skip_infer_table_types: + partition_kwargs[ + "skip_infer_table_types" + ] = self.partition_config.skip_infer_table_types elements = doc.process_file( partition_config=self.partition_config, - strategy=self.partition_config.strategy, - languages=languages, - encoding=self.partition_config.encoding, - pdf_infer_table_structure=self.partition_config.pdf_infer_table_structure, + **partition_kwargs, ) with open(json_path, "w", encoding="utf8") as output_f: logger.info(f"writing partitioned content to {json_path}")