diff --git a/test_unstructured_ingest/test-ingest-local-single-file.sh b/test_unstructured_ingest/test-ingest-local-single-file.sh index bf6ad5f416..959b0b482c 100755 --- a/test_unstructured_ingest/test-ingest-local-single-file.sh +++ b/test_unstructured_ingest/test-ingest-local-single-file.sh @@ -22,11 +22,10 @@ PYTHONPATH=. ./unstructured/ingest/main.py \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ - --ocr-languages eng+kor \ --strategy ocr_only \ --verbose \ --reprocess \ - --input-path example-docs/english-and-korean.png \ + --input-path example-docs/language-docs/UDHR_first_article_all.txt \ --work-dir "$WORK_DIR" set +e diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 2e5ab72c8a..49b5f6eb1d 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -153,12 +153,11 @@ def add_cli_options(cmd: click.Command) -> None: ), click.Option( ["--ocr-languages"], - default="eng", + default=None, help="A list of language packs to specify which languages to use for OCR, " "separated by '+' e.g. 'eng+deu' to use the English and German language packs. " "The appropriate Tesseract " - "language pack needs to be installed." - "Default: eng", + "language pack needs to be installed.", ), click.Option( ["--encoding"], diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 6c219a6d0f..c3597c233f 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -39,7 +39,7 @@ class PartitionConfig(BaseConfig): # where to write structured data outputs pdf_infer_table_structure: bool = False strategy: str = "auto" - ocr_languages: str = "eng" + ocr_languages: t.Optional[str] = None encoding: t.Optional[str] = None fields_include: t.List[str] = field( default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"], diff --git a/unstructured/ingest/pipeline/partition.py b/unstructured/ingest/pipeline/partition.py index b3f39ccb90..c36459d10f 100644 --- a/unstructured/ingest/pipeline/partition.py +++ b/unstructured/ingest/pipeline/partition.py @@ -1,5 +1,6 @@ import hashlib import json +import typing as t from dataclasses import dataclass from pathlib import Path @@ -25,17 +26,19 @@ def run(self, ingest_doc_json) -> str: if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size: logger.info(f"File exists: {json_path}, skipping partition") return str(json_path) - languages = ( - self.partition_config.ocr_languages.split("+") - if self.partition_config.ocr_languages - else [] - ) + + partition_kwargs: t.Dict[str, t.Any] = { + "strategy": self.partition_config.strategy, + "encoding": self.partition_config.encoding, + "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure, + } + + if self.partition_config.ocr_languages: + partition_kwargs["languages"] = self.partition_config.ocr_languages.split("+") + elements = doc.process_file( partition_config=self.partition_config, - strategy=self.partition_config.strategy, - languages=languages, - encoding=self.partition_config.encoding, - pdf_infer_table_structure=self.partition_config.pdf_infer_table_structure, + **partition_kwargs, ) with open(json_path, "w", encoding="utf8") as output_f: logger.info(f"writing partitioned content to {json_path}") diff --git a/unstructured/utils.py b/unstructured/utils.py index d6fc32366b..d0dc772be6 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -221,7 +221,7 @@ def scarf_analytics(): + platform.machine() + "&gpu=" + str(gpu_present) - + "&dev=true" + + "&dev=true", ) else: requests.get( @@ -235,7 +235,7 @@ def scarf_analytics(): + platform.machine() + "&gpu=" + str(gpu_present) - + "&dev=false" + + "&dev=false", ) except Exception: pass