Set language to None by default

Unstructured-IO · Oct 12, 2023 · f53622a
1 parent 9864086
commit f53622a
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 17 deletions.
diff --git a/test_unstructured_ingest/test-ingest-local-single-file.sh b/test_unstructured_ingest/test-ingest-local-single-file.sh
@@ -22,11 +22,10 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
     --num-processes "$max_processes" \
     --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
     --output-dir "$OUTPUT_DIR" \
-    --ocr-languages eng+kor \
     --strategy ocr_only \
     --verbose \
     --reprocess \
-    --input-path example-docs/english-and-korean.png \
+    --input-path example-docs/language-docs/UDHR_first_article_all.txt \
     --work-dir "$WORK_DIR"
 
 set +e

diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py
@@ -153,12 +153,11 @@ def add_cli_options(cmd: click.Command) -> None:
             ),
             click.Option(
                 ["--ocr-languages"],
-                default="eng",
+                default=None,
                 help="A list of language packs to specify which languages to use for OCR, "
                 "separated by '+' e.g. 'eng+deu' to use the English and German language packs. "
                 "The appropriate Tesseract "
-                "language pack needs to be installed."
-                "Default: eng",
+                "language pack needs to be installed.",
             ),
             click.Option(
                 ["--encoding"],

diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
@@ -39,7 +39,7 @@ class PartitionConfig(BaseConfig):
     # where to write structured data outputs
     pdf_infer_table_structure: bool = False
     strategy: str = "auto"
-    ocr_languages: str = "eng"
+    ocr_languages: t.Optional[str] = None
     encoding: t.Optional[str] = None
     fields_include: t.List[str] = field(
         default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],

diff --git a/unstructured/ingest/pipeline/partition.py b/unstructured/ingest/pipeline/partition.py
@@ -1,5 +1,6 @@
 import hashlib
 import json
+import typing as t
 from dataclasses import dataclass
 from pathlib import Path
 
@@ -25,17 +26,19 @@ def run(self, ingest_doc_json) -> str:
         if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
             logger.info(f"File exists: {json_path}, skipping partition")
             return str(json_path)
-        languages = (
-            self.partition_config.ocr_languages.split("+")
-            if self.partition_config.ocr_languages
-            else []
-        )
+
+        partition_kwargs: t.Dict[str, t.Any] = {
+            "strategy": self.partition_config.strategy,
+            "encoding": self.partition_config.encoding,
+            "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
+        }
+
+        if self.partition_config.ocr_languages:
+            partition_kwargs["languages"] = self.partition_config.ocr_languages.split("+")
+
         elements = doc.process_file(
             partition_config=self.partition_config,
-            strategy=self.partition_config.strategy,
-            languages=languages,
-            encoding=self.partition_config.encoding,
-            pdf_infer_table_structure=self.partition_config.pdf_infer_table_structure,
+            **partition_kwargs,
         )
         with open(json_path, "w", encoding="utf8") as output_f:
             logger.info(f"writing partitioned content to {json_path}")

diff --git a/unstructured/utils.py b/unstructured/utils.py
@@ -221,7 +221,7 @@ def scarf_analytics():
                     + platform.machine()
                     + "&gpu="
                     + str(gpu_present)
-                    + "&dev=true"
+                    + "&dev=true",
                 )
             else:
                 requests.get(
@@ -235,7 +235,7 @@ def scarf_analytics():
                     + platform.machine()
                     + "&gpu="
                     + str(gpu_present)
-                    + "&dev=false"
+                    + "&dev=false",
                 )
     except Exception:
         pass