Skip to content

Commit

Permalink
Set language to None by default
Browse files (browse the repository at this point in the history)
  • Loading branch information
rbiseck3 committed Oct 12, 2023
1 parent 9864086 commit f53622a
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 17 deletions.
3 changes: 1 addition & 2 deletions test_unstructured_ingest/test-ingest-local-single-file.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--ocr-languages eng+kor \
--strategy ocr_only \
--verbose \
--reprocess \
--input-path example-docs/english-and-korean.png \
--input-path example-docs/language-docs/UDHR_first_article_all.txt \
--work-dir "$WORK_DIR"

set +e
Expand Down
5 changes: 2 additions & 3 deletions unstructured/ingest/cli/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,12 +153,11 @@ def add_cli_options(cmd: click.Command) -> None:
),
click.Option(
["--ocr-languages"],
default="eng",
default=None,
help="A list of language packs to specify which languages to use for OCR, "
"separated by '+' e.g. 'eng+deu' to use the English and German language packs. "
"The appropriate Tesseract "
"language pack needs to be installed."
"Default: eng",
"language pack needs to be installed.",
),
click.Option(
["--encoding"],
Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class PartitionConfig(BaseConfig):
# where to write structured data outputs
pdf_infer_table_structure: bool = False
strategy: str = "auto"
ocr_languages: str = "eng"
ocr_languages: t.Optional[str] = None
encoding: t.Optional[str] = None
fields_include: t.List[str] = field(
default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
Expand Down
21 changes: 12 additions & 9 deletions unstructured/ingest/pipeline/partition.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import hashlib
import json
import typing as t
from dataclasses import dataclass
from pathlib import Path

Expand All @@ -25,17 +26,19 @@ def run(self, ingest_doc_json) -> str:
if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
logger.info(f"File exists: {json_path}, skipping partition")
return str(json_path)
languages = (
self.partition_config.ocr_languages.split("+")
if self.partition_config.ocr_languages
else []
)

partition_kwargs: t.Dict[str, t.Any] = {
"strategy": self.partition_config.strategy,
"encoding": self.partition_config.encoding,
"pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure,
}

if self.partition_config.ocr_languages:
partition_kwargs["languages"] = self.partition_config.ocr_languages.split("+")

elements = doc.process_file(
partition_config=self.partition_config,
strategy=self.partition_config.strategy,
languages=languages,
encoding=self.partition_config.encoding,
pdf_infer_table_structure=self.partition_config.pdf_infer_table_structure,
**partition_kwargs,
)
with open(json_path, "w", encoding="utf8") as output_f:
logger.info(f"writing partitioned content to {json_path}")
Expand Down
4 changes: 2 additions & 2 deletions unstructured/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def scarf_analytics():
+ platform.machine()
+ "&gpu="
+ str(gpu_present)
+ "&dev=true"
+ "&dev=true",
)
else:
requests.get(
Expand All @@ -235,7 +235,7 @@ def scarf_analytics():
+ platform.machine()
+ "&gpu="
+ str(gpu_present)
+ "&dev=false"
+ "&dev=false",
)
except Exception:
pass

0 comments on commit f53622a

Please sign in to comment.