diff --git a/unstructured/partition/ocr.py b/unstructured/partition/ocr.py index c54622769d..d5b90e0dbf 100644 --- a/unstructured/partition/ocr.py +++ b/unstructured/partition/ocr.py @@ -10,8 +10,6 @@ # unstructured.documents.elements.Image from PIL import Image as PILImage from PIL import ImageSequence - -from unstructured.partition.utils.constants import OCRMode from unstructured_inference.inference.elements import ( Rectangle, TextRegion, @@ -24,8 +22,12 @@ from unstructured_pytesseract import Output from unstructured.logger import logger +from unstructured.partition.utils.constants import SUBREGION_THRESHOLD_FOR_OCR, OCRMode -SUBREGION_THRESHOLD_FOR_OCR = 0.5 +# Force tesseract to be single threaded, +# otherwise we see major performance problems +if "OMP_THREAD_LIMIT" not in os.environ: + os.environ["OMP_THREAD_LIMIT"] = "1" def process_data_with_ocr( diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index 137c97ad90..f88305a36e 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -8,3 +8,5 @@ class OCRMode(Enum): SORT_MODE_XY_CUT = "xy-cut" SORT_MODE_BASIC = "basic" + +SUBREGION_THRESHOLD_FOR_OCR = 0.5