fix: handle errors from Tesseract (#165)

* fix: handle errors from Tesseract

  Certain regions of a document are failing OCR with this error:

  `pytesseract.pytesseract.TesseractError: (-8, 'Estimating resolution as 1250')`

  When I try the same region on the CLI, I get:

  ```
  $ tesseract bad_tile.jpeg output
  Estimating resolution as 1813
  Floating point exception
  ```

  Whatever the root cause, let's catch this error and return an empty string.

* fix lint error
* more lint stuff
* temporary bump to dev0 version for debugging
* release version

---------

Co-authored-by: Crag Wolfe <[email protected]>
Co-authored-by: shreyanid <[email protected]>
Unstructured-IO · Aug 9, 2023 · 203f7ab · 203f7ab
1 parent 9a53178
commit 203f7ab
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.5.9
+
+* Handle exceptions from Tesseract
+
 ## 0.5.8
 
 * Add alternative architecture for detectron2 (but default is unchanged)

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -83,6 +83,21 @@ def detect(self, *args):
     assert elements.ocr(text_block, image=image) == mock_text
 
 
+def test_ocr_with_error(monkeypatch):
+    class MockOCRAgent:
+        def detect(self, *args):
+            # We sometimes get this error on very small images
+            raise tesseract.TesseractError(-8, "Estimating resolution as 1023")
+
+    monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent})
+    monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True)
+
+    image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB")
+    text_block = layout.TextRegion(1, 2, 3, 4, text=None)
+
+    assert elements.ocr(text_block, image=image) == ""
+
+
 class MockLayoutModel:
     def __init__(self, layout):
         self.layout_return = layout

diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.8"  # pragma: no cover
+__version__ = "0.5.9"  # pragma: no cover
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -269,7 +269,11 @@ def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> s
     agent = tesseract.ocr_agents.get(languages)
     if agent is None:
         raise RuntimeError("OCR agent is not loaded for {languages}.")
-    return agent.detect(cropped_image)
+
+    try:
+        return agent.detect(cropped_image)
+    except tesseract.TesseractError:
+        return ""
 
 
 def needs_ocr(

diff --git a/unstructured_inference/models/tesseract.py b/unstructured_inference/models/tesseract.py
@@ -1,11 +1,14 @@
 from typing import Dict
 
+import pytesseract
 from layoutparser.ocr.tesseract_agent import TesseractAgent, is_pytesseract_available
 
 from unstructured_inference.logger import logger
 
 ocr_agents: Dict[str, TesseractAgent] = {}
 
+TesseractError = pytesseract.pytesseract.TesseractError
+
 
 def load_agent(languages: str = "eng"):
     """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once.
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.5.8" # pragma: no cover
		__version__ = "0.5.9" # pragma: no cover