From 8685905bd134a509386b6d65400a943d96a8c4a8 Mon Sep 17 00:00:00 2001
From: Pluto
Date: Mon, 13 Jan 2025 14:12:46 +0100
Subject: [PATCH] Character confidence threshold (#3860)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds the ability to filter out characters that Tesseract
predicts with low confidence scores. Some notes:
- It is intentionally disabled by default; a low threshold (e.g. 0.9-0.95
  for Tesseract) would probably be a safe choice, though.
- I wanted to use the character bboxes and combine them into a word bbox
  later. However, in some specific scenarios a Tesseract bug returns
  incorrect character bboxes (the unit tests caught it 🥳). More details
  in a comment in the code.
---
 CHANGELOG.md                                  |  5 +-
 .../partition/pdf_image/test_ocr.py           | 86 +++++++++++++++++-
 .../partition/pdf_image/test_pdf.py           |  4 +-
 unstructured/__version__.py                   |  2 +-
 unstructured/partition/utils/config.py        |  5 ++
 .../utils/ocr_models/tesseract_ocr.py         | 90 +++++++++++++++++--
 6 files changed, 177 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49045f10fe..bf7d87f567 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.16.13-dev0
+## 0.16.13-dev1
 
 ### Enhancements
+- **Add character-level filtering for tesseract output**. It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
 
 ### Features
 
@@ -8,7 +9,7 @@
 
 - **Fix NLTK Download** to use nltk assets in docker image
 - removed the ability to automatically download nltk package if missing
-
+
 ## 0.16.12
 
 ### Enhancements
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index b0be34fbfb..e9982810a0 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
 
 def test_get_ocr_layout_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
     monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
     monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     # Check if the final layout contains both original elements and OCR-derived elements
     assert all(element in final_layout for element in mock_out_layout)
     assert any(element in final_layout for element in ocr_elements)
+
+
+def _create_hocr_word_span(
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}'></span>",  # noqa : E501
+        "html.parser",
+    ).span
+    for char, x_conf in characters:
+        char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_conf {x_conf}'>{char}</span>
+        """,  # noqa : E501
+            "html.parser",
+        ).span
+        word_span.append(char_span)
+    return word_span
+
+
+def test_extract_word_from_hocr():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    word_span = _create_hocr_word_span(characters, word_bbox)
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    assert text == "word!@"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    assert text == "word"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    assert text == "w"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    assert text == ""
+
+
+def test_hocr_to_dataframe():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    hocr = str(_create_hocr_word_span(characters, word_bbox))
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+    assert df.shape == (1, 5)
+    assert df["left"].iloc[0] == 10
+    assert df["top"].iloc[0] == 9
+    assert df["width"].iloc[0] == 60
+    assert df["height"].iloc[0] == 13
+    assert df["text"].iloc[0] == "word"
+
+
+def test_hocr_to_dataframe_when_no_prediction_empty_df():
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")
+
+    assert df.shape == (0, 5)
+    assert "left" in df.columns
+    assert "top" in df.columns
+    assert "width" in df.columns
+    assert "height" in df.columns
+    assert "text" in df.columns
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 9b1b8de6e1..200edf3e2a 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
     [
         (
             PartitionStrategy.HI_RES,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index a88e673551..ac5e032772 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.13-dev0"  # pragma: no cover
+__version__ = "0.16.13-dev1"  # pragma: no cover
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 7023ff9d33..291ae1b6a3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         """optimum text height for tesseract OCR"""
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
+    @property
+    def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
+        """Tesseract predictions with confidence below this threshold are ignored"""
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
         """API endpoint to use for Google Vision"""
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 46eb8a0cbd..6e2c96da00 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -1,14 +1,15 @@
 from __future__ import annotations
 
 import os
+import re
 from typing import TYPE_CHECKING, List
 
 import cv2
 import numpy as np
 import pandas as pd
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from PIL import Image as PILImage
-from unstructured_pytesseract import Output
 
 from unstructured.logger import trace_logger
 from unstructured.partition.utils.config import env_config
@@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
         trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
-        ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
+        ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
             np.array(image),
             lang=self.language,
-            output_type=Output.DATAFRAME,
+            character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
         )
         ocr_df = ocr_df.dropna()
@@ -76,17 +77,94 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
                 max_zoom,
             )
-            ocr_df = unstructured_pytesseract.image_to_data(
+            ocr_df = self.image_to_data_with_character_confidence_filter(
                 np.array(zoom_image(image, zoom)),
                 lang=self.language,
-                output_type=Output.DATAFRAME,
+                character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
             )
             ocr_df = ocr_df.dropna()
-
         ocr_regions = self.parse_data(ocr_df, zoom=zoom)
         return ocr_regions
 
+    def image_to_data_with_character_confidence_filter(
+        self,
+        image: np.ndarray,
+        lang: str = "eng",
+        config: str = "",
+        character_confidence_threshold: float = 0.0,
+    ) -> pd.DataFrame:
+        hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
+            image,
+            lang=lang,
+            config="-c hocr_char_boxes=1 " + config,
+            extension="hocr",
+        )
+        ocr_df = self.hocr_to_dataframe(hocr, character_confidence_threshold)
+        return ocr_df
+
+    def hocr_to_dataframe(
+        self, hocr: str, character_confidence_threshold: float = 0.0
+    ) -> pd.DataFrame:
+        soup = BeautifulSoup(hocr, "html.parser")
+        word_spans = soup.find_all("span", class_="ocrx_word")
+
+        df_entries = []
+        for word_span in word_spans:
+            word_title = word_span.get("title", "")
+            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
+
+            # Note: the word bbox is used instead of combining the character bboxes because of
+            # a tesseract bug that places character bboxes outside the word bbox, with zero
+            # height or width, when the text is horizontal
+            text = self.extract_word_from_hocr(
+                word=word_span, character_confidence_threshold=character_confidence_threshold
+            )
+            if text and bbox_match:
+                word_bbox = list(map(int, bbox_match.groups()))
+                left, top, right, bottom = word_bbox
+                df_entries.append(
+                    {
+                        "left": left,
+                        "top": top,
+                        "right": right,
+                        "bottom": bottom,
+                        "text": text,
+                    }
+                )
+        ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "right", "bottom", "text"])
+
+        ocr_df["width"] = ocr_df["right"] - ocr_df["left"]
+        ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"]
+
+        ocr_df = ocr_df.drop(columns=["right", "bottom"])
+        return ocr_df
+
+    @staticmethod
+    def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
+        """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
+
+        character_spans = word.find_all("span", class_="ocrx_cinfo")
+        if len(character_spans) == 0:
+            return ""
+
+        word_text = ""
+        for character_span in character_spans:
+            char = character_span.text
+
+            char_title = character_span.get("title", "")
+            conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
+
+            if not (char and conf_match):
+                continue
+
+            character_probability = float(conf_match.group(1)) / 100
+
+            if character_probability >= character_confidence_threshold:
+                word_text += char
+
+        return word_text
+
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
         from unstructured.partition.pdf_image.inference_utils import (
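
Note (not part of the patch): a minimal usage sketch of the new filter, assuming the patch above is applied. The threshold is read from the TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD environment variable as a probability between 0 and 1; "page.png" is a hypothetical input file.

    import os

    # Drop characters that Tesseract reports below 90% confidence. Set this before
    # running OCR so env_config picks it up; 0.0 (the default) disables the filter.
    os.environ["TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD"] = "0.9"

    from PIL import Image

    from unstructured.partition.utils.ocr_models.tesseract_ocr import OCRAgentTesseract

    agent = OCRAgentTesseract()
    regions = agent.get_layout_from_image(Image.open("page.png"))  # hypothetical input file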
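Note (not part of the patch): Tesseract's hOCR output reports per-character confidence as x_conf on a 0-100 scale, while the threshold is a probability in [0, 1]; extract_word_from_hocr divides x_conf by 100 before comparing. A small hand-written hOCR snippet illustrates this; the markup follows the ocrx_word / ocrx_cinfo format parsed above and is not real Tesseract output.

    from unstructured.partition.utils.ocr_models.tesseract_ocr import OCRAgentTesseract

    hocr = (
        "<span class='ocrx_word' title='bbox 10 9 70 22'>"
        "<span class='ocrx_cinfo' title='x_conf 96.0'>w</span>"
        "<span class='ocrx_cinfo' title='x_conf 45.0'>@</span>"
        "</span>"
    )
    df = OCRAgentTesseract().hocr_to_dataframe(hocr, character_confidence_threshold=0.9)
    # x_conf 96.0 -> 0.96 >= 0.9, so "w" is kept; x_conf 45.0 -> 0.45 < 0.9, so "@" is dropped.
    assert df["text"].iloc[0] == "w"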