diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49045f10fe..bf7d87f567 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.16.13-dev0
+## 0.16.13-dev1
### Enhancements
+- **Add character-level filtering for tesseract output**. It is controllable via `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
### Features
@@ -8,7 +9,7 @@
- **Fix NLTK Download** to use nltk assets in docker image
- removed the ability to automatically download nltk package if missing
-
+
## 0.16.12
### Enhancements
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index b0be34fbfb..e9982810a0 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -6,6 +6,7 @@
import pandas as pd
import pytest
import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
def test_get_ocr_layout_from_image_tesseract(monkeypatch):
monkeypatch.setattr(
- unstructured_pytesseract,
- "image_to_data",
+ OCRAgentTesseract,
+ "image_to_data_with_character_confidence_filter",
lambda *args, **kwargs: pd.DataFrame(
{
"left": [10, 20, 30, 0],
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
monkeypatch.setattr(
- unstructured_pytesseract,
- "image_to_data",
+ OCRAgentTesseract,
+ "image_to_data_with_character_confidence_filter",
lambda *args, **kwargs: pd.DataFrame(
{
"left": [10, 20, 30, 0],
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
# Check if the final layout contains both original elements and OCR-derived elements
assert all(element in final_layout for element in mock_out_layout)
assert any(element in final_layout for element in ocr_elements)
+
+
+def _create_hocr_word_span(
+ characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}; x_wconf 64'></span>",  # noqa: E501
+        "html.parser",
+    ).span
+ for char, x_conf in characters:
+ char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_bboxes 0 0 0 0; x_conf {x_conf}'>{char}</span>
+            """, # noqa : E501
+ "html.parser",
+ ).span
+ word_span.append(char_span)
+ return word_span
+
+
+def test_extract_word_from_hocr():
+ characters = [
+ ("w", "99.0"),
+ ("o", "98.5"),
+ ("r", "97.5"),
+ ("d", "96.0"),
+ ("!", "50.0"),
+ ("@", "45.0"),
+ ]
+ word_bbox = (10, 9, 70, 22)
+ word_span = _create_hocr_word_span(characters, word_bbox)
+
+ text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+ assert text == "word!@"
+
+ text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+ assert text == "word"
+
+ text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+ assert text == "w"
+
+ text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+ assert text == ""
+
+
+def test_hocr_to_dataframe():
+ characters = [
+ ("w", "99.0"),
+ ("o", "98.5"),
+ ("r", "97.5"),
+ ("d", "96.0"),
+ ("!", "50.0"),
+ ("@", "45.0"),
+ ]
+ word_bbox = (10, 9, 70, 22)
+ hocr = str(_create_hocr_word_span(characters, word_bbox))
+ df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+ assert df.shape == (1, 5)
+ assert df["left"].iloc[0] == 10
+ assert df["top"].iloc[0] == 9
+ assert df["width"].iloc[0] == 60
+ assert df["height"].iloc[0] == 13
+ assert df["text"].iloc[0] == "word"
+
+
+def test_hocr_to_dataframe_when_no_prediction_empty_df():
+ df = OCRAgentTesseract().hocr_to_dataframe(hocr="")
+
+ assert df.shape == (0, 5)
+ assert "left" in df.columns
+ assert "top" in df.columns
+ assert "width" in df.columns
+    assert "height" in df.columns
+    assert "text" in df.columns
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 9b1b8de6e1..200edf3e2a 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
[
(
PartitionStrategy.HI_RES,
- "unstructured_pytesseract.image_to_data",
+ "unstructured_pytesseract.image_to_pdf_or_hocr",
),
(
PartitionStrategy.OCR_ONLY,
- "unstructured_pytesseract.image_to_data",
+ "unstructured_pytesseract.image_to_pdf_or_hocr",
),
(
PartitionStrategy.OCR_ONLY,
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index a88e673551..ac5e032772 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.13-dev0" # pragma: no cover
+__version__ = "0.16.13-dev1" # pragma: no cover
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 7023ff9d33..291ae1b6a3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
+ @property
+    def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
+ """Tesseract predictions with confidence below this threshold are ignored"""
+ return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+
@property
def GOOGLEVISION_API_ENDPOINT(self) -> str:
"""API endpoint to use for Google Vision"""
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 46eb8a0cbd..6e2c96da00 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -1,14 +1,15 @@
from __future__ import annotations
import os
+import re
from typing import TYPE_CHECKING, List
import cv2
import numpy as np
import pandas as pd
import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
from PIL import Image as PILImage
-from unstructured_pytesseract import Output
from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config
@@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1
- ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
+ ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
np.array(image),
lang=self.language,
- output_type=Output.DATAFRAME,
+ character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()
@@ -76,17 +77,94 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom,
)
- ocr_df = unstructured_pytesseract.image_to_data(
+ ocr_df = self.image_to_data_with_character_confidence_filter(
np.array(zoom_image(image, zoom)),
lang=self.language,
- output_type=Output.DATAFRAME,
+ character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
)
ocr_df = ocr_df.dropna()
-
ocr_regions = self.parse_data(ocr_df, zoom=zoom)
return ocr_regions
+ def image_to_data_with_character_confidence_filter(
+ self,
+ image: np.ndarray,
+ lang: str = "eng",
+ config: str = "",
+ character_confidence_threshold: float = 0.0,
+ ) -> pd.DataFrame:
+ hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
+ image,
+ lang=lang,
+ config="-c hocr_char_boxes=1 " + config,
+ extension="hocr",
+ )
+ ocr_df = self.hocr_to_dataframe(hocr, character_confidence_threshold)
+ return ocr_df
+
+ def hocr_to_dataframe(
+ self, hocr: str, character_confidence_threshold: float = 0.0
+ ) -> pd.DataFrame:
+ soup = BeautifulSoup(hocr, "html.parser")
+ word_spans = soup.find_all("span", class_="ocrx_word")
+
+ df_entries = []
+ for word_span in word_spans:
+ word_title = word_span.get("title", "")
+ bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
+
+ # Note: word bbox is used instead of combining characters together due to tesseract
+ # bug that causes the character bboxes to be outside the word bbox, and they have 0
+ # height or width when text is horizontal
+ text = self.extract_word_from_hocr(
+ word=word_span, character_confidence_threshold=character_confidence_threshold
+ )
+ if text and bbox_match:
+ word_bbox = list(map(int, bbox_match.groups()))
+ left, top, right, bottom = word_bbox
+ df_entries.append(
+ {
+ "left": left,
+ "top": top,
+ "right": right,
+ "bottom": bottom,
+ "text": text,
+ }
+ )
+ ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "right", "bottom", "text"])
+
+ ocr_df["width"] = ocr_df["right"] - ocr_df["left"]
+ ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"]
+
+ ocr_df = ocr_df.drop(columns=["right", "bottom"])
+ return ocr_df
+
+ @staticmethod
+ def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
+ """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
+
+ character_spans = word.find_all("span", class_="ocrx_cinfo")
+ if len(character_spans) == 0:
+ return ""
+
+ word_text = ""
+ for character_span in character_spans:
+ char = character_span.text
+
+ char_title = character_span.get("title", "")
+ conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
+
+ if not (char and conf_match):
+ continue
+
+ character_probability = float(conf_match.group(1)) / 100
+
+ if character_probability >= character_confidence_threshold:
+ word_text += char
+
+ return word_text
+
@requires_dependencies("unstructured_inference")
def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
from unstructured.partition.pdf_image.inference_utils import (