From 8685905bd134a509386b6d65400a943d96a8c4a8 Mon Sep 17 00:00:00 2001
From: Pluto
Date: Mon, 13 Jan 2025 14:12:46 +0100
Subject: [PATCH] Character confidence threshold (#3860)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change adds the ability to filter out characters that Tesseract
predicts with low confidence scores. Some notes:
- It is intentionally disabled by default; a low threshold (e.g. 0.9-0.95
  for Tesseract) would probably be a safe choice, though.
- I wanted to use the character bboxes and combine them into a word bbox
  later. However, in some specific scenarios a Tesseract bug returns
  incorrect character bboxes (the unit tests caught it 🥳). More details
  in a comment in the code.
---
 CHANGELOG.md                                  |  5 +-
 .../partition/pdf_image/test_ocr.py           | 86 +++++++++++++++++-
 .../partition/pdf_image/test_pdf.py           |  4 +-
 unstructured/__version__.py                   |  2 +-
 unstructured/partition/utils/config.py        |  5 ++
 .../utils/ocr_models/tesseract_ocr.py         | 90 +++++++++++++++++--
 6 files changed, 177 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49045f10fe..bf7d87f567 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.16.13-dev0
+## 0.16.13-dev1
 
 ### Enhancements
+- **Add character-level filtering for tesseract output**. It is controllable via the `TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD` environment variable.
 
 ### Features
 
@@ -8,7 +9,7 @@
 
 - **Fix NLTK Download** to use nltk assets in docker image
 - removed the ability to automatically download nltk package if missing
-
+
 ## 0.16.12
 
 ### Enhancements
diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
index b0be34fbfb..e9982810a0 100644
--- a/test_unstructured/partition/pdf_image/test_ocr.py
+++ b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -6,6 +6,7 @@
 import pandas as pd
 import pytest
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from pdf2image.exceptions import PDFPageCountError
 from PIL import Image, UnidentifiedImageError
 from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@@ -71,8 +72,8 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
 
 def test_get_ocr_layout_from_image_tesseract(monkeypatch):
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -445,8 +446,8 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
     monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
     monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
     monkeypatch.setattr(
-        unstructured_pytesseract,
-        "image_to_data",
+        OCRAgentTesseract,
+        "image_to_data_with_character_confidence_filter",
         lambda *args, **kwargs: pd.DataFrame(
             {
                 "left": [10, 20, 30, 0],
@@ -484,3 +485,80 @@ def test_merge_out_layout_with_cid_code(mock_out_layout, mock_ocr_regions):
     # Check if the final layout contains both original elements and OCR-derived elements
     assert all(element in final_layout for element in mock_out_layout)
     assert any(element in final_layout for element in ocr_elements)
+
+
+def _create_hocr_word_span(
+    characters: list[tuple[str, str]], word_bbox: tuple[int, int, int, int]
+) -> Tag:
+    word_span = BeautifulSoup(
+        f"<span class='ocrx_word' title='bbox {word_bbox[0]} {word_bbox[1]} {word_bbox[2]} {word_bbox[3]}'></span>",  # noqa : E501
+        "html.parser",
+    ).span
+    for char, x_conf in characters:
+        char_span = BeautifulSoup(
+            f"""
+            <span class='ocrx_cinfo' title='x_conf {x_conf}'>{char}</span>
+        """,  # noqa : E501
+            "html.parser",
+        ).span
+        word_span.append(char_span)
+    return word_span
+
+
+def test_extract_word_from_hocr():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    word_span = _create_hocr_word_span(characters, word_bbox)
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.0)
+    assert text == "word!@"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.960)
+    assert text == "word"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.990)
+    assert text == "w"
+
+    text = OCRAgentTesseract.extract_word_from_hocr(word_span, 0.999)
+    assert text == ""
+
+
+def test_hocr_to_dataframe():
+    characters = [
+        ("w", "99.0"),
+        ("o", "98.5"),
+        ("r", "97.5"),
+        ("d", "96.0"),
+        ("!", "50.0"),
+        ("@", "45.0"),
+    ]
+    word_bbox = (10, 9, 70, 22)
+    hocr = str(_create_hocr_word_span(characters, word_bbox))
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr=hocr, character_confidence_threshold=0.960)
+
+    assert df.shape == (1, 5)
+    assert df["left"].iloc[0] == 10
+    assert df["top"].iloc[0] == 9
+    assert df["width"].iloc[0] == 60
+    assert df["height"].iloc[0] == 13
+    assert df["text"].iloc[0] == "word"
+
+
+def test_hocr_to_dataframe_when_no_prediction_empty_df():
+    df = OCRAgentTesseract().hocr_to_dataframe(hocr="")
+
+    assert df.shape == (0, 5)
+    assert "left" in df.columns
+    assert "top" in df.columns
+    assert "width" in df.columns
+    assert "height" in df.columns
+    assert "text" in df.columns
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 9b1b8de6e1..200edf3e2a 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -995,11 +995,11 @@ def test_partition_hi_res_model_name_default_to_None():
     [
         (
             PartitionStrategy.HI_RES,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
-            "unstructured_pytesseract.image_to_data",
+            "unstructured_pytesseract.image_to_pdf_or_hocr",
         ),
         (
             PartitionStrategy.OCR_ONLY,
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index a88e673551..ac5e032772 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.13-dev0"  # pragma: no cover
+__version__ = "0.16.13-dev1"  # pragma: no cover
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
index 7023ff9d33..291ae1b6a3 100644
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -96,6 +96,11 @@ def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
         """optimum text height for tesseract OCR"""
         return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
 
+    @property
+    def TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD(self) -> float:
+        """Tesseract predictions with confidence below this threshold are ignored"""
+        return self._get_float("TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD", 0.0)
+
     @property
     def GOOGLEVISION_API_ENDPOINT(self) -> str:
         """API endpoint to use for Google Vision"""
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 46eb8a0cbd..6e2c96da00 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -1,14 +1,15 @@
 from __future__ import annotations
 
 import os
+import re
 from typing import TYPE_CHECKING, List
 
 import cv2
 import numpy as np
 import pandas as pd
 import unstructured_pytesseract
+from bs4 import BeautifulSoup, Tag
 from PIL import Image as PILImage
-from unstructured_pytesseract import Output
 
 from unstructured.logger import trace_logger
 from unstructured.partition.utils.config import env_config
@@ -47,10 +48,10 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
         trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
-        ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
+        ocr_df: pd.DataFrame = self.image_to_data_with_character_confidence_filter(
             np.array(image),
             lang=self.language,
-            output_type=Output.DATAFRAME,
+            character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
         )
         ocr_df = ocr_df.dropna()
@@ -76,17 +77,94 @@ def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
                 np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
                 max_zoom,
             )
-            ocr_df = unstructured_pytesseract.image_to_data(
+            ocr_df = self.image_to_data_with_character_confidence_filter(
                 np.array(zoom_image(image, zoom)),
                 lang=self.language,
-                output_type=Output.DATAFRAME,
+                character_confidence_threshold=env_config.TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD,
             )
             ocr_df = ocr_df.dropna()
-
         ocr_regions = self.parse_data(ocr_df, zoom=zoom)
         return ocr_regions
 
+    def image_to_data_with_character_confidence_filter(
+        self,
+        image: np.ndarray,
+        lang: str = "eng",
+        config: str = "",
+        character_confidence_threshold: float = 0.0,
+    ) -> pd.DataFrame:
+        hocr: str = unstructured_pytesseract.image_to_pdf_or_hocr(
+            image,
+            lang=lang,
+            config="-c hocr_char_boxes=1 " + config,
+            extension="hocr",
+        )
+        ocr_df = self.hocr_to_dataframe(hocr, character_confidence_threshold)
+        return ocr_df
+
+    def hocr_to_dataframe(
+        self, hocr: str, character_confidence_threshold: float = 0.0
+    ) -> pd.DataFrame:
+        soup = BeautifulSoup(hocr, "html.parser")
+        word_spans = soup.find_all("span", class_="ocrx_word")
+
+        df_entries = []
+        for word_span in word_spans:
+            word_title = word_span.get("title", "")
+            bbox_match = re.search(r"bbox (\d+) (\d+) (\d+) (\d+)", word_title)
+
+            # Note: the word bbox is used instead of combining the character bboxes because of
+            # a tesseract bug that places character bboxes outside the word bbox, with zero
+            # height or width, when the text is horizontal
+            text = self.extract_word_from_hocr(
+                word=word_span, character_confidence_threshold=character_confidence_threshold
+            )
+            if text and bbox_match:
+                word_bbox = list(map(int, bbox_match.groups()))
+                left, top, right, bottom = word_bbox
+                df_entries.append(
+                    {
+                        "left": left,
+                        "top": top,
+                        "right": right,
+                        "bottom": bottom,
+                        "text": text,
+                    }
+                )
+        ocr_df = pd.DataFrame(df_entries, columns=["left", "top", "right", "bottom", "text"])
+
+        ocr_df["width"] = ocr_df["right"] - ocr_df["left"]
+        ocr_df["height"] = ocr_df["bottom"] - ocr_df["top"]
+
+        ocr_df = ocr_df.drop(columns=["right", "bottom"])
+        return ocr_df
+
+    @staticmethod
+    def extract_word_from_hocr(word: Tag, character_confidence_threshold: float = 0.0) -> str:
+        """Extracts a word from an hOCR word tag, filtering out characters with low confidence."""
+
+        character_spans = word.find_all("span", class_="ocrx_cinfo")
+        if len(character_spans) == 0:
+            return ""
+
+        word_text = ""
+        for character_span in character_spans:
+            char = character_span.text
+
+            char_title = character_span.get("title", "")
+            conf_match = re.search(r"x_conf (\d+\.\d+)", char_title)
+
+            if not (char and conf_match):
+                continue
+
+            character_probability = float(conf_match.group(1)) / 100
+
+            if character_probability >= character_confidence_threshold:
+                word_text += char
+
+        return word_text
+
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(self, image: PILImage.Image) -> List["LayoutElement"]:
         from unstructured.partition.pdf_image.inference_utils import (
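
Note (not part of the patch): a minimal usage sketch of the new filter, assuming the patch above is applied. The threshold is read from the TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD environment variable as a probability between 0 and 1; "page.png" is a hypothetical input file.

    import os

    # Drop characters that Tesseract reports below 90% confidence. Set this before
    # running OCR so env_config picks it up; 0.0 (the default) disables the filter.
    os.environ["TESSERACT_CHARACTER_CONFIDENCE_THRESHOLD"] = "0.9"

    from PIL import Image

    from unstructured.partition.utils.ocr_models.tesseract_ocr import OCRAgentTesseract

    agent = OCRAgentTesseract()
    regions = agent.get_layout_from_image(Image.open("page.png"))  # hypothetical input file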
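Note (not part of the patch): Tesseract's hOCR output reports per-character confidence as x_conf on a 0-100 scale, while the threshold is a probability in [0, 1]; extract_word_from_hocr divides x_conf by 100 before comparing. A small hand-written hOCR snippet illustrates this; the markup follows the ocrx_word / ocrx_cinfo format parsed above and is not real Tesseract output.

    from unstructured.partition.utils.ocr_models.tesseract_ocr import OCRAgentTesseract

    hocr = (
        "<span class='ocrx_word' title='bbox 10 9 70 22'>"
        "<span class='ocrx_cinfo' title='x_conf 96.0'>w</span>"
        "<span class='ocrx_cinfo' title='x_conf 45.0'>@</span>"
        "</span>"
    )
    df = OCRAgentTesseract().hocr_to_dataframe(hocr, character_confidence_threshold=0.9)
    # x_conf 96.0 -> 0.96 >= 0.9, so "w" is kept; x_conf 45.0 -> 0.45 < 0.9, so "@" is dropped.
    assert df["text"].iloc[0] == "w"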