From 67b49dc40a104f75d7a6446e91b29b4c072c8e8e Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Tue, 26 Sep 2023 11:05:15 -0600 Subject: [PATCH] Fix/pdf miner source property (#228) This PR adds three possible values for the `source` field: * `pdfminer` as source for elements directly obtained from PDFs. * `OCR-tesseract` and `OCR-paddle` for elements obtained with the respective OCR engines. All of these new values are stored in a new class `Source` in `unstructured_inference/constants.py`. This helps users filter elements depending on how they were obtained. --- CHANGELOG.md | 5 +++++ .../inference/test_layout.py | 17 ++++++++++++++++- .../inference/test_layout_element.py | 4 ++-- unstructured_inference/__version__.py | 2 +- unstructured_inference/constants.py | 10 ++++++++++ unstructured_inference/inference/elements.py | 1 + unstructured_inference/inference/layout.py | 7 ++++--- .../inference/layoutelement.py | 16 +++++++++++----- unstructured_inference/models/detectron2onnx.py | 3 ++- unstructured_inference/models/yolox.py | 3 ++- 10 files changed, 54 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49b29b4f..7748867f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.5-dev0 + +* Fix `source` property for elements generated by pdfminer. +* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR. 
+ ## 0.6.4 * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index f169f93c..fb73f3d0 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -10,9 +10,10 @@ from PIL import Image import unstructured_inference.models.base as models -from unstructured_inference.constants import OCRMode +from unstructured_inference.constants import OCRMode, Source from unstructured_inference.inference import elements, layout, layoutelement from unstructured_inference.models import chipper, detectron2, tesseract +from unstructured_inference.models.base import get_model from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, UnstructuredObjectDetectionModel, @@ -124,6 +125,19 @@ def detect(self, *args): assert elements.ocr(text_block, image=image) == "" +def test_ocr_source(): + file = "sample-docs/loremipsum-flat.pdf" + model = get_model("yolox_tiny") + doc = layout.DocumentLayout.from_file( + file, + model, + ocr_mode=OCRMode.FULL_PAGE.value, + supplement_with_ocr_elements=True, + ocr_strategy="force", + ) + assert Source.OCR_TESSERACT in {e.source for e in doc.pages[0].elements} + + class MockLayoutModel: def __init__(self, layout): self.layout_return = layout @@ -700,6 +714,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected): @pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"]) def test_load_pdf(filename): layouts, images = layout.load_pdf(f"sample-docs/{filename}") + assert Source.PDFMINER in {e.source for e in layouts[0]} assert len(layouts) for lo in layouts: assert len(lo) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 
32ab3fb0..f3c775b0 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -2,7 +2,7 @@ from layoutparser.elements import TextBlock from layoutparser.elements.layout_elements import Rectangle as LPRectangle -from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR, Source from unstructured_inference.inference.elements import TextRegion from unstructured_inference.inference.layoutelement import ( LayoutElement, @@ -151,7 +151,7 @@ def test_layout_element_from_lp_textblock(): 300, 300, text="Sample Text", - source="detectron2_lp", + source=Source.DETECTRON2_LP, type="Text", prob=0.99, ) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 4bf914ad..530da768 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.4" # pragma: no cover +__version__ = "0.6.5-dev0" # pragma: no cover diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index c6c20299..78c46379 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -11,5 +11,15 @@ class AnnotationResult(Enum): PLOT = "plot" +class Source(Enum): + YOLOX = "yolox" + DETECTRON2_ONNX = "detectron2_onnx" + DETECTRON2_LP = "detectron2_lp" + OCR_TESSERACT = "OCR-tesseract" + OCR_PADDLE = "OCR-paddle" + PDFMINER = "pdfminer" + MERGED = "merged" + + SUBREGION_THRESHOLD_FOR_OCR = 0.5 FULL_PAGE_REGION_THRESHOLD = 0.99 diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 2c09b31a..c250ed86 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from unstructured_inference.config import 
inference_config +from unstructured_inference.constants import Source from unstructured_inference.logger import logger from unstructured_inference.math import safe_division from unstructured_inference.models import tesseract diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index a8bb3410..d701414d 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -13,7 +13,7 @@ from PIL import Image, ImageSequence from pytesseract import Output -from unstructured_inference.constants import OCRMode +from unstructured_inference.constants import OCRMode, Source from unstructured_inference.inference.elements import ( EmbeddedTextRegion, ImageTextRegion, @@ -682,6 +682,7 @@ def load_pdf( x2 * coef, y2 * coef, text=_text, + source=Source.PDFMINER, ) if text_region.bbox is not None and text_region.bbox.area > 0: @@ -743,7 +744,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: (x1, y1, x2, y2) = l, t, l + w, t + h text = ocr_data["text"][i] if text: - text_region = TextRegion.from_coords(x1, y1, x2, y2, text=text, source="OCR") + text_region = TextRegion.from_coords(x1, y1, x2, y2, text=text, source=Source.OCR_TESSERACT) text_regions.append(text_region) return text_regions @@ -779,7 +780,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]: y2 = max([i[1] for i in line[0]]) text = line[1][0] if text: - text_region = TextRegion(x1, y1, x2, y2, text) + text_region = TextRegion(x1, y1, x2, y2, text, source=Source.OCR_PADDLE) text_regions.append(text_region) return text_regions diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 66d497ec..184a943c 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -9,7 +9,11 @@ from PIL import Image from unstructured_inference.config import inference_config -from 
unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import ( + FULL_PAGE_REGION_THRESHOLD, + SUBREGION_THRESHOLD_FOR_OCR, + Source, +) from unstructured_inference.inference.elements import ( ImageTextRegion, Rectangle, @@ -80,7 +84,7 @@ def from_lp_textblock(cls, textblock: TextBlock): x2, y2, text=text, - source="detectron2_lp", + source=Source.DETECTRON2_LP, type=type, prob=prob, ) @@ -317,8 +321,10 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: merged_text = " ".join([tr.text for tr in regions if tr.text]) sources = [*{tr.source for tr in regions}] - source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore - return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) + source = sources.pop() if len(sources) == 1 else Source.MERGED + element = TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) + setattr(element, "merged_sources", sources) + return element def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: @@ -332,7 +338,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE ) merged_regions = [merge_text_regions(group) for group in grouped_regions] return [ - LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox) + LayoutElement(text=r.text, source=r.source, type="UncategorizedText", bbox=r.bbox) for r in merged_regions ] diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index 7068cc79..c18c4adf 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -9,6 +9,7 @@ from onnxruntime.quantization import QuantType, quantize_dynamic from PIL import Image +from unstructured_inference.constants import Source from 
unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger, logger_onnx from unstructured_inference.models.unstructuredmodel import ( @@ -158,7 +159,7 @@ def postprocess( text=None, type=detected_class, prob=conf, - source="detectron2_onnx", + source=Source.DETECTRON2_ONNX, ) regions.append(region) diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 11d9cad0..c11557a6 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -14,6 +14,7 @@ from onnxruntime.quantization import QuantType, quantize_dynamic from PIL import Image +from unstructured_inference.constants import Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel @@ -149,7 +150,7 @@ def image_processing( text=None, type=detected_class, prob=prob, - source="yolox", + source=Source.YOLOX, ) regions.append(region)