Skip to content

Commit

Permalink
Fix/pdf miner source property (#228)
Browse files Browse the repository at this point in the history
This PR adds three possible values for the `source` field:
* `pdfminer` as source for elements directly obtained from PDFs.
* `OCR-tesseract` and `OCR-paddle` for elements obtained with the
respective OCR engines.

All of these new values are stored in a new `Source` class in `unstructured_inference/constants.py`.

This helps users filter elements depending on how they were
obtained.
  • Loading branch information
benjats07 authored and Benjamin Torres committed Sep 30, 2023
1 parent b82cec0 commit 67b49dc
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.6.5-dev0

* Fix `source` property for elements generated by pdfminer.
* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR.

## 0.6.4

* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task
Expand Down
17 changes: 16 additions & 1 deletion test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from PIL import Image

import unstructured_inference.models.base as models
from unstructured_inference.constants import OCRMode
from unstructured_inference.constants import OCRMode, Source
from unstructured_inference.inference import elements, layout, layoutelement
from unstructured_inference.models import chipper, detectron2, tesseract
from unstructured_inference.models.base import get_model
from unstructured_inference.models.unstructuredmodel import (
UnstructuredElementExtractionModel,
UnstructuredObjectDetectionModel,
Expand Down Expand Up @@ -124,6 +125,19 @@ def detect(self, *args):
assert elements.ocr(text_block, image=image) == ""


def test_ocr_source():
    """Check that a forced full-page OCR run yields an element sourced from tesseract."""
    pdf_path = "sample-docs/loremipsum-flat.pdf"
    detection_model = get_model("yolox_tiny")
    document = layout.DocumentLayout.from_file(
        pdf_path,
        detection_model,
        ocr_mode=OCRMode.FULL_PAGE.value,
        supplement_with_ocr_elements=True,
        ocr_strategy="force",
    )
    # Only the first page is inspected; one OCR-sourced element is enough.
    first_page_sources = {element.source for element in document.pages[0].elements}
    assert Source.OCR_TESSERACT in first_page_sources


class MockLayoutModel:
def __init__(self, layout):
self.layout_return = layout
Expand Down Expand Up @@ -700,6 +714,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected):
@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
def test_load_pdf(filename):
layouts, images = layout.load_pdf(f"sample-docs/{filename}")
assert Source.PDFMINER in {e.source for e in layouts[0]}
assert len(layouts)
for lo in layouts:
assert len(lo)
Expand Down
4 changes: 2 additions & 2 deletions test_unstructured_inference/inference/test_layout_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from layoutparser.elements import TextBlock
from layoutparser.elements.layout_elements import Rectangle as LPRectangle

from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR
from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR, Source
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layoutelement import (
LayoutElement,
Expand Down Expand Up @@ -151,7 +151,7 @@ def test_layout_element_from_lp_textblock():
300,
300,
text="Sample Text",
source="detectron2_lp",
source=Source.DETECTRON2_LP,
type="Text",
prob=0.99,
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.6.4" # pragma: no cover
__version__ = "0.6.5-dev0" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured_inference/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,15 @@ class AnnotationResult(Enum):
PLOT = "plot"


class Source(str, Enum):
    """Identifies which model or extraction method produced an element.

    The class mixes in ``str`` so that every member compares equal to its
    plain-string value (``Source.YOLOX == "yolox"``). Before this enum
    existed the ``source`` field held bare strings such as ``"yolox"`` or
    ``"detectron2_lp"``; the mixin keeps string comparisons written against
    those values working while still offering the enum members.
    """

    # Layout-detection models.
    YOLOX = "yolox"
    DETECTRON2_ONNX = "detectron2_onnx"
    DETECTRON2_LP = "detectron2_lp"
    # OCR engines.
    OCR_TESSERACT = "OCR-tesseract"
    OCR_PADDLE = "OCR-paddle"
    # Text obtained directly from the PDF.
    PDFMINER = "pdfminer"
    # Result of merging regions that came from more than one source.
    MERGED = "merged"


SUBREGION_THRESHOLD_FOR_OCR = 0.5
FULL_PAGE_REGION_THRESHOLD = 0.99
1 change: 1 addition & 0 deletions unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from scipy.sparse.csgraph import connected_components

from unstructured_inference.config import inference_config
from unstructured_inference.constants import Source
from unstructured_inference.logger import logger
from unstructured_inference.math import safe_division
from unstructured_inference.models import tesseract
Expand Down
7 changes: 4 additions & 3 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from PIL import Image, ImageSequence
from pytesseract import Output

from unstructured_inference.constants import OCRMode
from unstructured_inference.constants import OCRMode, Source
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
Expand Down Expand Up @@ -682,6 +682,7 @@ def load_pdf(
x2 * coef,
y2 * coef,
text=_text,
source=Source.PDFMINER,
)

if text_region.bbox is not None and text_region.bbox.area > 0:
Expand Down Expand Up @@ -743,7 +744,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]:
(x1, y1, x2, y2) = l, t, l + w, t + h
text = ocr_data["text"][i]
if text:
text_region = TextRegion.from_coords(x1, y1, x2, y2, text=text, source="OCR")
text_region = TextRegion.from_coords(x1, y1, x2, y2, text=text, source=Source.OCR_TESSERACT)
text_regions.append(text_region)

return text_regions
Expand Down Expand Up @@ -779,7 +780,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]:
y2 = max([i[1] for i in line[0]])
text = line[1][0]
if text:
text_region = TextRegion(x1, y1, x2, y2, text)
text_region = TextRegion(x1, y1, x2, y2, text, source=Source.OCR_PADDLE)
text_regions.append(text_region)

return text_regions
16 changes: 11 additions & 5 deletions unstructured_inference/inference/layoutelement.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from PIL import Image

from unstructured_inference.config import inference_config
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR
from unstructured_inference.constants import (
FULL_PAGE_REGION_THRESHOLD,
SUBREGION_THRESHOLD_FOR_OCR,
Source,
)
from unstructured_inference.inference.elements import (
ImageTextRegion,
Rectangle,
Expand Down Expand Up @@ -80,7 +84,7 @@ def from_lp_textblock(cls, textblock: TextBlock):
x2,
y2,
text=text,
source="detectron2_lp",
source=Source.DETECTRON2_LP,
type=type,
prob=prob,
)
Expand Down Expand Up @@ -317,8 +321,10 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion:

merged_text = " ".join([tr.text for tr in regions if tr.text])
sources = [*{tr.source for tr in regions}]
source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore
return TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text)
source = sources.pop() if len(sources) == 1 else Source.MERGED
element = TextRegion.from_coords(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text)
setattr(element, "merged_sources", sources)
return element


def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]:
Expand All @@ -332,7 +338,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE
)
merged_regions = [merge_text_regions(group) for group in grouped_regions]
return [
LayoutElement(text=r.text, source=None, type="UncategorizedText", bbox=r.bbox)
LayoutElement(text=r.text, source=r.source, type="UncategorizedText", bbox=r.bbox)
for r in merged_regions
]

Expand Down
3 changes: 2 additions & 1 deletion unstructured_inference/models/detectron2onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from onnxruntime.quantization import QuantType, quantize_dynamic
from PIL import Image

from unstructured_inference.constants import Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.logger import logger, logger_onnx
from unstructured_inference.models.unstructuredmodel import (
Expand Down Expand Up @@ -158,7 +159,7 @@ def postprocess(
text=None,
type=detected_class,
prob=conf,
source="detectron2_onnx",
source=Source.DETECTRON2_ONNX,
)

regions.append(region)
Expand Down
3 changes: 2 additions & 1 deletion unstructured_inference/models/yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from onnxruntime.quantization import QuantType, quantize_dynamic
from PIL import Image

from unstructured_inference.constants import Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.logger import logger
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
Expand Down Expand Up @@ -149,7 +150,7 @@ def image_processing(
text=None,
type=detected_class,
prob=prob,
source="yolox",
source=Source.YOLOX,
)

regions.append(region)
Expand Down

0 comments on commit 67b49dc

Please sign in to comment.