Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/pdf miner source property #228

Merged
merged 9 commits into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.6.5-dev0

* Fix `source` property for elements generated by pdfminer.
* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR.

## 0.6.4

* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task
Expand Down
17 changes: 16 additions & 1 deletion test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from PIL import Image

import unstructured_inference.models.base as models
from unstructured_inference.constants import OCRMode
from unstructured_inference.constants import OCRMode, Source
from unstructured_inference.inference import elements, layout, layoutelement
from unstructured_inference.models import chipper, detectron2, tesseract
from unstructured_inference.models.base import get_model
from unstructured_inference.models.unstructuredmodel import (
UnstructuredElementExtractionModel,
UnstructuredObjectDetectionModel,
Expand Down Expand Up @@ -117,6 +118,19 @@ def detect(self, *args):
assert elements.ocr(text_block, image=image) == ""


def test_ocr_source():
    """OCR-supplemented layout elements should carry the tesseract Source tag."""
    filename = "sample-docs/loremipsum-flat.pdf"
    doc = layout.DocumentLayout.from_file(
        filename,
        get_model("yolox_tiny"),
        ocr_mode=OCRMode.FULL_PAGE.value,
        supplement_with_ocr_elements=True,
        ocr_strategy="force",
    )
    first_page_sources = {element.source for element in doc.pages[0].elements}
    assert Source.OCR_TESSERACT in first_page_sources


class MockLayoutModel:
def __init__(self, layout):
self.layout_return = layout
Expand Down Expand Up @@ -678,6 +692,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected):
@pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
def test_load_pdf(filename):
layouts, images = layout.load_pdf(f"sample-docs/{filename}")
assert Source.PDFMINER in {e.source for e in layouts[0]}
assert len(layouts)
for lo in layouts:
assert len(lo)
Expand Down
4 changes: 2 additions & 2 deletions test_unstructured_inference/inference/test_layout_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from layoutparser.elements import TextBlock
from layoutparser.elements.layout_elements import Rectangle as LPRectangle

from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR
from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR, Source
from unstructured_inference.inference.elements import TextRegion
from unstructured_inference.inference.layoutelement import (
LayoutElement,
Expand Down Expand Up @@ -166,7 +166,7 @@ def test_layout_element_from_lp_textblock():
300,
300,
text="Sample Text",
source="detectron2_lp",
source=Source.DETECTRON2_LP,
type="Text",
prob=0.99,
)
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.6.4" # pragma: no cover
__version__ = "0.6.5-dev0" # pragma: no cover
10 changes: 10 additions & 0 deletions unstructured_inference/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,15 @@ class AnnotationResult(Enum):
PLOT = "plot"


class Source(Enum):
    """Identifies which detection/extraction pipeline produced a layout element.

    Stored on ``TextRegion.source`` so downstream consumers can tell model
    detections, pdfminer text extraction, and OCR output apart.
    """

    YOLOX = "yolox"  # YOLOX object-detection model
    DETECTRON2_ONNX = "detectron2_onnx"  # Detectron2 model run via ONNX runtime
    DETECTRON2_LP = "detectron2_lp"  # Detectron2 via layoutparser TextBlocks
    OCR_TESSERACT = "OCR-tesseract"  # text recognized by the tesseract OCR agent
    OCR_PADDLE = "OCR-paddle"  # text recognized by the PaddleOCR agent
    PDFMINER = "pdfminer"  # embedded text extracted with pdfminer
    MERGED = "merged"  # region produced by merging regions of mixed sources


# Overlap fraction above which a region is treated as a subregion of another
# when deciding OCR behavior — NOTE(review): exact comparison semantics live at
# the usage sites in layoutelement.py; confirm there.
SUBREGION_THRESHOLD_FOR_OCR = 0.5
# Fraction of the page a region must cover to be considered a full-page region
# — presumably used to suppress/merge page-sized detections; verify at call sites.
FULL_PAGE_REGION_THRESHOLD = 0.99
3 changes: 2 additions & 1 deletion unstructured_inference/inference/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from scipy.sparse.csgraph import connected_components

from unstructured_inference.config import inference_config
from unstructured_inference.constants import Source
from unstructured_inference.logger import logger
from unstructured_inference.math import safe_division
from unstructured_inference.models import tesseract
Expand Down Expand Up @@ -197,7 +198,7 @@ def intersections(*rects: Rectangle):
@dataclass
class TextRegion(Rectangle):
text: Optional[str] = None
source: Optional[str] = None
source: Optional[Source] = None

def __str__(self) -> str:
return str(self.text)
Expand Down
15 changes: 11 additions & 4 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from PIL import Image, ImageSequence
from pytesseract import Output

from unstructured_inference.constants import OCRMode
from unstructured_inference.constants import OCRMode, Source
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
Expand Down Expand Up @@ -677,7 +677,14 @@ def load_pdf(
else:
continue

text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text)
text_region = element_class(
x1 * coef,
y1 * coef,
x2 * coef,
y2 * coef,
text=_text,
source=Source.PDFMINER,
)

if text_region.area > 0:
layout.append(text_region)
Expand Down Expand Up @@ -738,7 +745,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]:
(x1, y1, x2, y2) = l, t, l + w, t + h
text = ocr_data["text"][i]
if text:
text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR")
text_region = TextRegion(x1, y1, x2, y2, text=text, source=Source.OCR_TESSERACT)
text_regions.append(text_region)

return text_regions
Expand Down Expand Up @@ -774,7 +781,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]:
y2 = max([i[1] for i in line[0]])
text = line[1][0]
if text:
text_region = TextRegion(x1, y1, x2, y2, text)
text_region = TextRegion(x1, y1, x2, y2, text, source=Source.OCR_PADDLE)
text_regions.append(text_region)

return text_regions
16 changes: 11 additions & 5 deletions unstructured_inference/inference/layoutelement.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from PIL import Image

from unstructured_inference.config import inference_config
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR
from unstructured_inference.constants import (
FULL_PAGE_REGION_THRESHOLD,
SUBREGION_THRESHOLD_FOR_OCR,
Source,
)
from unstructured_inference.inference.elements import (
ImageTextRegion,
Rectangle,
Expand Down Expand Up @@ -74,7 +78,7 @@ def from_lp_textblock(cls, textblock: TextBlock):
text = textblock.text
type = textblock.type
prob = textblock.score
return cls(x1, y1, x2, y2, text=text, source="detectron2_lp", type=type, prob=prob)
return cls(x1, y1, x2, y2, text=text, source=Source.DETECTRON2_LP, type=type, prob=prob)


def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str:
Expand Down Expand Up @@ -311,8 +315,10 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion:

merged_text = " ".join([tr.text for tr in regions if tr.text])
sources = [*{tr.source for tr in regions}]
source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore
return TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text)
source = sources.pop() if len(sources) == 1 else Source.MERGED
element = TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text)
setattr(element, "merged_sources", sources)
return element


def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]:
Expand All @@ -332,7 +338,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE
r.x2,
r.y2,
text=r.text,
source=None,
source=r.source,
type="UncategorizedText",
)
for r in merged_regions
Expand Down
3 changes: 2 additions & 1 deletion unstructured_inference/models/detectron2onnx.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from onnxruntime.quantization import QuantType, quantize_dynamic
from PIL import Image

from unstructured_inference.constants import Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.logger import logger, logger_onnx
from unstructured_inference.models.unstructuredmodel import (
Expand Down Expand Up @@ -158,7 +159,7 @@ def postprocess(
text=None,
type=detected_class,
prob=conf,
source="detectron2_onnx",
source=Source.DETECTRON2_ONNX,
)

regions.append(region)
Expand Down
3 changes: 2 additions & 1 deletion unstructured_inference/models/yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from onnxruntime.quantization import QuantType, quantize_dynamic
from PIL import Image

from unstructured_inference.constants import Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.logger import logger
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
Expand Down Expand Up @@ -149,7 +150,7 @@ def image_processing(
text=None,
type=detected_class,
prob=prob,
source="yolox",
source=Source.YOLOX,
)

regions.append(region)
Expand Down