Unstructured-IO · benjats07 · Sep 26, 2023 · Sep 23, 2023 · Sep 23, 2023 · Sep 23, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.6.5-dev0
+
+* Fix `source` property for elements generated by pdfminer.
+* Add `OCR-tesseract` and `OCR-paddle` as sources for elements generated by OCR.
+
 ## 0.6.4
 
 * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -13,6 +13,7 @@
 from unstructured_inference.constants import OCRMode
 from unstructured_inference.inference import elements, layout, layoutelement
 from unstructured_inference.models import chipper, detectron2, tesseract
+from unstructured_inference.models.base import get_model
 from unstructured_inference.models.unstructuredmodel import (
     UnstructuredElementExtractionModel,
     UnstructuredObjectDetectionModel,
@@ -117,6 +118,19 @@ def detect(self, *args):
     assert elements.ocr(text_block, image=image) == ""
 
 
+def test_ocr_source():
+    file = "sample-docs/loremipsum-flat.pdf"
+    model = get_model("yolox_tiny")
+    doc = layout.DocumentLayout.from_file(
+        file,
+        model,
+        ocr_mode=OCRMode.FULL_PAGE.value,
+        supplement_with_ocr_elements=True,
+        ocr_strategy="force",
+    )
+    assert "OCR-tesseract" in {e.source for e in doc.pages[0].elements}
+
+
 class MockLayoutModel:
     def __init__(self, layout):
         self.layout_return = layout
@@ -678,6 +692,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected):
 @pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"])
 def test_load_pdf(filename):
     layouts, images = layout.load_pdf(f"sample-docs/{filename}")
+    assert "pdfminer" in {e.source for e in layouts[0]}
     assert len(layouts)
     for lo in layouts:
         assert len(lo)

diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.4"  # pragma: no cover
+__version__ = "0.6.5-dev0"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -677,7 +677,14 @@ def load_pdf(
                 else:
                     continue
 
-            text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text)
+            text_region = element_class(
+                x1 * coef,
+                y1 * coef,
+                x2 * coef,
+                y2 * coef,
+                text=_text,
+                source="pdfminer",
+            )
 
             if text_region.area > 0:
                 layout.append(text_region)
@@ -738,7 +745,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]:
         (x1, y1, x2, y2) = l, t, l + w, t + h
         text = ocr_data["text"][i]
         if text:
-            text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR")
+            text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR-tesseract")
             text_regions.append(text_region)
 
     return text_regions
@@ -774,7 +781,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]:
             y2 = max([i[1] for i in line[0]])
             text = line[1][0]
             if text:
-                text_region = TextRegion(x1, y1, x2, y2, text)
+                text_region = TextRegion(x1, y1, x2, y2, text, source="OCR-paddle")
                 text_regions.append(text_region)
 
     return text_regions
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -332,7 +332,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE
             r.x2,
             r.y2,
             text=r.text,
-            source=None,
+            source=r.source,
             type="UncategorizedText",
         )
         for r in merged_regions
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.6.4" # pragma: no cover
		+__version__ = "0.6.5-dev0" # pragma: no cover