From bfdf35798762bd656acef2ef9ade0e9c780118e0 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 20 Sep 2023 22:53:28 -0700 Subject: [PATCH 01/11] chore: changelog repair (#221) --- CHANGELOG.md | 8 ++------ unstructured_inference/__version__.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a07c7be..3eb5b33d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,11 @@ +## 0.5.32-dev0 + ## 0.5.31 * Add functionality to extract and save images from the page * Add functionality to get only "true" embedded images when extracting elements from PDF pages * Update the layout visualization script to be able to show only image elements if need - -## 0.5.30 - * add an evaluation metric for table comparison based on token similarity - -## 0.5.29-dev0 - * fix paddle unit tests where `make test` fails since paddle doesn't work on M1/M2 chip locally ## 0.5.28 diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 68f3020f..09d2007b 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.5.31" # pragma: no cover +__version__ = "0.5.32-dev0" # pragma: no cover From 5e73202530d1faa995652144cece8aa8804decf2 Mon Sep 17 00:00:00 2001 From: Yao You Date: Thu, 21 Sep 2023 08:55:38 -0500 Subject: [PATCH 02/11] feat: add config class (#218) - add a dataclass that contains configurations for inference processes - the parameters can be specified via env variables, which override the default values; this allows for flexible setup in different applications/deployments - currently contains specifications for table and layout related parameters - followup needed to identify other parameters that can be added to this config class --- CHANGELOG.md | 5 +- test_unstructured_inference/test_config.py | 11 +++ test_unstructured_inference/test_utils.py | 2 +- unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 82 +++++++++++++++++++ unstructured_inference/inference/elements.py | 14 ++-- .../inference/layoutelement.py | 7 +- unstructured_inference/models/tables.py | 13 ++- unstructured_inference/utils.py | 6 +- 9 files changed, 122 insertions(+), 20 deletions(-) create mode 100644 test_unstructured_inference/test_config.py create mode 100644 unstructured_inference/config.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3eb5b33d..4edd4d73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,7 @@ -## 0.5.32-dev0 +## 0.6.0 + +* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environment variables +* update behavior of `pad_image_with_background_color` so that input `pad` is applied to all sides ## 0.5.31 diff --git a/test_unstructured_inference/test_config.py b/test_unstructured_inference/test_config.py new file mode 100644 index 00000000..1a56c9ff --- /dev/null +++ b/test_unstructured_inference/test_config.py @@ -0,0 +1,11 @@ +def test_default_config(): + from unstructured_inference.config import inference_config + + assert inference_config.TABLE_IMAGE_CROP_PAD == 12 + + +def test_env_override(monkeypatch): + monkeypatch.setenv("TABLE_IMAGE_CROP_PAD", 1) + from unstructured_inference.config import inference_config + + assert inference_config.TABLE_IMAGE_CROP_PAD == 1 diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py index 29e6c13e..bc30c0e5 100644 --- a/test_unstructured_inference/test_utils.py +++ 
b/test_unstructured_inference/test_utils.py @@ -135,7 +135,7 @@ def test_annotate_layout_elements_with_plot_result(): def test_pad_image_with_background_color(mock_pil_image): pad = 10 height, width = mock_pil_image.size - padded = pad_image_with_background_color(mock_pil_image, pad * 2, "black") + padded = pad_image_with_background_color(mock_pil_image, pad, "black") assert padded.size == (height + 2 * pad, width + 2 * pad) np.testing.assert_array_almost_equal( np.array(padded.crop((pad, pad, width + pad, height + pad))), diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 09d2007b..3b3ca6c4 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.5.32-dev0" # pragma: no cover +__version__ = "0.6.0" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py new file mode 100644 index 00000000..5f1f90f3 --- /dev/null +++ b/unstructured_inference/config.py @@ -0,0 +1,82 @@ +""" +This module contains variables that are permitted to be tweaked by the system environment. For +example, model parameters that change the output of an inference call. Constants do NOT belong in +this module. Constants are values that are usually names for common options (e.g., color names) or +settings that should not be altered without making a code change (e.g., definition of 1Gb of memory +in bytes). Constants should go into `./constants.py` +""" +import os +from dataclasses import dataclass + + +@dataclass +class InferenceConfig: +    """class for configuring inference parameters""" + +    def _get_string(self, var: str, default_value: str = "") -> str: +        """attempt to get the value of var from the os environment; if not present return the +        default_value""" +        return os.environ.get(var, default_value) + +    def _get_int(self, var: str, default_value: int) -> int: +        if value := self._get_string(var): +            return int(value) +        return default_value + +    def _get_float(self, var: str, default_value: float) -> float: +        if value := self._get_string(var): +            return float(value) +        return default_value + +    @property +    def TABLE_IMAGE_CROP_PAD(self) -> int: +        """extra image content to add around an identified table region; measured in pixels + +        The padding adds image data around an identified table bounding box for downstream table +        structure detection model use as input +        """ +        return self._get_int("TABLE_IMAGE_CROP_PAD", 12) + +    @property +    def TABLE_IMAGE_BACKGROUND_PAD(self) -> int: +        """number of pixels to pad around a table image with a white background color + +        The padding adds NO image data around an identified table bounding box; it simply adds white +        background around the image +        """ +        return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0) + +    @property +    def LAYOUT_SAME_REGION_THRESHOLD(self) -> float: +        """threshold for two layouts' bounding boxes to be considered as the same region + +        When the intersection area over union area of the two is larger than this threshold the two +        boxes are considered the same region +        """ +        return self._get_float("LAYOUT_SAME_REGION_THRESHOLD", 0.75) + +    @property +    def LAYOUT_SUBREGION_THRESHOLD(self) -> float: +        """threshold for one bounding box to be considered as a sub-region of another bounding box + +        When the intersection region area divided by self area is larger than this threshold self is +        considered a subregion of the other +        """ +        return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75) + +    @property +    def 
ELEMENTS_H_PADDING_COEF(self) -> float: + """When extending the boundaries of a PDF object for the purpose of determining which other + elements should be considered in the same text region, we use a relative distance based on + some fraction of the block height (typically character height). This is the fraction used + for the horizontal extension applied to the left and right sides. + """ + return self._get_float("ELEMENTS_H_PADDING_COEF", 0.4) + + @property + def ELEMENTS_V_PADDING_COEF(self) -> float: + """Same as ELEMENTS_H_PADDING_COEF but the vertical extension.""" + return self._get_float("ELEMENTS_V_PADDING_COEF", 0.3) + + +inference_config = InferenceConfig() diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 69ea4c19..66cf7e26 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -11,18 +11,11 @@ from PIL import Image from scipy.sparse.csgraph import connected_components +from unstructured_inference.config import inference_config from unstructured_inference.logger import logger from unstructured_inference.math import safe_division from unstructured_inference.models import tesseract -# When extending the boundaries of a PDF object for the purpose of determining which other elements -# should be considered in the same text region, we use a relative distance based on some fraction of -# the block height (typically character height). This is the fraction used for the horizontal -# extension applied to the left and right sides. -H_PADDING_COEF = 0.4 -# Same as above but the vertical extension. -V_PADDING_COEF = 0.3 - @dataclass class Rectangle: @@ -156,7 +149,10 @@ def partition_groups_from_regions(regions: Collection[Rectangle]) -> List[List[R """Partitions regions into groups of regions based on proximity. 
Returns list of lists of regions, each list corresponding with a group""" padded_regions = [ - r.vpad(r.height * V_PADDING_COEF).hpad(r.height * H_PADDING_COEF) for r in regions + r.vpad(r.height * inference_config.ELEMENTS_V_PADDING_COEF).hpad( + r.height * inference_config.ELEMENTS_H_PADDING_COEF, + ) + for r in regions ] intersection_mtx = intersections(*padded_regions) diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 5e00388d..c91bb5bc 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -8,6 +8,7 @@ from pandas import DataFrame from PIL import Image +from unstructured_inference.config import inference_config from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR from unstructured_inference.inference.elements import ( ImageTextRegion, @@ -79,7 +80,7 @@ def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str: tables.load_agent() if tables.tables_agent is None: raise RuntimeError("Unable to load table extraction agent.") - padded_block = text_block.pad(12) + padded_block = text_block.pad(inference_config.TABLE_IMAGE_CROP_PAD) cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2)) return tables.tables_agent.predict(cropped_image) @@ -90,8 +91,8 @@ def merge_inferred_layout_with_extracted_layout( page_image_size: tuple, ocr_layout: Optional[List[TextRegion]] = None, supplement_with_ocr_elements: bool = True, - same_region_threshold: float = 0.75, - subregion_threshold: float = 0.75, + same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD, + subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD, ) -> List[LayoutElement]: """Merge two layouts to produce a single layout.""" extracted_elements_to_add: List[TextRegion] = [] diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 0c79e8cd..bd1bc4ce 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -14,6 +14,7 @@ from PIL import Image from transformers import DetrImageProcessor, TableTransformerForObjectDetection +from unstructured_inference.config import inference_config from unstructured_inference.logger import logger from unstructured_inference.models.table_postprocess import Rect from unstructured_inference.models.unstructuredmodel import UnstructuredModel @@ -113,7 +114,11 @@ def get_tokens(self, x: Image): return tokens - def get_structure(self, x: Image, pad_for_structure_detection: int = 50) -> dict: + def get_structure( + self, + x: Image, + pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD, + ) -> dict: """get the table structure as a dictionary contaning different types of elements as key-value pairs; check table-transformer documentation for more information""" with torch.no_grad(): @@ -126,7 +131,11 @@ def get_structure(self, x: Image, pad_for_structure_detection: int = 50) -> dict outputs_structure["pad_for_structure_detection"] = pad_for_structure_detection return outputs_structure - def run_prediction(self, x: Image, pad_for_structure_detection: int = 50): + def run_prediction( + self, + x: Image, + pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD, + ): """Predict table structure""" outputs_structure = self.get_structure(x, pad_for_structure_detection) tokens = self.get_tokens(x=x) diff 
--git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index 9235ce68..f21daf8f 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -130,7 +130,7 @@ def pad_image_with_background_color( pad: int = 10, background_color: str = "white", ) -> Image.Image: - """pads an input image with the same background color around it by pad//2 on all 4 sides + """pads an input image with the same background color around it by pad on all 4 sides The original image is kept intact and a new image is returned with padding added. """ @@ -139,6 +139,6 @@ def pad_image_with_background_color( raise ValueError( "Can not pad an image with negative space! Please use a positive value for `pad`.", ) - new = Image.new(image.mode, (width + pad, height + pad), background_color) - new.paste(image, (pad // 2, pad // 2)) + new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color) + new.paste(image, (pad, pad)) return new From eaa8d65c69b4aab2c69545e3b556865bbb1d9ddd Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Thu, 21 Sep 2023 10:02:40 -0600 Subject: [PATCH 03/11] Fix/nested bounding boxes (#201) This PR implements two major changes: * Replaces detectron2 with Yolox_quantized as the default model * Introduces an algorithm for reducing nested elements detected by any model. As a benefit of these, it is now possible to detect more diverse element types. * Adds a property to `Rectangle` class to register the origin of the data. * Adds functionality to the `annotate` function to skip elements of certain origins * Adds functionality to the `annotate` function to print additional details of bounding boxes * Test updates --------- Co-authored-by: Alan Bertl Co-authored-by: qued <64741807+qued@users.noreply.github.com> Co-authored-by: Yao You --- CHANGELOG.md | 8 ++ test_unstructured_inference/conftest.py | 9 +- .../inference/test_layout.py | 93 +++++++++---- .../inference/test_layout_element.py | 25 +++- .../models/test_model.py | 76 ++++++++++- .../models/test_yolox.py | 13 +- test_unstructured_inference/test_elements.py | 61 ++++++++- .../test_visualization.py | 27 +--- unstructured_inference/__version__.py | 2 +- unstructured_inference/inference/elements.py | 18 ++- unstructured_inference/inference/layout.py | 31 ++++- .../inference/layoutelement.py | 61 ++++++++- unstructured_inference/models/base.py | 2 +- .../models/detectron2onnx.py | 1 + .../models/unstructuredmodel.py | 129 +++++++++++++++++- unstructured_inference/models/yolox.py | 28 ++-- unstructured_inference/utils.py | 14 +- unstructured_inference/visualize.py | 77 ++++------- 18 files changed, 506 insertions(+), 169 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4edd4d73..5898e932 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.6.1 + +* YoloX_quantized is now the default model. This model detects more diverse types and detects tables better than the previous model. +* Since detection models tend to nest elements inside others (specifically in Tables), an algorithm has been added for reducing this + behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps + reduce duplicated content.
+* Add `source` property to our elements, so you can know where the information was generated (OCR or detection model) + ## 0.6.0 * add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environement variables diff --git a/test_unstructured_inference/conftest.py b/test_unstructured_inference/conftest.py index 761a0492..c20caece 100644 --- a/test_unstructured_inference/conftest.py +++ b/test_unstructured_inference/conftest.py @@ -28,7 +28,7 @@ def mock_text_region(): @pytest.fixture() def mock_layout_element(): - return LayoutElement(100, 100, 300, 300, text="Sample text", type="Text") + return LayoutElement(100, 100, 300, 300, text="Sample text", source=None, type="Text") @pytest.fixture() @@ -110,9 +110,9 @@ def mock_embedded_text_regions(): @pytest.fixture() def mock_ocr_regions(): return [ - EmbeddedTextRegion(10, 10, 90, 90, "0"), - EmbeddedTextRegion(200, 200, 300, 300, "1"), - EmbeddedTextRegion(500, 320, 600, 350, "3"), + EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None), + EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None), + EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None), ] @@ -141,6 +141,7 @@ def mock_inferred_layout(mock_embedded_text_regions): r.x2, r.y2, text=None, + source=None, type="Text", ) for r in mock_embedded_text_regions diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index bd5aa3e8..42bba9a2 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -28,9 +28,16 @@ def mock_image(): @pytest.fixture() def mock_initial_layout(): - text_block = layout.EmbeddedTextRegion(2, 4, 6, 8, text="A very repetitive narrative. " * 10) + text_block = layout.EmbeddedTextRegion( + 2, + 4, + 6, + 8, + text="A very repetitive narrative. " * 10, + source="Mock", + ) - title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title") + title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title", source="Mock") return [text_block, title_block] @@ -42,11 +49,20 @@ def mock_final_layout(): 4, 6, 8, + source="Mock", text="A very repetitive narrative. 
" * 10, type="NarrativeText", ) - title_block = layoutelement.LayoutElement(1, 2, 3, 4, text="A Catchy Title", type="Title") + title_block = layoutelement.LayoutElement( + 1, + 2, + 3, + 4, + source="Mock", + text="A Catchy Title", + type="Title", + ) return [text_block, title_block] @@ -709,8 +725,11 @@ def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-t assert element.text.startswith(test_snippets[i]) -@pytest.mark.parametrize("colors", ["red", None]) -def test_annotate(colors): +@pytest.mark.parametrize( + ("colors", "add_details", "threshold"), + [("red", False, 0.992), (None, False, 0.992), ("red", True, 0.8)], +) +def test_annotate(colors, add_details, threshold): def check_annotated_image(): annotated_array = np.array(annotated_image) for coords in [coords1, coords2]: @@ -722,9 +741,9 @@ def check_annotated_image(): assert all(annotated_array[y1:y2, x1, i] == expected) assert all(annotated_array[y1:y2, x2, i] == expected) # Make sure almost all the pixels are not changed - assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992 - assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992 - assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992 + assert ((annotated_array[:, :, 0] == 1).mean()) > threshold + assert ((annotated_array[:, :, 1] == 1).mean()) > threshold + assert ((annotated_array[:, :, 2] == 1).mean()) > threshold test_image_arr = np.ones((100, 100, 3), dtype="uint8") image = Image.fromarray(test_image_arr) @@ -735,15 +754,18 @@ def check_annotated_image(): rect2 = elements.Rectangle(*coords2) page.elements = [rect1, rect2] + annotated_image = page.annotate(colors=colors, add_details=add_details, sources=["all"]) + check_annotated_image() + # Scenario 1: where self.image exists - annotated_image = page.annotate(colors=colors) + annotated_image = page.annotate(colors=colors, add_details=add_details) check_annotated_image() # Scenario 2: where self.image is None, but self.image_path exists with patch.object(Image, "open", return_value=image): page.image = None page.image_path = "mock_path_to_image" - annotated_image = page.annotate(colors=colors) + annotated_image = page.annotate(colors=colors, add_details=add_details) check_annotated_image() @@ -775,32 +797,30 @@ def test_image_text_region(text, ocr_strategy, expected, mock_image): ) -@pytest.fixture() -def ordering_layout(): - elements = [ - layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"), - layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"), - layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"), - layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"), - layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"), - layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"), - layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"), - ] - return elements +class MockDetectionModel(layout.UnstructuredObjectDetectionModel): + def initialize(self, *args, **kwargs): + pass + + def predict(self, x): + return [ + layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"), + layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"), + layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"), + layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"), + layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"), + layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"), + 
layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"), + ] -def test_layout_order(mock_image, ordering_layout): +def test_layout_order(mock_image): with tempfile.TemporaryDirectory() as tmpdir: mock_image_path = os.path.join(tmpdir, "mock.jpg") mock_image.save(mock_image_path) - with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object( + with patch.object(layout, "get_model", lambda: MockDetectionModel()), patch.object( layout, "load_pdf", lambda *args, **kwargs: ([[]], [mock_image_path]), - ), patch.object( - layout, - "UnstructuredObjectDetectionModel", - object, ): doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf") page = doc.pages[0] @@ -960,3 +980,20 @@ def test_warning_if_chipper_and_low_dpi(caplog): mock_from_file.assert_called_once() assert caplog.records[0].levelname == "WARNING" assert "DPI >= 300" in caplog.records[0].msg + + +@pytest.mark.parametrize( + ("filename", "img_num", "should_complete"), + [("sample-docs/empty-document.pdf", 0, True), ("sample-docs/empty-document.pdf", 10, False)], +) +def test_get_image(filename, img_num, should_complete): + doc = layout.DocumentLayout.from_file(filename) + page = doc.pages[0] + try: + img = page._get_image(filename, img_num) + # transform img to numpy array + img = np.array(img) + # is a blank image with all pixels white + assert img.mean() == 255.0 + except ValueError: + assert not should_complete diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 9dfdb3d9..59727b62 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -17,12 +17,12 @@ def test_aggregate_ocr_text_by_block(): expected = "A Unified Toolkit" ocr_layout = [ - TextRegion(0, 0, 20, 20, "A"), - TextRegion(50, 50, 150, 150, "Unified"), - TextRegion(150, 150, 300, 250, "Toolkit"), - TextRegion(200, 250, 300, 350, "Deep"), + TextRegion(0, 0, 20, 20, source="OCR", text="A"), + TextRegion(50, 50, 150, 150, source="OCR", text="Unified"), + TextRegion(150, 150, 300, 250, source="OCR", text="Toolkit"), + TextRegion(200, 250, 300, 350, source="OCR", text="Deep"), ] - region = TextRegion(0, 0, 250, 350, "") + region = TextRegion(0, 0, 250, 350, text="") text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5) assert text == expected @@ -65,6 +65,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): r.x2, r.y2, text=r.text, + source=None, type="UncategorizedText", ) for r in mock_ocr_regions @@ -94,6 +95,7 @@ def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_re r.x2, r.y2, text=r.text, + source=None, type="UncategorizedText", ) for r in mock_ocr_regions @@ -138,6 +140,7 @@ def test_layout_element_do_dict(mock_layout_element): "text": "Sample text", "type": "Text", "prob": None, + "source": None, } assert mock_layout_element.to_dict() == expected @@ -157,6 +160,14 @@ def test_layout_element_from_lp_textblock(): score=0.99, ) - expected = LayoutElement(100, 100, 300, 300, "Sample Text", "Text", 0.99) - + expected = LayoutElement( + 100, + 100, + 300, + 300, + text="Sample Text", + source="detectron2_lp", + type="Text", + prob=0.99, + ) assert LayoutElement.from_lp_textblock(mock_text_block) == expected diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py index f5e00855..4ae6c08a 100644 --- 
a/test_unstructured_inference/models/test_model.py +++ b/test_unstructured_inference/models/test_model.py @@ -58,7 +58,7 @@ def test_raises_uninitialized(): def test_model_initializes_once(): from unstructured_inference.inference import layout - with mock.patch.object(models, "UnstructuredDetectronONNXModel", MockModel), mock.patch.object( + with mock.patch.object(models, "UnstructuredYoloXModel", MockModel), mock.patch.object( models, "models", {}, @@ -72,3 +72,77 @@ def test_model_initializes_once(): assert ( doc.pages[0].elements[0].prob is None ) # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability + + +def test_deduplicate_detected_elements(): + import numpy as np + + from unstructured_inference.inference.elements import intersections + from unstructured_inference.inference.layout import DocumentLayout + from unstructured_inference.models.base import get_model + + model = get_model("yolox_quantized") + # model.confidence_threshold=0.5 + file = "sample-docs/example_table.jpg" + doc = DocumentLayout.from_image_file( + file, + model, + ocr_strategy="never", + supplement_with_ocr_elements=False, + ) + known_elements = [e for e in doc.pages[0].elements if e.type != "UncategorizedText"] + # Compute intersection matrix + intersections_mtx = intersections(*known_elements) + # Get rid off diagonal (cause an element will always intersect itself) + np.fill_diagonal(intersections_mtx, False) + # Now all the elements should be False, because any intersection remains + return not intersections_mtx.all() + + +def test_enhance_regions(): + from unstructured_inference.inference.elements import Rectangle + from unstructured_inference.models.base import get_model + + elements = [ + Rectangle(0, 0, 1, 1), + Rectangle(0.01, 0.01, 1.01, 1.01), + Rectangle(0.02, 0.02, 1.02, 1.02), + Rectangle(0.03, 0.03, 1.03, 1.03), + Rectangle(0.04, 0.04, 1.04, 1.04), + Rectangle(0.05, 0.05, 1.05, 1.05), + Rectangle(0.06, 0.06, 1.06, 1.06), + Rectangle(0.07, 0.07, 1.07, 1.07), + Rectangle(0.08, 0.08, 1.08, 1.08), + Rectangle(0.09, 0.09, 1.09, 1.09), + Rectangle(0.10, 0.10, 1.10, 1.10), + ] + model = get_model("yolox_tiny") + elements = model.enhance_regions(elements, 0.5) + assert len(elements) == 1 + assert (elements[0].x1, elements[0].y1, elements[0].x2, elements[0].x2) == (0, 0, 1.10, 1.10) + + +def test_clean_type(): + from unstructured_inference.inference.layout import LayoutElement + from unstructured_inference.models.base import get_model + + elements = [ + LayoutElement( + 0.6, + 0.6, + 0.65, + 0.65, + type="Table", + ), # One little table nested inside all the others + LayoutElement(0.5, 0.5, 0.7, 0.7, type="Table"), # One nested table + LayoutElement(0, 0, 1, 1, type="Table"), # Big table + LayoutElement(0.01, 0.01, 1.01, 1.01), + LayoutElement(0.02, 0.02, 1.02, 1.02), + LayoutElement(0.03, 0.03, 1.03, 1.03), + LayoutElement(0.04, 0.04, 1.04, 1.04), + LayoutElement(0.05, 0.05, 1.05, 1.05), + ] + model = get_model("yolox_tiny") + elements = model.clean_type(elements, type_to_clean="Table") + assert len(elements) == 1 + assert (elements[0].x1, elements[0].y1, elements[0].x2, elements[0].x2) == (0, 0, 1, 1) diff --git a/test_unstructured_inference/models/test_yolox.py b/test_unstructured_inference/models/test_yolox.py index 122892e5..317876eb 100644 --- a/test_unstructured_inference/models/test_yolox.py +++ b/test_unstructured_inference/models/test_yolox.py @@ -14,7 +14,9 @@ def test_layout_yolox_local_parsing_image(): # NOTE(benjamin) The example image should result in one page result 
assert len(document_layout.pages) == 1 # NOTE(benjamin) The example sent to the test contains 13 detections - assert len(document_layout.pages[0].elements) == 13 + types_known = ["Text", "Section-header", "Page-header"] + known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known] + assert len(known_regions) == 13 assert hasattr( document_layout.pages[0].elements[0], "prob", @@ -32,8 +34,9 @@ def test_layout_yolox_local_parsing_pdf(): content = str(document_layout) assert "libero fringilla" in content assert len(document_layout.pages) == 1 - # NOTE(benjamin) The example sent to the test contains 5 detections - assert len(document_layout.pages[0].elements) == 5 + # NOTE(benjamin) The example sent to the test contains 5 text detections + text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"] + assert len(text_elements) == 5 assert hasattr( document_layout.pages[0].elements[0], "prob", @@ -59,10 +62,10 @@ def test_layout_yolox_local_parsing_empty_pdf(): def test_layout_yolox_local_parsing_image_soft(): - filename = os.path.join("sample-docs", "test-image.jpg") + filename = os.path.join("sample-docs", "example_table.jpg") # NOTE(benjamin) keep_output = True create a file for each image in # localstorage for visualization of the result - document_layout = process_file_with_model(filename, model_name="yolox_tiny", is_image=True) + document_layout = process_file_with_model(filename, model_name="yolox_quantized", is_image=True) # NOTE(benjamin) The example image should result in one page result assert len(document_layout.pages) == 1 # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 56a35905..1c1be08c 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -7,6 +7,10 @@ from PIL import Image from unstructured_inference.inference import elements +from unstructured_inference.inference.layoutelement import ( + LocationlessLayoutElement, + separate, +) skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} @@ -115,7 +119,7 @@ def test_rectangle_area(monkeypatch): rect = elements.Rectangle(0, 0, 0, 0) mockheight.return_value = height mockwidth.return_value = width - assert rect.area() == width * height + assert rect.area == width * height def test_rectangle_iou(): @@ -125,16 +129,16 @@ def test_rectangle_iou(): rect2 = rand_rect(20) assert rect1.intersection_over_union(rect2) == rect2.intersection_over_union(rect1) if rect1.is_in(rect2): - assert rect1.intersection_over_union(rect2) == rect1.area() / rect2.area() + assert rect1.intersection_over_union(rect2) == rect1.area / rect2.area elif rect2.is_in(rect1): - assert rect1.intersection_over_union(rect2) == rect2.area() / rect1.area() + assert rect1.intersection_over_union(rect2) == rect2.area / rect1.area else: if rect1.intersection(rect2) is None: assert rect1.intersection_over_union(rect2) == 0.0 else: - intersection = rect1.intersection(rect2).area() + intersection = rect1.intersection(rect2).area assert rect1.intersection_over_union(rect2) == intersection / ( - rect1.area() + rect2.area() - intersection + rect1.area + rect2.area - intersection ) @@ -191,6 +195,53 @@ def test_intersection_over_min( ) +def test_grow_region_to_match_region(): + from unstructured_inference.inference.elements import Rectangle, grow_region_to_match_region + + a = Rectangle(1, 1, 2, 2) + b = 
Rectangle(1, 1, 5, 5) + grow_region_to_match_region(a, b) + assert a == Rectangle(1, 1, 5, 5) + + +def test_LocationlessLayoutElement(): + text = "Testing text" + type = "Type" + e = LocationlessLayoutElement(text, type) + assert e.to_dict() == {"text": text, "type": type} + + +@pytest.mark.parametrize( + ("rect1", "rect2", "expected"), + [ + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 5.1, 5.1), True), + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 5.2, 5.2), True), + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(7, 7, 10, 10), False), + ], +) +def test_is_almost_subregion_of(rect1, rect2, expected): + assert expected == rect2.is_almost_subregion_of(rect1) + + +@pytest.mark.parametrize( + ("rect1", "rect2"), + [ + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 6, 6)), + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(6, 6, 8, 8)), + (elements.Rectangle(3, 3, 7, 7), elements.Rectangle(2, 2, 4, 4)), + (elements.Rectangle(2, 2, 4, 11), elements.Rectangle(3, 3, 7, 10)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 3, 7, 10)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(2.5, 2.5, 3.5, 4.5)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 1, 4, 3.5)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 1, 4.5, 3.5)), + ], +) +def test_separate(rect1, rect2): + separate(rect1, rect2) + + # assert not rect1.intersects(rect2) #TODO: fix this test + + @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") def test_ocr_paddle(monkeypatch, caplog): monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") diff --git a/test_unstructured_inference/test_visualization.py b/test_unstructured_inference/test_visualization.py index c9a2978a..0524451f 100644 --- a/test_unstructured_inference/test_visualization.py +++ b/test_unstructured_inference/test_visualization.py @@ -5,32 +5,7 @@ from PIL import Image from unstructured_inference.inference.elements import Rectangle -from unstructured_inference.visualize import draw_bbox, draw_yolox_bounding_boxes, show_plot - - -@pytest.mark.parametrize( - ("y_coords", "x_coords"), - [ - (10, slice(10, 15)), - (10, slice(16, 50)), - (40, slice(1, 50)), - (slice(10, 40), 1), - (slice(10, 12), 50), - (slice(14, 16), 50), - (slice(19, 40), 50), - ], -) -def test_visualize(y_coords, x_coords): - test_image = np.ones((100, 100, 3)) - boxes = [[1, 10, 50, 40]] - annotated_img = draw_yolox_bounding_boxes( - test_image, - boxes, - scores=[0.8], - cls_ids=[0], - class_names=["thing"], - ) - assert annotated_img[y_coords, x_coords, 0].sum() == 0.0 +from unstructured_inference.visualize import draw_bbox, show_plot def test_draw_bbox(): diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 3b3ca6c4..a2b973a8 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.0" # pragma: no cover +__version__ = "0.6.1" # pragma: no cover diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 66cf7e26..1b965cda 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -102,6 +102,7 @@ def intersection(self, other: Rectangle) -> Optional[Rectangle]: return None return Rectangle(x1, y1, x2, y2) + @property def area(self) -> float: """Gives the area of the rectangle.""" return self.width * self.height @@ -111,8 +112,8 @@ def intersection_over_union(self, other: Rectangle) -> 
float: how similar the regions are. Returns 0 for disjoint rectangles, 1 for two identical rectangles -- area of intersection / area of union.""" intersection = self.intersection(other) - intersection_area = 0.0 if intersection is None else intersection.area() - union_area = self.area() + other.area() - intersection_area + intersection_area = 0.0 if intersection is None else intersection.area + union_area = self.area + other.area - intersection_area return safe_division(intersection_area, union_area) def intersection_over_minimum(self, other: Rectangle) -> float: @@ -120,8 +121,8 @@ def intersection_over_minimum(self, other: Rectangle) -> float: for identifying when one rectangle is almost-a-subset of the other. Returns 0 for disjoint rectangles, 1 when either is a subset of the other.""" intersection = self.intersection(other) - intersection_area = 0.0 if intersection is None else intersection.area() - min_area = min(self.area(), other.area()) + intersection_area = 0.0 if intersection is None else intersection.area + min_area = min(self.area, other.area) return safe_division(intersection_area, min_area) def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = 0.75) -> bool: @@ -129,9 +130,9 @@ def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = comparing the intersection area over self area to some threshold, and checking whether self is the smaller rectangle.""" intersection = self.intersection(other) - intersection_area = 0.0 if intersection is None else intersection.area() - return (subregion_threshold < safe_division(intersection_area, self.area())) and ( - self.area() <= other.area() + intersection_area = 0.0 if intersection is None else intersection.area + return (subregion_threshold < safe_division(intersection_area, self.area)) and ( + self.area <= other.area ) @@ -148,6 +149,8 @@ def minimal_containing_region(*regions: Rectangle) -> Rectangle: def partition_groups_from_regions(regions: Collection[Rectangle]) -> List[List[Rectangle]]: """Partitions regions into groups of regions based on proximity. Returns list of lists of regions, each list corresponding with a group""" + if len(regions) == 0: + return [] padded_regions = [ r.vpad(r.height * inference_config.ELEMENTS_V_PADDING_COEF).hpad( r.height * inference_config.ELEMENTS_H_PADDING_COEF, @@ -194,6 +197,7 @@ def intersections(*rects: Rectangle): @dataclass class TextRegion(Rectangle): text: Optional[str] = None + source: Optional[str] = None def __str__(self) -> str: return str(self.text) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 5768e2a4..2cdba3bc 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -261,7 +261,9 @@ def get_elements_with_detection_model( # NOTE(mrobinson) - We'll want make this model inference step some kind of # remote call in the future. 
inferred_layout: List[LayoutElement] = self.detection_model(self.image) - + inferred_layout = UnstructuredObjectDetectionModel.deduplicate_detected_elements( + inferred_layout, + ) if self.ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: ocr_layout = None elif self.ocr_mode == OCRMode.FULL_PAGE.value: @@ -339,6 +341,7 @@ def get_elements_with_detection_model( if inplace: self.elements = elements return None + return elements def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutElement]: @@ -400,8 +403,14 @@ def annotate( colors: Optional[Union[List[str], str]] = None, image_dpi: int = 200, annotation_data: Optional[dict[str, dict]] = None, + add_details: bool = False, + sources: List[str] = ["all"], ) -> Image.Image: - """Annotates the elements on the page image.""" + """Annotates the elements on the page image. + if add_details is True, and the elements contain type and source attributes, then + the type and source will be added to the image. + sources is a list of sources to annotate. If sources is ["all"], then all sources will be + annotated. Current sources allowed are "yolox","detectron2_onnx" and "detectron2_lp" """ if colors is None: colors = ["red" for _ in self.elements] if isinstance(colors, str): @@ -422,7 +431,9 @@ def annotate( if annotation_data is None: for el, color in zip(self.elements, colors): if isinstance(el, Rectangle): - img = draw_bbox(img, el, color=color) + required_source = getattr(el, "source", None) + if "all" in sources or required_source in sources: + img = draw_bbox(img, el, color=color, details=add_details) else: for attribute, style in annotation_data.items(): if hasattr(self, attribute) and getattr(self, attribute): @@ -430,7 +441,15 @@ def annotate( width = style["width"] for region in getattr(self, attribute): if isinstance(region, Rectangle): - img = draw_bbox(img, region, color=color, width=width) + required_source = getattr(el, "source", None) + if "all" in sources or required_source in sources: + img = draw_bbox( + img, + region, + color=color, + width=width, + details=add_details, + ) return img @@ -660,7 +679,7 @@ def load_pdf( text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text) - if text_region.area() > 0: + if text_region.area > 0: layout.append(text_region) layouts.append(layout) @@ -719,7 +738,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: (x1, y1, x2, y2) = l, t, l + w, t + h text = ocr_data["text"][i] if text: - text_region = TextRegion(x1, y1, x2, y2, text) + text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR") text_regions.append(text_region) return text_regions diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index c91bb5bc..887ecc33 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Collection, List, Optional, cast +from typing import Collection, List, Optional, Union, cast import numpy as np from layoutparser.elements.layout import TextBlock @@ -18,7 +18,6 @@ partition_groups_from_regions, region_bounding_boxes_are_almost_the_same, ) -from unstructured_inference.models import tables @dataclass @@ -54,6 +53,7 @@ def to_dict(self) -> dict: "text": self.text, "type": self.type, "prob": self.prob, + "source": self.source, } return out_dict @@ -63,7 +63,9 @@ def from_region(cls, region: Rectangle): x1, y1, x2, y2 = 
region.x1, region.y1, region.x2, region.y2 text = region.text if hasattr(region, "text") else None type = region.type if hasattr(region, "type") else None - return cls(x1, y1, x2, y2, text, type) + prob = region.prob if hasattr(region, "prob") else None + source = region.source if hasattr(region, "source") else None + return cls(x1, y1, x2, y2, text=text, source=source, type=type, prob=prob) @classmethod def from_lp_textblock(cls, textblock: TextBlock): @@ -71,12 +73,14 @@ def from_lp_textblock(cls, textblock: TextBlock): x1, y1, x2, y2 = textblock.coordinates text = textblock.text type = textblock.type - score = textblock.score - return cls(x1, y1, x2, y2, text, type, prob=score) + prob = textblock.score + return cls(x1, y1, x2, y2, text=text, source="detectron2_lp", type=type, prob=prob) def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str: """Extract the contents of a table.""" + from unstructured_inference.models import tables + tables.load_agent() if tables.tables_agent is None: raise RuntimeError("Unable to load table extraction agent.") @@ -159,6 +163,7 @@ def merge_inferred_layout_with_extracted_layout( el.y2, text=el.text, type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText", + source=el.source, ) for el in extracted_elements_to_add ] @@ -305,8 +310,9 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: max_y2 = max([tr.y2 for tr in regions]) merged_text = " ".join([tr.text for tr in regions if tr.text]) - - return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) + sources = [*{tr.source for tr in regions}] + source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore + return TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: @@ -326,12 +332,53 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE r.x2, r.y2, text=r.text, + source=None, type="UncategorizedText", ) for r in merged_regions ] +def separate(region_a: Union[LayoutElement, Rectangle], region_b: Union[LayoutElement, Rectangle]): + """Reduce leftmost rectangle to don't overlap with the other""" + + def reduce(keep: Rectangle, reduce: Rectangle): + # Asume intersection + + # Other is down + if reduce.y2 > keep.y2 and reduce.x1 < keep.x2: + # other is down-right + if reduce.x2 > keep.x2 and reduce.y2 > keep.y2: + reduce.x1 = keep.x2 * 1.01 + reduce.y1 = keep.y2 * 1.01 + return + # other is down-left + if reduce.x1 < keep.x1 and reduce.y1 < keep.y2: + reduce.y1 = keep.y2 + return + # other is centered + reduce.y1 = keep.y2 + else: # other is up + # other is up-right + if reduce.x2 > keep.x2 and reduce.y1 < keep.y1: + reduce.y2 = keep.y1 + return + # other is left + if reduce.x1 < keep.x1 and reduce.y1 < keep.y1: + reduce.y2 = keep.y1 + return + # other is centered + reduce.y2 = keep.y1 + + if not region_a.intersects(region_b): + return + else: + if region_a.area > region_b.area: + reduce(keep=region_a, reduce=region_b) + else: + reduce(keep=region_b, reduce=region_a) + + # NOTE(alan): The right way to do this is probably to rewrite LayoutElement as well as the different # Region types to not subclass Rectangle, and instead have an optional bbox property that is a # Rectangle. I or someone else will have to get to that later. 
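As an illustration of the overlap-handling logic added in this patch, the sketch below shows how the new `separate` helper behaves on two overlapping regions. It assumes only the `Rectangle` and `separate` signatures introduced above; the coordinates are invented for illustration, and, as the TODO in `test_separate` notes, full separation is not guaranteed for every overlap pattern.

```python
from unstructured_inference.inference.elements import Rectangle
from unstructured_inference.inference.layoutelement import separate

# A large detected region and a smaller one overlapping its bottom-right corner.
big = Rectangle(0, 0, 10, 10)
small = Rectangle(8, 8, 12, 12)

# separate() keeps the larger rectangle intact and shrinks the smaller one so the
# two regions no longer overlap for this down-right configuration.
separate(big, small)

print(big.intersects(small))  # False: the smaller box was trimmed out of the overlap
print(small.x1, small.y1)     # now starts just past the kept box's bottom-right corner
```

In `enhance_regions`, this trimming is applied only when the intersection-over-minimum score of two boxes is small; larger overlaps are instead resolved by growing one region to cover the other.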
diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py index b0b2898e..a8323f1b 100644 --- a/unstructured_inference/models/base.py +++ b/unstructured_inference/models/base.py @@ -23,7 +23,7 @@ UnstructuredYoloXModel, ) -DEFAULT_MODEL = "detectron2_onnx" +DEFAULT_MODEL = "yolox_quantized" models: Dict[str, UnstructuredModel] = {} diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index b37f3d72..7b9df081 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -158,6 +158,7 @@ def postprocess( text=None, type=detected_class, prob=conf, + source="detectron2_onnx", ) regions.append(region) diff --git a/unstructured_inference/models/unstructuredmodel.py b/unstructured_inference/models/unstructuredmodel.py index 8d61b49a..634f2dd6 100644 --- a/unstructured_inference/models/unstructuredmodel.py +++ b/unstructured_inference/models/unstructuredmodel.py @@ -1,10 +1,18 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, List +from typing import TYPE_CHECKING, Any, List, cast +import numpy as np from PIL.Image import Image +from unstructured_inference.inference.elements import ( + grow_region_to_match_region, + intersections, + partition_groups_from_regions, +) +from unstructured_inference.inference.layoutelement import separate + if TYPE_CHECKING: from unstructured_inference.inference.layoutelement import ( LayoutElement, @@ -55,6 +63,125 @@ def __call__(self, x: Image) -> List[LayoutElement]: """Inference using function call interface.""" return super().__call__(x) + @staticmethod + def enhance_regions( + elements: List[LayoutElement], + iom_to_merge: float = 0.3, + ) -> List[LayoutElement]: + """This function traverses all the elements and either deletes nested elements, + or merges or splits them depending on the iom score for both regions""" + intersections_mtx = intersections(*elements) + + for i, row in enumerate(intersections_mtx): + first = elements[i] + if first: + # We get only the elements which intersected + indices_to_check = np.where(row)[0] + # Delete the first element, since it will always intersect with itself + indices_to_check = indices_to_check[indices_to_check != i] + if len(indices_to_check) == 0: + continue + if len(indices_to_check) > 1: # sort by iom + iom_to_check = [ + (j, first.intersection_over_minimum(elements[j])) + for j in indices_to_check + if elements[j] is not None + ] + iom_to_check.sort( + key=lambda x: x[1], + reverse=True, + ) # sort elements by iom, so we first check the greatest + indices_to_check = [x[0] for x in iom_to_check if x[0] != i] # type:ignore + for j in indices_to_check: + if elements[j] is None or elements[i] is None: + continue + second = elements[j] + intersection = first.intersection( + second, + ) # we know it does, but need the region + first_inside_second = first.is_in(second) + second_inside_first = second.is_in(first) + + if first_inside_second and not second_inside_first: + elements[i] = None # type:ignore + elif second_inside_first and not first_inside_second: + # delete second element + elements[j] = None # type:ignore + elif intersection: + iom = first.intersection_over_minimum(second) + if iom < iom_to_merge: # small + separate(first, second) + # The rectangle could become too small, which is a + # good size to delete? 
+ else: # big + # merge + if first.area > second.area: + grow_region_to_match_region(first, second) + elements[j] = None # type:ignore + else: + grow_region_to_match_region(second, first) + elements[i] = None # type:ignore + + elements = [e for e in elements if e is not None] + return elements + + @staticmethod + def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]: + """After this function, the list of elements will not contain any element inside + of the type specified""" + target_elements = [e for e in elements if e.type == type_to_clean] + other_elements = [e for e in elements if e.type != type_to_clean] + if len(target_elements) == 0 or len(other_elements) == 0: + return elements + + # Sort elements from biggest to smallest + target_elements.sort(key=lambda e: e.area, reverse=True) + other_elements.sort(key=lambda e: e.area, reverse=True) + + # First check if targets contains each other + for element in target_elements: # Just handles containment or little overlap + contains = [ + e for e in target_elements if e.is_almost_subregion_of(element) and e != element + ] + for contained in contains: + target_elements.remove(contained) + # Then check if remaining elements intersect with targets + other_elements = filter( + lambda e: not any(e.is_almost_subregion_of(target) for target in target_elements), + other_elements, + ) # type:ignore + + final_elements = list(other_elements) + final_elements.extend(target_elements) + # Note(benjamin): could use bisect.insort, + # but need to add < operator for + # LayoutElement in python <3.10 + final_elements.sort(key=lambda e: e.y1) + return final_elements + + @staticmethod + def deduplicate_detected_elements( + elements: List[LayoutElement], + min_text_size: int = 15, + ) -> List[LayoutElement]: + """Deletes overlapping elements in a list of elements.""" + + if len(elements) <= 1: + return elements + + cleaned_elements: List[LayoutElement] = [] + # TODO: Delete nested elements with low or None probability + # TODO: Keep most confident + # TODO: Better to grow horizontally than vertically? 
+ groups_tmp = partition_groups_from_regions(elements) + groups = cast(List[List["LayoutElement"]], groups_tmp) + for g in groups: + all_types = {e.type for e in g} + for type in all_types: + g = UnstructuredObjectDetectionModel.clean_type(g, type) + cleaned_elements.extend(g) + return cleaned_elements + class UnstructuredElementExtractionModel(UnstructuredModel): """Wrapper class for object extraction models used by unstructured.""" diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index c4615cc1..f5103698 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -18,7 +18,6 @@ from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel from unstructured_inference.utils import LazyDict, LazyEvaluateInfo -from unstructured_inference.visualize import draw_yolox_bounding_boxes YOLOX_LABEL_MAP = { 0: "Caption", @@ -111,7 +110,6 @@ def image_processing( input_shape = (1024, 768) origin_img = np.array(image) img, ratio = preprocess(origin_img, input_shape) - # TODO (benjamin): We should use models.get_model() but currenly returns Detectron model session = self.model ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]} @@ -143,7 +141,16 @@ def image_processing( # being (x1,y1) the top left and (x2,y2) the bottom right x1, y1, x2, y2, prob, class_id = det.tolist() detected_class = self.layout_classes[int(class_id)] - region = LayoutElement(x1, y1, x2, y2, text=None, type=detected_class, prob=prob) + region = LayoutElement( + x1, + y1, + x2, + y2, + text=None, + type=detected_class, + prob=prob, + source="yolox", + ) regions.append(region) @@ -153,21 +160,6 @@ def image_processing( return page_layout - def annotate_image(self, image_fn, dets, out_fn): - """Draw bounding boxes and prediction metadata.""" - origin_img = np.array(Image.open(image_fn)) - final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] - - annotated_image = draw_yolox_bounding_boxes( - origin_img, - final_boxes, - final_scores, - final_cls_inds, - conf=0.3, - class_names=self.layout_classes, - ) - cv2.imwrite(out_fn, annotated_image) - # Note: preprocess function was named preproc on original source diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index f21daf8f..c5a70091 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -1,12 +1,13 @@ import os from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Union +from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union import cv2 import numpy as np from PIL import Image from unstructured_inference.constants import AnnotationResult +from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.visualize import show_plot if TYPE_CHECKING: @@ -125,6 +126,17 @@ def annotate_layout_elements( show_plot(img, desired_width=plot_desired_width) +def tag(elements: Iterable[LayoutElement]): + """Asign an numeric id to the elements in the list. 
+ Useful for debugging""" + colors = ["red", "blue", "green", "magenta", "brown"] + for i, e in enumerate(elements): + e.text = f"-{i}-:{e.text}" + # currently not a property + e.id = i # type:ignore + e.color = colors[i % len(colors)] # type:ignore + + def pad_image_with_background_color( image: Image.Image, pad: int = 10, diff --git a/unstructured_inference/visualize.py b/unstructured_inference/visualize.py index b1c47fba..8cba46ac 100644 --- a/unstructured_inference/visualize.py +++ b/unstructured_inference/visualize.py @@ -1,68 +1,43 @@ # Copyright (c) Megvii Inc. All rights reserved. # Unstructured modified the original source code found at # https://github.com/Megvii-BaseDetection/YOLOX/blob/ac379df3c97d1835ebd319afad0c031c36d03f36/yolox/utils/visualize.py +import typing from typing import Optional, Union -import cv2 import matplotlib.pyplot as plt import numpy as np +from PIL import ImageFont from PIL.Image import Image from PIL.ImageDraw import ImageDraw from unstructured_inference.inference.elements import Rectangle -def draw_bbox(image: Image, rect: Rectangle, color: str = "red", width=1) -> Image: +@typing.no_type_check +def draw_bbox( + image: Image, + rect: Rectangle, + color: str = "red", + width=1, + details: bool = False, +) -> Image: """Draws bounding box in image""" - img = image.copy() - draw = ImageDraw(img) - topleft, _, bottomright, _ = rect.coordinates - draw.rectangle((topleft, bottomright), outline=color, width=width) - return img - - -# NOTE: in original files from YoloX 'draw_yolox_bounding_boxes' function is named "vis" -# TODO(alan): Need type hints here -def draw_yolox_bounding_boxes(img, boxes, scores, cls_ids, conf=0.5, class_names=None): - """ - This function draws bounding boxes over the img argument, using - boxes from detections from YoloX. - img is a numpy array from cv2.imread() - Scores refers to the probability of each detection. - cls_ids are the class of each detection - conf is the confidence required to draw the bounding box - class_names is a list, where class_names[cls_ids[i]] should be the name - for the i-th bounding box. - """ - for i in range(len(boxes)): - box = boxes[i] - cls_id = int(cls_ids[i]) - score = scores[i] - if score < conf: - continue - x0 = int(box[0]) - y0 = int(box[1]) - x1 = int(box[2]) - y1 = int(box[3]) - - color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() - text = f"{class_names[cls_id]}:{score * 100:.1f}%" - txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) - font = cv2.FONT_HERSHEY_SIMPLEX - - txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] - cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) - - txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() - cv2.rectangle( - img, - (x0, y0 + 1), - (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), - txt_bk_color, - -1, - ) - cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) - + try: + img = image.copy() + draw = ImageDraw(img) + topleft, _, bottomright, _ = rect.coordinates + c = getattr(rect, "color", color) + if details: + source = getattr(rect, "source", "Unknown") + type = getattr(rect, "type", "") + kbd = ImageFont.truetype("Keyboard.ttf", 20) + draw.text(topleft, text=f"{type} {source}", fill=c, font=kbd) + draw.rectangle((topleft, bottomright), outline=c, width=width) + except OSError: + print("Failed to find font file. 
Skipping details.") + img = draw_bbox(image, rect, color, width) + except Exception as e: + print(f"Failed to draw bounding box: {e}") return img From 8c6d6693c3d80c11f19e5a4f4138822a6479406f Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 22 Sep 2023 13:37:59 -0500 Subject: [PATCH 04/11] feat: make table transformer parameters configurable (#224) - refactor `tables.py` so that the structure element confidence threshold values are loaded from `inference_config` - refactor intersection over box area threshold in `objects_to_structure` to config intead of using hardwired value of 0.5 (default is still 0.5) --- CHANGELOG.md | 4 +++ unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 36 +++++++++++++++++++++++++ unstructured_inference/models/tables.py | 33 +++++++++++++++-------- 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5898e932..cc9e79b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.2 + +* move the confidence threshold for table transformer to config + ## 0.6.1 * YoloX_quantized is now the default model. This models detects most diverse types and detect tables better than previous model. diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index a2b973a8..f42820dd 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.1" # pragma: no cover +__version__ = "0.6.2" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py index 5f1f90f3..e8f287e9 100644 --- a/unstructured_inference/config.py +++ b/unstructured_inference/config.py @@ -46,6 +46,42 @@ def TABLE_IMAGE_BACKGROUND_PAD(self) -> int: """ return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0) + @property + def TT_TABLE_CONF(self) -> float: + """confidence threshold for table identified by table transformer""" + return self._get_float("TT_TABLE_CONF", 0.5) + + @property + def TABLE_COLUMN_CONF(self) -> float: + """confidence threshold for column identified by table transformer""" + return self._get_float("TABLE_COLUMN_CONF", 0.5) + + @property + def TABLE_ROW_CONF(self) -> float: + """confidence threshold for column identified by table transformer""" + return self._get_float("TABLE_ROW_CONF", 0.5) + + @property + def TABLE_COLUMN_HEADER_CONF(self) -> float: + """confidence threshold for column header identified by table transformer""" + return self._get_float("TABLE_COLUMN_HEADER_CONF", 0.5) + + @property + def TABLE_PROJECTED_ROW_HEADER_CONF(self) -> float: + """confidence threshold for projected row header identified by table transformer""" + return self._get_float("TABLE_PROJECTED_ROW_HEADER_CONF", 0.5) + + @property + def TABLE_SPANNING_CELL_CONF(self) -> float: + """confidence threshold for table spanning cells identified by table transformer""" + return self._get_float("TABLE_SPANNING_CELL_CONF", 0.5) + + @property + def TABLE_IOB_THRESHOLD(self) -> float: + """minimum intersection over box area ratio for a box to be considered part of a larger box + it intersects""" + return self._get_float("TABLE_IOB_THRESHOLD", 0.5) + @property def LAYOUT_SAME_REGION_THRESHOLD(self) -> float: """threshold for two layouts' bounding boxes to be considered as the same region diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index bd1bc4ce..7d48c41b 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ 
-177,12 +177,13 @@ def get_class_map(data_type: str): structure_class_thresholds = { - "table": 0.5, - "table column": 0.5, - "table row": 0.5, - "table column header": 0.5, - "table projected row header": 0.5, - "table spanning cell": 0.5, + "table": inference_config.TT_TABLE_CONF, + "table column": inference_config.TABLE_COLUMN_CONF, + "table row": inference_config.TABLE_ROW_CONF, + "table column header": inference_config.TABLE_COLUMN_HEADER_CONF, + "table projected row header": inference_config.TABLE_PROJECTED_ROW_HEADER_CONF, + "table spanning cell": inference_config.TABLE_SPANNING_CELL_CONF, + # FIXME (yao) this parameter doesn't seem to be used at all in inference? Can we remove it "no object": 10, } @@ -282,8 +283,16 @@ def objects_to_structures(objects, tokens, class_thresholds): table_structures = [] for table in tables: - table_objects = [obj for obj in objects if iob(obj["bbox"], table["bbox"]) >= 0.5] - table_tokens = [token for token in tokens if iob(token["bbox"], table["bbox"]) >= 0.5] + table_objects = [ + obj + for obj in objects + if iob(obj["bbox"], table["bbox"]) >= inference_config.TABLE_IOB_THRESHOLD + ] + table_tokens = [ + token + for token in tokens + if iob(token["bbox"], table["bbox"]) >= inference_config.TABLE_IOB_THRESHOLD + ] structure = {} @@ -302,7 +311,7 @@ def objects_to_structures(objects, tokens, class_thresholds): for obj in rows: obj["column header"] = False for header_obj in column_headers: - if iob(obj["bbox"], header_obj["bbox"]) >= 0.5: + if iob(obj["bbox"], header_obj["bbox"]) >= inference_config.TABLE_IOB_THRESHOLD: obj["column header"] = True # Refine table structures @@ -478,7 +487,7 @@ def structure_to_cells(table_structure, tokens): spanning_cell_rect = Rect(list(spanning_cell["bbox"])) if ( spanning_cell_rect.intersect(cell_rect).get_area() / cell_rect.get_area() - ) > 0.5: + ) > inference_config.TABLE_IOB_THRESHOLD: cell["subcell"] = True break @@ -499,7 +508,9 @@ def structure_to_cells(table_structure, tokens): for subcell in subcells: subcell_rect = Rect(list(subcell["bbox"])) subcell_rect_area = subcell_rect.get_area() - if (subcell_rect.intersect(spanning_cell_rect).get_area() / subcell_rect_area) > 0.5: + if ( + subcell_rect.intersect(spanning_cell_rect).get_area() / subcell_rect_area + ) > inference_config.TABLE_IOB_THRESHOLD: if cell_rect is None: cell_rect = Rect(list(subcell["bbox"])) else: From 35ebea7968fe6a30cb3606b0486f8dfe6d833a54 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 22 Sep 2023 14:21:55 -0500 Subject: [PATCH 05/11] feat: add pre commit hook (#220) - add config yaml (copied from `unstructured` repo) - helps with dev's Quality of Life --- .pre-commit-config.yaml | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..88da6a35 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,38 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v4.3.0" + hooks: + - id: check-added-large-files + - id: check-toml + - id: check-yaml + - id: check-json + - id: check-xml + - id: end-of-file-fixer + exclude: \.json$ + include: \.py$ + - id: trailing-whitespace + - id: mixed-line-ending + + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + args: ["--line-length=100"] + language_version: python3 + + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: "v0.0.230" + hooks: + - id: ruff + args: + [ + "--fix", + 
"--select=I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402", + "--ignore=PT011,PT012,SIM117", + ] + + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + language_version: python3 From cb2aff2e31572f44147f69b7458eea548cd54a4f Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 25 Sep 2023 12:55:16 -0500 Subject: [PATCH 06/11] fix: padded boxes are not rescaled/shifted correctly (#229) --- CHANGELOG.md | 8 +- .../models/test_tables.py | 73 +++++++++++++++---- unstructured_inference/__version__.py | 2 +- unstructured_inference/models/tables.py | 4 +- 4 files changed, 67 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc9e79b6..3d70b588 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.3 + +* fix a bug where padded table structure bounding boxes are not shifted back into the original image coordinates correctly + ## 0.6.2 * move the confidence threshold for table transformer to config @@ -5,8 +9,8 @@ ## 0.6.1 * YoloX_quantized is now the default model. This models detects most diverse types and detect tables better than previous model. -* Since detection models tend to nest elements inside others(specifically in Tables), an algorithm has been added for reducing this - behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps +* Since detection models tend to nest elements inside others(specifically in Tables), an algorithm has been added for reducing this + behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps reduce duplicated content. * Add `source` property to our elements, so you can know where the information was generated (OCR or detection model) diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 585f17f1..fd95bcc4 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -1,6 +1,9 @@ import os +import numpy as np import pytest +import torch +from PIL import Image from transformers.models.table_transformer.modeling_table_transformer import ( TableTransformerDecoder, ) @@ -11,6 +14,18 @@ skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} +@pytest.fixture() +def table_transformer(): + table_model = tables.UnstructuredTableTransformerModel() + table_model.initialize(model="microsoft/table-transformer-structure-recognition") + return table_model + + +@pytest.fixture() +def example_image(): + return Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") + + @pytest.mark.parametrize( "model_path", [ @@ -328,13 +343,8 @@ def test_align_rows(rows, bbox, output): assert postprocess.align_rows(rows, bbox) == output -def test_table_prediction_tesseract(): - table_model = tables.UnstructuredTableTransformerModel() - from PIL import Image - - table_model.initialize(model="microsoft/table-transformer-structure-recognition") - img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") - prediction = table_model.predict(img) +def test_table_prediction_tesseract(table_transformer, example_image): + prediction = table_transformer.predict(example_image) # assert rows spans two rows are detected assert '
' in prediction # one of the safest rows to detect should be present @@ -351,28 +361,24 @@ def test_table_prediction_tesseract(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") -def test_table_prediction_paddle(monkeypatch): +def test_table_prediction_paddle(monkeypatch, example_image): monkeypatch.setenv("TABLE_OCR", "paddle") table_model = tables.UnstructuredTableTransformerModel() - from PIL import Image table_model.initialize(model="microsoft/table-transformer-structure-recognition") - img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") - prediction = table_model.predict(img) + prediction = table_model.predict(example_image) # Note(yuming): lossen paddle table prediction output test since performance issue # and results are different in different platforms (i.e., gpu vs cpu) assert len(prediction) -def test_table_prediction_invalid_table_ocr(monkeypatch): +def test_table_prediction_invalid_table_ocr(monkeypatch, example_image): monkeypatch.setenv("TABLE_OCR", "invalid_table_ocr") with pytest.raises(ValueError): table_model = tables.UnstructuredTableTransformerModel() - from PIL import Image table_model.initialize(model="microsoft/table-transformer-structure-recognition") - img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") - _ = table_model.predict(img) + _ = table_model.predict(example_image) def test_intersect(): @@ -581,3 +587,40 @@ def test_cells_to_html(): "cols
sub cell 1sub cell 2
" ) assert tables.cells_to_html(cells) == expected + + +def test_padded_results_has_right_dimensions(table_transformer, example_image): + str_class_name2idx = tables.get_class_map("structure") + # a simpler mapping so we keep all structure in the returned objs below for test + str_class_idx2name = {v: "table cell" for v in str_class_name2idx.values()} + # pad size is no more than 10% of the original image so we can setup test below easier + pad = int(min(example_image.size) / 10) + + structure = table_transformer.get_structure(example_image, pad_for_structure_detection=pad) + # boxes deteced OUTSIDE of the original image; this shouldn't happen but we want to make sure + # the code handles it as expected + structure["pred_boxes"][0][0, :2] = 0.5 + structure["pred_boxes"][0][0, 2:] = 1.0 + # mock a box we know are safly inside the original image with known positions + width, height = example_image.size + padded_width = width + pad * 2 + padded_height = height + pad * 2 + original = [1, 3, 101, 53] + structure["pred_boxes"][0][1, :] = torch.tensor( + [ + (51 + pad) / padded_width, + (28 + pad) / padded_height, + 100 / padded_width, + 50 / padded_height, + ], + ) + objs = tables.outputs_to_objects(structure, example_image.size, str_class_idx2name) + np.testing.assert_almost_equal(objs[0]["bbox"], [-pad, -pad, width + pad, height + pad], 4) + np.testing.assert_almost_equal(objs[1]["bbox"], original, 4) + # a more strict test would be to constrain the actual detected boxes to be within the original + # image but that requires the table transformer to behave in certain ways and do not + # actually test the padding math; so here we use the relaxed condition + for obj in objs[2:]: + x1, y1, x2, y2 = obj["bbox"] + assert max(x1, x2) < width + pad + assert max(y1, y2) < height + pad diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index f42820dd..1f6c2e4b 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.2" # pragma: no cover +__version__ = "0.6.3" # pragma: no cover diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 7d48c41b..d4885296 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -220,11 +220,11 @@ def outputs_to_objects(outputs, img_size, class_idx2name): pred_bboxes = outputs["pred_boxes"].detach().cpu()[0] pad = outputs.get("pad_for_structure_detection", 0) - scale_size = (img_size[0] + pad, img_size[1] + pad) + scale_size = (img_size[0] + pad * 2, img_size[1] + pad * 2) pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, scale_size)] # unshift the padding; padding effectively shifted the bounding boxes of structures in the # original image with half of the total pad - shift_size = pad / 2 + shift_size = pad objects = [] for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes): From c4d3e8b0bf4321caba817efd6e3384a573461aff Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 25 Sep 2023 15:33:05 -0500 Subject: [PATCH 07/11] feat: add autoscaling for table images (#210) Auto scale table images so that the text height is optimum for `tesseract` OCR inference. 
This functionality will scaling images where the estimated mean text height based on the `inference_config` setup: table images with text height below `inference_config.TESSERACT_MIN_TEXT_HEIGHT` or above `inference_config.TESSERACT_MAX_TEXT_HEIGHT` are scaled so that the text height is at `inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT`. This PR resolves [CORE-1863](https://unstructured-ai.atlassian.net/browse/CORE-1863) ## test - this PR adds a unit test to confirm auto scale is triggered - test the tokens computed without zoom and with zoom with the attached image: with zoom the tokens should include the correct text "Japanese" in the table on the page. Without zoom (call get_tokens using main) we won't see this token and instead you might find a token that look like "Inpanere". For this specific document it is best to set `TESSERACT_MIN_TEXT_HEIGHT` to 12. ![layout-parser-paper-with-table](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/7963bba0-67cb-48ee-b338-52b1c2620fc0) [CORE-1863]: https://unstructured-ai.atlassian.net/browse/CORE-1863?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ --- CHANGELOG.md | 5 ++ requirements/base.txt | 2 +- requirements/dev.txt | 4 +- requirements/test.in | 1 + requirements/test.txt | 6 ++- .../models/test_tables.py | 16 ++++++ unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 30 ++++++++++- unstructured_inference/models/tables.py | 54 +++++++++++++++++-- unstructured_inference/models/tesseract.py | 4 ++ 10 files changed, 113 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d70b588..49b29b4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.4 + +* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task +* add the new image auto scaling parameters to `config.py` + ## 0.6.3 * fix a bug where padded table structure bounding boxes are not shifted back into the original image coordinates correctly diff --git a/requirements/base.txt b/requirements/base.txt index 38ea4933..10836a40 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -44,7 +44,7 @@ humanfriendly==10.0 # via coloredlogs idna==3.4 # via requests -importlib-resources==6.0.1 +importlib-resources==6.1.0 # via matplotlib iopath==0.1.10 # via layoutparser diff --git a/requirements/dev.txt b/requirements/dev.txt index ff65ca27..f397245b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -101,7 +101,7 @@ importlib-metadata==6.8.0 # jupyterlab # jupyterlab-server # nbconvert -importlib-resources==6.0.1 +importlib-resources==6.1.0 # via # -c requirements/base.txt # jsonschema @@ -139,7 +139,7 @@ json5==0.9.14 # via jupyterlab-server jsonpointer==2.4 # via jsonschema -jsonschema[format-nongpl]==4.19.0 +jsonschema[format-nongpl]==4.19.1 # via # jupyter-events # jupyterlab-server diff --git a/requirements/test.in b/requirements/test.in index f3957593..d3846b25 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -12,6 +12,7 @@ flake8 flake8-docstrings mypy pytest-cov +pytest-mock pdf2image>=1.16.2 huggingface_hub>=0.11.1 ruff diff --git a/requirements/test.txt b/requirements/test.txt index ce238d69..195f680b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -97,9 +97,13 @@ pydocstyle==6.3.0 pyflakes==3.1.0 # via flake8 pytest==7.4.2 - # via pytest-cov + # via + # pytest-cov + # pytest-mock pytest-cov==4.1.0 # via -r 
requirements/test.in +pytest-mock==3.11.1 + # via -r requirements/test.in pyyaml==6.0.1 # via # -c requirements/base.txt diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index fd95bcc4..c6d3371a 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -1,4 +1,5 @@ import os +from pathlib import Path import numpy as np import pytest @@ -589,6 +590,21 @@ def test_cells_to_html(): assert tables.cells_to_html(cells) == expected +def test_auto_zoom(mocker): + spy = mocker.spy(tables, "zoom_image") + model = tables.UnstructuredTableTransformerModel() + model.initialize("microsoft/table-transformer-structure-recognition") + image = Image.open( + Path(os.path.dirname(os.path.abspath(__file__))) + / ".." + / ".." + / "sample-docs" + / "layout-parser-paper-fast.jpg", + ) + model.get_tokens(image) + assert spy.call_count == 1 + + def test_padded_results_has_right_dimensions(table_transformer, example_image): str_class_name2idx = tables.get_class_map("structure") # a simpler mapping so we keep all structure in the returned objs below for test diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 1f6c2e4b..4bf914ad 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.3" # pragma: no cover +__version__ = "0.6.4" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py index e8f287e9..aca2550c 100644 --- a/unstructured_inference/config.py +++ b/unstructured_inference/config.py @@ -44,7 +44,35 @@ def TABLE_IMAGE_BACKGROUND_PAD(self) -> int: The padding adds NO image data around an identified table bounding box; it simply adds white background around the image """ - return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0) + return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 20) + + @property + def TESSERACT_MIN_TEXT_HEIGHT(self) -> int: + """minimum text height acceptable from tesseract OCR results + + if estimated text height from tesseract OCR results is lower than this value the image is + scaled up to be processed again + """ + return self._get_int("TESSERACT_MIN_TEXT_HEIGHT", 12) + + @property + def TESSERACT_MAX_TEXT_HEIGHT(self) -> int: + """maximum text height acceptable from tesseract OCR results + + if estimated text height from tesseract OCR results is higher than this value the image is + scaled down to be processed again + """ + return self._get_int("TESSERACT_MAX_TEXT_HEIGHT", 100) + + @property + def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: + """optimum text height for tesseract OCR""" + return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) + + @property + def TESSERACT_TEXT_HEIGHT_QUANTILE(self) -> float: + """the quantile to check for text height""" + return self._get_float("TESSERACT_TEXT_HEIGHT_QUANTILE", 0.5) @property def TT_TABLE_CONF(self) -> float: diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index d4885296..4a68e3d2 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import List, Optional, Union +import cv2 import numpy as np import pandas as pd import pytesseract @@ -17,6 +18,9 @@ from unstructured_inference.config import inference_config from unstructured_inference.logger import logger from unstructured_inference.models.table_postprocess 
import Rect +from unstructured_inference.models.tesseract import ( + TESSERACT_TEXT_HEIGHT, +) from unstructured_inference.models.unstructuredmodel import UnstructuredModel from unstructured_inference.utils import pad_image_with_background_color @@ -79,23 +83,45 @@ def get_tokens(self, x: Image): ymax = max([i[1] for i in line[0]]) tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]}) else: + zoom = 1 + logger.info("Processing table OCR with tesseract...") ocr_df: pd.DataFrame = pytesseract.image_to_data( x, output_type="data.frame", ) - ocr_df = ocr_df.dropna() + # tesseract performance degrades when the text height is out of the preferred zone so we + # zoom the image (in or out depending on estimated text height) for optimum OCR results + # but this needs to be evaluated based on actual use case as the optimum scaling also + # depend on type of characters (font, language, etc); be careful about this + # functionality + text_height = ocr_df[TESSERACT_TEXT_HEIGHT].quantile( + inference_config.TESSERACT_TEXT_HEIGHT_QUANTILE, + ) + if ( + text_height < inference_config.TESSERACT_MIN_TEXT_HEIGHT + or text_height > inference_config.TESSERACT_MAX_TEXT_HEIGHT + ): + # rounding avoids unnecessary precision and potential numerical issues assocaited + # with numbers very close to 1 inside cv2 image processing + zoom = np.round(inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1) + ocr_df = pytesseract.image_to_data( + zoom_image(x, zoom), + output_type="data.frame", + ) + ocr_df = ocr_df.dropna() + tokens = [] for idtx in ocr_df.itertuples(): tokens.append( { "bbox": [ - idtx.left, - idtx.top, - idtx.left + idtx.width, - idtx.top + idtx.height, + idtx.left / zoom, + idtx.top / zoom, + (idtx.left + idtx.width) / zoom, + (idtx.top + idtx.height) / zoom, ], "text": idtx.text, }, @@ -688,3 +714,21 @@ def cells_to_html(cells): tcell.text = cell["cell text"] return str(ET.tostring(table, encoding="unicode", short_empty_elements=False)) + + +def zoom_image(image: Image, zoom: float) -> Image: + """scale an image based on the zoom factor using cv2; the scaled image is post processed by + dilation then erosion to improve edge sharpness for OCR tasks""" + new_image = cv2.resize( + cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR), + None, + fx=zoom, + fy=zoom, + interpolation=cv2.INTER_CUBIC, + ) + + kernel = np.ones((1, 1), np.uint8) + new_image = cv2.dilate(new_image, kernel, iterations=1) + new_image = cv2.erode(new_image, kernel, iterations=1) + + return Image.fromarray(new_image) diff --git a/unstructured_inference/models/tesseract.py b/unstructured_inference/models/tesseract.py index 56bf8e5a..e6f599cc 100644 --- a/unstructured_inference/models/tesseract.py +++ b/unstructured_inference/models/tesseract.py @@ -16,6 +16,10 @@ os.environ["OMP_THREAD_LIMIT"] = "1" +# this field is defined by pytesseract/unstructured.pytesseract +TESSERACT_TEXT_HEIGHT = "height" + + def load_agent(languages: str = "eng"): """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once. From f4236c8b271a4d0bca3a9411a405001b64f604b6 Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Tue, 26 Sep 2023 11:05:15 -0600 Subject: [PATCH 08/11] Fix/pdf miner source property (#228) This PR adds three possible values for `source` field: * `pdfminer` as source for elements directly obtained from PDFs. * `OCR-tesseract` and `OCR-paddle` for elements obtained with the respective OCR engines. 
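For example, a caller could use these tags to filter a parsed page by provenance. The sketch below is illustrative only; the `from_file` arguments simply mirror the new `test_ocr_source` test added in this PR, and the filtering lines are example usage rather than code from this change:

```python
# Illustrative usage only: split page elements by where they came from.
from unstructured_inference.constants import Source
from unstructured_inference.inference.layout import DocumentLayout
from unstructured_inference.models.base import get_model

doc = DocumentLayout.from_file(
    "sample-docs/loremipsum-flat.pdf",
    get_model("yolox_tiny"),
    ocr_strategy="force",
)
elements = doc.pages[0].elements
pdfminer_elements = [el for el in elements if el.source == Source.PDFMINER]
ocr_elements = [
    el for el in elements if el.source in (Source.OCR_TESSERACT, Source.OCR_PADDLE)
]
```
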
All those new values are stored in a new class `Source` in unstructured_inference>constants.py This would help users filter certain elements depending on how were obtained. --- CHANGELOG.md | 5 +++++ .../inference/test_layout.py | 17 ++++++++++++++++- .../inference/test_layout_element.py | 4 ++-- unstructured_inference/__version__.py | 2 +- unstructured_inference/constants.py | 10 ++++++++++ unstructured_inference/inference/elements.py | 3 ++- unstructured_inference/inference/layout.py | 15 +++++++++++---- .../inference/layoutelement.py | 16 +++++++++++----- unstructured_inference/models/detectron2onnx.py | 3 ++- unstructured_inference/models/yolox.py | 3 ++- 10 files changed, 62 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49b29b4f..7748867f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.5-dev0 + +* Fix `source` property for elements generated by pdfminer. +* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR. + ## 0.6.4 * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 42bba9a2..b2b665bd 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -10,9 +10,10 @@ from PIL import Image import unstructured_inference.models.base as models -from unstructured_inference.constants import OCRMode +from unstructured_inference.constants import OCRMode, Source from unstructured_inference.inference import elements, layout, layoutelement from unstructured_inference.models import chipper, detectron2, tesseract +from unstructured_inference.models.base import get_model from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, UnstructuredObjectDetectionModel, @@ -117,6 +118,19 @@ def detect(self, *args): assert elements.ocr(text_block, image=image) == "" +def test_ocr_source(): + file = "sample-docs/loremipsum-flat.pdf" + model = get_model("yolox_tiny") + doc = layout.DocumentLayout.from_file( + file, + model, + ocr_mode=OCRMode.FULL_PAGE.value, + supplement_with_ocr_elements=True, + ocr_strategy="force", + ) + assert Source.OCR_TESSERACT in {e.source for e in doc.pages[0].elements} + + class MockLayoutModel: def __init__(self, layout): self.layout_return = layout @@ -678,6 +692,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected): @pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"]) def test_load_pdf(filename): layouts, images = layout.load_pdf(f"sample-docs/{filename}") + assert Source.PDFMINER in {e.source for e in layouts[0]} assert len(layouts) for lo in layouts: assert len(lo) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 59727b62..0991a364 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -2,7 +2,7 @@ from layoutparser.elements import TextBlock from layoutparser.elements.layout_elements import Rectangle as LPRectangle -from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR, Source from unstructured_inference.inference.elements import TextRegion from 
unstructured_inference.inference.layoutelement import ( LayoutElement, @@ -166,7 +166,7 @@ def test_layout_element_from_lp_textblock(): 300, 300, text="Sample Text", - source="detectron2_lp", + source=Source.DETECTRON2_LP, type="Text", prob=0.99, ) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 4bf914ad..530da768 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.4" # pragma: no cover +__version__ = "0.6.5-dev0" # pragma: no cover diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index c6c20299..78c46379 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -11,5 +11,15 @@ class AnnotationResult(Enum): PLOT = "plot" +class Source(Enum): + YOLOX = "yolox" + DETECTRON2_ONNX = "detectron2_onnx" + DETECTRON2_LP = "detectron2_lp" + OCR_TESSERACT = "OCR-tesseract" + OCR_PADDLE = "OCR-paddle" + PDFMINER = "pdfminer" + MERGED = "merged" + + SUBREGION_THRESHOLD_FOR_OCR = 0.5 FULL_PAGE_REGION_THRESHOLD = 0.99 diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 1b965cda..67f78216 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from unstructured_inference.config import inference_config +from unstructured_inference.constants import Source from unstructured_inference.logger import logger from unstructured_inference.math import safe_division from unstructured_inference.models import tesseract @@ -197,7 +198,7 @@ def intersections(*rects: Rectangle): @dataclass class TextRegion(Rectangle): text: Optional[str] = None - source: Optional[str] = None + source: Optional[Source] = None def __str__(self) -> str: return str(self.text) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 2cdba3bc..1ef60c30 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -13,7 +13,7 @@ from PIL import Image, ImageSequence from pytesseract import Output -from unstructured_inference.constants import OCRMode +from unstructured_inference.constants import OCRMode, Source from unstructured_inference.inference.elements import ( EmbeddedTextRegion, ImageTextRegion, @@ -677,7 +677,14 @@ def load_pdf( else: continue - text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text) + text_region = element_class( + x1 * coef, + y1 * coef, + x2 * coef, + y2 * coef, + text=_text, + source=Source.PDFMINER, + ) if text_region.area > 0: layout.append(text_region) @@ -738,7 +745,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: (x1, y1, x2, y2) = l, t, l + w, t + h text = ocr_data["text"][i] if text: - text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR") + text_region = TextRegion(x1, y1, x2, y2, text=text, source=Source.OCR_TESSERACT) text_regions.append(text_region) return text_regions @@ -774,7 +781,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]: y2 = max([i[1] for i in line[0]]) text = line[1][0] if text: - text_region = TextRegion(x1, y1, x2, y2, text) + text_region = TextRegion(x1, y1, x2, y2, text, source=Source.OCR_PADDLE) text_regions.append(text_region) return text_regions diff --git a/unstructured_inference/inference/layoutelement.py 
b/unstructured_inference/inference/layoutelement.py index 887ecc33..f909c93a 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -9,7 +9,11 @@ from PIL import Image from unstructured_inference.config import inference_config -from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import ( + FULL_PAGE_REGION_THRESHOLD, + SUBREGION_THRESHOLD_FOR_OCR, + Source, +) from unstructured_inference.inference.elements import ( ImageTextRegion, Rectangle, @@ -74,7 +78,7 @@ def from_lp_textblock(cls, textblock: TextBlock): text = textblock.text type = textblock.type prob = textblock.score - return cls(x1, y1, x2, y2, text=text, source="detectron2_lp", type=type, prob=prob) + return cls(x1, y1, x2, y2, text=text, source=Source.DETECTRON2_LP, type=type, prob=prob) def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str: @@ -311,8 +315,10 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: merged_text = " ".join([tr.text for tr in regions if tr.text]) sources = [*{tr.source for tr in regions}] - source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore - return TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) + source = sources.pop() if len(sources) == 1 else Source.MERGED + element = TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) + setattr(element, "merged_sources", sources) + return element def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: @@ -332,7 +338,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE r.x2, r.y2, text=r.text, - source=None, + source=r.source, type="UncategorizedText", ) for r in merged_regions diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index 7b9df081..e8b75d1c 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -9,6 +9,7 @@ from onnxruntime.quantization import QuantType, quantize_dynamic from PIL import Image +from unstructured_inference.constants import Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger, logger_onnx from unstructured_inference.models.unstructuredmodel import ( @@ -158,7 +159,7 @@ def postprocess( text=None, type=detected_class, prob=conf, - source="detectron2_onnx", + source=Source.DETECTRON2_ONNX, ) regions.append(region) diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index f5103698..e239a9b6 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -14,6 +14,7 @@ from onnxruntime.quantization import QuantType, quantize_dynamic from PIL import Image +from unstructured_inference.constants import Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel @@ -149,7 +150,7 @@ def image_processing( text=None, type=detected_class, prob=prob, - source="yolox", + source=Source.YOLOX, ) regions.append(region) From 00b493631bb9b20f2f8ee6b043577c0f02372e7d Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 26 Sep 2023 11:26:04 -0700 Subject: [PATCH 09/11] Feat/219 keep 
extracted image elements (#225) update `merge_inferred_layout_with_extracted_layout` to keep extracted image elements --- CHANGELOG.md | 1 + examples/image-extraction/requirements.txt | 1 - .../README.md | 0 .../embedded-image-extraction.py | 0 examples/image_extraction/requirements.txt | 3 +++ unstructured_inference/inference/layout.py | 2 +- .../inference/layoutelement.py | 23 ++++++++++++++----- 7 files changed, 22 insertions(+), 8 deletions(-) delete mode 100644 examples/image-extraction/requirements.txt rename examples/{image-extraction => image_extraction}/README.md (100%) rename examples/{image-extraction => image_extraction}/embedded-image-extraction.py (100%) create mode 100644 examples/image_extraction/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 7748867f..da608a15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## 0.6.4 +* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task * add the new image auto scaling parameters to `config.py` diff --git a/examples/image-extraction/requirements.txt b/examples/image-extraction/requirements.txt deleted file mode 100644 index 0d7e9b7d..00000000 --- a/examples/image-extraction/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unstructured-inference \ No newline at end of file diff --git a/examples/image-extraction/README.md b/examples/image_extraction/README.md similarity index 100% rename from examples/image-extraction/README.md rename to examples/image_extraction/README.md diff --git a/examples/image-extraction/embedded-image-extraction.py b/examples/image_extraction/embedded-image-extraction.py similarity index 100% rename from examples/image-extraction/embedded-image-extraction.py rename to examples/image_extraction/embedded-image-extraction.py diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt new file mode 100644 index 00000000..351b4120 --- /dev/null +++ b/examples/image_extraction/requirements.txt @@ -0,0 +1,3 @@ +unstructured-inference +pymupdf +pypdf2 \ No newline at end of file diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 1ef60c30..d1fb11c3 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -441,7 +441,7 @@ def annotate( width = style["width"] for region in getattr(self, attribute): if isinstance(region, Rectangle): - required_source = getattr(el, "source", None) + required_source = getattr(region, "source", None) if "all" in sources or required_source in sources: img = draw_bbox( img, diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index f909c93a..f3f3343f 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -148,13 +148,24 @@ def merge_inferred_layout_with_extracted_layout( ) if same_bbox: # Looks like these represent the same region - grow_region_to_match_region(inferred_region, extracted_region) - inferred_region.text = extracted_region.text - region_matched = True - elif extracted_is_subregion_of_inferred and inferred_is_text and extracted_is_image: - grow_region_to_match_region(inferred_region, extracted_region) - region_matched = True + if extracted_is_image: + # keep extracted region, remove inferred region + 
inferred_regions_to_remove.append(inferred_region) + else: + # keep inferred region, remove extracted region + grow_region_to_match_region(inferred_region, extracted_region) + inferred_region.text = extracted_region.text + region_matched = True + elif extracted_is_subregion_of_inferred and inferred_is_text: + if extracted_is_image: + # keep both extracted and inferred regions + region_matched = False + else: + # keep inferred region, remove extracted region + grow_region_to_match_region(inferred_region, extracted_region) + region_matched = True elif either_region_is_subregion_of_other and inferred_region.type != "Table": + # keep extracted region, remove inferred region inferred_regions_to_remove.append(inferred_region) if not region_matched: extracted_elements_to_add.append(extracted_region) From 12ca9d91c6b1a066d94fc2858b904a62bf3ee814 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 26 Sep 2023 17:37:23 -0700 Subject: [PATCH 10/11] chore: changelog fix, cut release 0.6.5 (#230) --- CHANGELOG.md | 4 ++-- unstructured_inference/__version__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da608a15..96a632f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ -## 0.6.5-dev0 +## 0.6.5 +* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * Fix `source` property for elements generated by pdfminer. * Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR. ## 0.6.4 -* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task * add the new image auto scaling parameters to `config.py` diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 530da768..36225a4c 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5-dev0" # pragma: no cover +__version__ = "0.6.5" # pragma: no cover From cf15726a99db843bc6dcab4849577eca51d08af3 Mon Sep 17 00:00:00 2001 From: Yuming Long <63475068+yuming-long@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:46:19 -0400 Subject: [PATCH 11/11] chore: stop passing language code from tesseract mapping to paddle (#226) ### Summary A user is flagging the assertion error for paddle language code: ``` AssertionError: param lang must in dict_keys(['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'latin', 'arabic', 'cyrillic', 'devanagari']), but got eng ``` and tried setting the `ocr_languages` param to 'en' (the correct lang code for english in paddle) but also didn't work. The reason is that the `ocr_languages` uses the mapping for tesseract code which will convert `en` to `eng` since thats the correct lang code for english in tesseract. The quick workaround here is stop passing the lang code to paddle and let it use default `en`, and this will be addressed once we have the lang code mapping for paddle. 
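As a rough illustration of what that future fix could look like (the entries below are hypothetical examples, not a vetted or complete table):

```python
# Hypothetical tesseract -> paddle language-code mapping; entries are examples
# only and would need to be verified against both engines' supported codes.
TESSERACT_TO_PADDLE_LANG = {
    "eng": "en",
    "chi_sim": "ch",
    "chi_tra": "chinese_cht",
    "kor": "korean",
    "jpn": "japan",
    "ara": "arabic",
}


def to_paddle_lang(tesseract_code: str, default: str = "en") -> str:
    """Map a tesseract language code to a paddle code, falling back to English."""
    return TESSERACT_TO_PADDLE_LANG.get(tesseract_code, default)
```
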
### Test looks like user used this branch and got the lang parameter working from [linked comments](https://github.com/Unstructured-IO/unstructured-api/issues/247#issuecomment-1731923667) :) on api repo: ``` pip install paddlepaddle pip install "unstructured.PaddleOCR" export ENTIRE_PAGE_OCR=paddle make run-web-app ``` * check error before this change: ``` curl -X 'POST' 'http://localhost:8000/general/v0/general' -H 'accept: application/json' -F 'files=@sample-docs/english-and-korean.png' -F 'ocr_languages=en' | jq -C . | less -R ``` will see the error: ``` { "detail": "param lang must in dict_keys(['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'latin', 'arabic', 'cyrillic', 'devanagari']), but got eng" } ``` also in logger you will see `INFO Loading paddle with CPU on language=eng...` since tesseract mapping converts `en` to `eng`. * check after this change: Checkout to this branch and install inference repo into your env (the same env thats running api) with `pip install -e .` Rerun `make run-web-app` Run the curl command again, you won't get the result on m1 chip since paddle doesn't work on it but from the logger info you can see `2023-09-27 12:48:48,120 unstructured_inference INFO Loading paddle with CPU on language=en...`, which means the lang parameter is using default `en` (logger info is coming from [this line](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/paddle_ocr.py#L22)). --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com> --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- unstructured_inference/inference/layout.py | 7 +++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96a632f3..2277974d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.6 + +* Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until +we have the mapping from standard language code to paddle language code. ## 0.6.5 * Add functionality to keep extracted image elements while merging inferred layout with extracted layout diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 36225a4c..37b46218 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5" # pragma: no cover +__version__ = "0.6.6" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index d1fb11c3..447e7154 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -275,12 +275,11 @@ def get_elements_with_detection_model( ) if entrie_page_ocr == "paddle": - logger.info("Processing entrie page OCR with paddle...") + logger.info("Processing entire page OCR with paddle...") from unstructured_inference.models import paddle_ocr - # TODO(yuming): paddle only support one language at once, - # change ocr to tesseract if passed in multilanguages. - ocr_data = paddle_ocr.load_agent(language=self.ocr_languages).ocr( + # TODO(yuming): pass ocr language to paddle when we have language mapping for paddle + ocr_data = paddle_ocr.load_agent().ocr( np.array(self.image), cls=True, )
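
Taken together, the knobs introduced across this release series are all read from the environment by `inference_config`, so a deployment can tune them without code changes. A minimal sketch, with arbitrary example values:

```python
import os

from unstructured_inference.config import inference_config

# Illustrative only: each config property reads os.environ when it is accessed,
# so values set any time before the first read take effect.
os.environ["TABLE_IMAGE_CROP_PAD"] = "20"        # extra pixels kept around detected tables
os.environ["TT_TABLE_CONF"] = "0.6"              # table transformer "table" confidence threshold
os.environ["TESSERACT_MIN_TEXT_HEIGHT"] = "15"   # below this, table crops are scaled up before OCR

assert inference_config.TABLE_IMAGE_CROP_PAD == 20
assert inference_config.TT_TABLE_CONF == 0.6
assert inference_config.TESSERACT_MIN_TEXT_HEIGHT == 15
```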