From bfdf35798762bd656acef2ef9ade0e9c780118e0 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Wed, 20 Sep 2023 22:53:28 -0700 Subject: [PATCH 01/11] chore: changelog repair (#221) --- CHANGELOG.md | 8 ++------ unstructured_inference/__version__.py | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a07c7be..3eb5b33d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,11 @@ +## 0.5.32-dev0 + ## 0.5.31 * Add functionality to extract and save images from the page * Add functionality to get only "true" embedded images when extracting elements from PDF pages * Update the layout visualization script to be able to show only image elements if need - -## 0.5.30 - * add an evaluation metric for table comparison based on token similarity - -## 0.5.29-dev0 - * fix paddle unit tests where `make test` fails since paddle doesn't work on M1/M2 chip locally ## 0.5.28 diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 68f3020f..09d2007b 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.5.31" # pragma: no cover +__version__ = "0.5.32-dev0" # pragma: no cover From 5e73202530d1faa995652144cece8aa8804decf2 Mon Sep 17 00:00:00 2001 From: Yao You Date: Thu, 21 Sep 2023 08:55:38 -0500 Subject: [PATCH 02/11] feat: add config class (#218) - add a dataclass that contains configurations for inference processes - the parameters can be specified via env variables, which override the default values; this allows for flexible setup in different applications/deployments - currently contains specifications for table and layout related parameters - followup needed to identify other parameters that can be added to this config class --- CHANGELOG.md | 5 +- test_unstructured_inference/test_config.py | 11 +++ test_unstructured_inference/test_utils.py | 2 +- unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 82 +++++++++++++++++++ unstructured_inference/inference/elements.py | 14 ++-- .../inference/layoutelement.py | 7 +- unstructured_inference/models/tables.py | 13 ++- unstructured_inference/utils.py | 6 +- 9 files changed, 122 insertions(+), 20 deletions(-) create mode 100644 test_unstructured_inference/test_config.py create mode 100644 unstructured_inference/config.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3eb5b33d..4edd4d73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,7 @@ -## 0.5.32-dev0 +## 0.6.0 + +* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environment variables +* update behavior of `pad_image_with_background_color` so that input `pad` is applied to all sides ## 0.5.31 diff --git a/test_unstructured_inference/test_config.py b/test_unstructured_inference/test_config.py new file mode 100644 index 00000000..1a56c9ff --- /dev/null +++ b/test_unstructured_inference/test_config.py @@ -0,0 +1,11 @@ +def test_default_config(): + from unstructured_inference.config import inference_config + + assert inference_config.TABLE_IMAGE_CROP_PAD == 12 + + +def test_env_override(monkeypatch): + monkeypatch.setenv("TABLE_IMAGE_CROP_PAD", 1) + from unstructured_inference.config import inference_config + + assert inference_config.TABLE_IMAGE_CROP_PAD == 1 diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py index 29e6c13e..bc30c0e5 100644 --- a/test_unstructured_inference/test_utils.py +++ 
b/test_unstructured_inference/test_utils.py @@ -135,7 +135,7 @@ def test_annotate_layout_elements_with_plot_result(): def test_pad_image_with_background_color(mock_pil_image): pad = 10 height, width = mock_pil_image.size - padded = pad_image_with_background_color(mock_pil_image, pad * 2, "black") + padded = pad_image_with_background_color(mock_pil_image, pad, "black") assert padded.size == (height + 2 * pad, width + 2 * pad) np.testing.assert_array_almost_equal( np.array(padded.crop((pad, pad, width + pad, height + pad))), diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 09d2007b..3b3ca6c4 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.5.32-dev0" # pragma: no cover +__version__ = "0.6.0" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py new file mode 100644 index 00000000..5f1f90f3 --- /dev/null +++ b/unstructured_inference/config.py @@ -0,0 +1,82 @@ +""" +This module contains variables that are permitted to be tweaked by the system environment. For +example, model parameters that change the output of an inference call. Constants do NOT belong in +this module. Constants are values that are usually names for common options (e.g., color names) or +settings that should not be altered without making a code change (e.g., definition of 1Gb of memory +in bytes). Constants should go into `./constants.py` +""" +import os +from dataclasses import dataclass + + +@dataclass +class InferenceConfig: +    """class for configuring inference parameters""" + +    def _get_string(self, var: str, default_value: str = "") -> str: +        """attempt to get the value of var from the os environment; if not present return the +        default_value""" +        return os.environ.get(var, default_value) + +    def _get_int(self, var: str, default_value: int) -> int: +        if value := self._get_string(var): +            return int(value) +        return default_value + +    def _get_float(self, var: str, default_value: float) -> float: +        if value := self._get_string(var): +            return float(value) +        return default_value + +    @property +    def TABLE_IMAGE_CROP_PAD(self) -> int: +        """extra image content to add around an identified table region; measured in pixels + +        The padding adds image data around an identified table bounding box for downstream table +        structure detection model use as input +        """ +        return self._get_int("TABLE_IMAGE_CROP_PAD", 12) + +    @property +    def TABLE_IMAGE_BACKGROUND_PAD(self) -> int: +        """number of pixels to pad around a table image with a white background color + +        The padding adds NO image data around an identified table bounding box; it simply adds white +        background around the image +        """ +        return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0) + +    @property +    def LAYOUT_SAME_REGION_THRESHOLD(self) -> float: +        """threshold for two layouts' bounding boxes to be considered as the same region + +        When the intersection area over union area of the two is larger than this threshold the two +        boxes are considered the same region +        """ +        return self._get_float("LAYOUT_SAME_REGION_THRESHOLD", 0.75) + +    @property +    def LAYOUT_SUBREGION_THRESHOLD(self) -> float: +        """threshold for one bounding box to be considered as a sub-region of another bounding box + +        When the intersection region area divided by self area is larger than this threshold self is +        considered a subregion of the other +        """ +        return self._get_float("LAYOUT_SUBREGION_THRESHOLD", 0.75) + +    @property +    def 
ELEMENTS_H_PADDING_COEF(self) -> float: + """When extending the boundaries of a PDF object for the purpose of determining which other + elements should be considered in the same text region, we use a relative distance based on + some fraction of the block height (typically character height). This is the fraction used + for the horizontal extension applied to the left and right sides. + """ + return self._get_float("ELEMENTS_H_PADDING_COEF", 0.4) + + @property + def ELEMENTS_V_PADDING_COEF(self) -> float: + """Same as ELEMENTS_H_PADDING_COEF but the vertical extension.""" + return self._get_float("ELEMENTS_V_PADDING_COEF", 0.3) + + +inference_config = InferenceConfig() diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 69ea4c19..66cf7e26 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -11,18 +11,11 @@ from PIL import Image from scipy.sparse.csgraph import connected_components +from unstructured_inference.config import inference_config from unstructured_inference.logger import logger from unstructured_inference.math import safe_division from unstructured_inference.models import tesseract -# When extending the boundaries of a PDF object for the purpose of determining which other elements -# should be considered in the same text region, we use a relative distance based on some fraction of -# the block height (typically character height). This is the fraction used for the horizontal -# extension applied to the left and right sides. -H_PADDING_COEF = 0.4 -# Same as above but the vertical extension. -V_PADDING_COEF = 0.3 - @dataclass class Rectangle: @@ -156,7 +149,10 @@ def partition_groups_from_regions(regions: Collection[Rectangle]) -> List[List[R """Partitions regions into groups of regions based on proximity. 
Returns list of lists of regions, each list corresponding with a group""" padded_regions = [ - r.vpad(r.height * V_PADDING_COEF).hpad(r.height * H_PADDING_COEF) for r in regions + r.vpad(r.height * inference_config.ELEMENTS_V_PADDING_COEF).hpad( + r.height * inference_config.ELEMENTS_H_PADDING_COEF, + ) + for r in regions ] intersection_mtx = intersections(*padded_regions) diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index 5e00388d..c91bb5bc 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -8,6 +8,7 @@ from pandas import DataFrame from PIL import Image +from unstructured_inference.config import inference_config from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR from unstructured_inference.inference.elements import ( ImageTextRegion, @@ -79,7 +80,7 @@ def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str: tables.load_agent() if tables.tables_agent is None: raise RuntimeError("Unable to load table extraction agent.") - padded_block = text_block.pad(12) + padded_block = text_block.pad(inference_config.TABLE_IMAGE_CROP_PAD) cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2)) return tables.tables_agent.predict(cropped_image) @@ -90,8 +91,8 @@ def merge_inferred_layout_with_extracted_layout( page_image_size: tuple, ocr_layout: Optional[List[TextRegion]] = None, supplement_with_ocr_elements: bool = True, - same_region_threshold: float = 0.75, - subregion_threshold: float = 0.75, + same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD, + subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD, ) -> List[LayoutElement]: """Merge two layouts to produce a single layout.""" extracted_elements_to_add: List[TextRegion] = [] diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 0c79e8cd..bd1bc4ce 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -14,6 +14,7 @@ from PIL import Image from transformers import DetrImageProcessor, TableTransformerForObjectDetection +from unstructured_inference.config import inference_config from unstructured_inference.logger import logger from unstructured_inference.models.table_postprocess import Rect from unstructured_inference.models.unstructuredmodel import UnstructuredModel @@ -113,7 +114,11 @@ def get_tokens(self, x: Image): return tokens - def get_structure(self, x: Image, pad_for_structure_detection: int = 50) -> dict: + def get_structure( + self, + x: Image, + pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD, + ) -> dict: """get the table structure as a dictionary contaning different types of elements as key-value pairs; check table-transformer documentation for more information""" with torch.no_grad(): @@ -126,7 +131,11 @@ def get_structure(self, x: Image, pad_for_structure_detection: int = 50) -> dict outputs_structure["pad_for_structure_detection"] = pad_for_structure_detection return outputs_structure - def run_prediction(self, x: Image, pad_for_structure_detection: int = 50): + def run_prediction( + self, + x: Image, + pad_for_structure_detection: int = inference_config.TABLE_IMAGE_BACKGROUND_PAD, + ): """Predict table structure""" outputs_structure = self.get_structure(x, pad_for_structure_detection) tokens = self.get_tokens(x=x) diff 
--git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index 9235ce68..f21daf8f 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -130,7 +130,7 @@ def pad_image_with_background_color( pad: int = 10, background_color: str = "white", ) -> Image.Image: - """pads an input image with the same background color around it by pad//2 on all 4 sides + """pads an input image with the same background color around it by pad on all 4 sides The original image is kept intact and a new image is returned with padding added. """ @@ -139,6 +139,6 @@ def pad_image_with_background_color( raise ValueError( "Can not pad an image with negative space! Please use a positive value for `pad`.", ) - new = Image.new(image.mode, (width + pad, height + pad), background_color) - new.paste(image, (pad // 2, pad // 2)) + new = Image.new(image.mode, (width + pad * 2, height + pad * 2), background_color) + new.paste(image, (pad, pad)) return new From eaa8d65c69b4aab2c69545e3b556865bbb1d9ddd Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Thu, 21 Sep 2023 10:02:40 -0600 Subject: [PATCH 03/11] Fix/nested bounding boxes (#201) This PR implements two major changes: * Replaces detectron2 with Yolox_quantized as the default model * Introduces an algorithm for reducing nested elements detected by any model. As a benefit of these, it is now possible to detect more diverse element types. * Adds a property to `Rectangle` class to register the origin of the data. * Adds functionality to the `annotate` function to skip elements of certain origins * Adds functionality to the `annotate` function to print additional details of bounding boxes * Test updates --------- Co-authored-by: Alan Bertl Co-authored-by: qued <64741807+qued@users.noreply.github.com> Co-authored-by: Yao You --- CHANGELOG.md | 8 ++ test_unstructured_inference/conftest.py | 9 +- .../inference/test_layout.py | 93 +++++++++---- .../inference/test_layout_element.py | 25 +++- .../models/test_model.py | 76 ++++++++++- .../models/test_yolox.py | 13 +- test_unstructured_inference/test_elements.py | 61 ++++++++- .../test_visualization.py | 27 +--- unstructured_inference/__version__.py | 2 +- unstructured_inference/inference/elements.py | 18 ++- unstructured_inference/inference/layout.py | 31 ++++- .../inference/layoutelement.py | 61 ++++++++- unstructured_inference/models/base.py | 2 +- .../models/detectron2onnx.py | 1 + .../models/unstructuredmodel.py | 129 +++++++++++++++++- unstructured_inference/models/yolox.py | 28 ++-- unstructured_inference/utils.py | 14 +- unstructured_inference/visualize.py | 77 ++++------- 18 files changed, 506 insertions(+), 169 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4edd4d73..5898e932 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.6.1 + +* YoloX_quantized is now the default model. This model detects more diverse types and detects tables better than the previous model. +* Since detection models tend to nest elements inside others (specifically in Tables), an algorithm has been added for reducing this + behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps + reduce duplicated content.
+* Add `source` property to our elements, so you can know where the information was generated (OCR or detection model) + ## 0.6.0 * add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environement variables diff --git a/test_unstructured_inference/conftest.py b/test_unstructured_inference/conftest.py index 761a0492..c20caece 100644 --- a/test_unstructured_inference/conftest.py +++ b/test_unstructured_inference/conftest.py @@ -28,7 +28,7 @@ def mock_text_region(): @pytest.fixture() def mock_layout_element(): - return LayoutElement(100, 100, 300, 300, text="Sample text", type="Text") + return LayoutElement(100, 100, 300, 300, text="Sample text", source=None, type="Text") @pytest.fixture() @@ -110,9 +110,9 @@ def mock_embedded_text_regions(): @pytest.fixture() def mock_ocr_regions(): return [ - EmbeddedTextRegion(10, 10, 90, 90, "0"), - EmbeddedTextRegion(200, 200, 300, 300, "1"), - EmbeddedTextRegion(500, 320, 600, 350, "3"), + EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None), + EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None), + EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None), ] @@ -141,6 +141,7 @@ def mock_inferred_layout(mock_embedded_text_regions): r.x2, r.y2, text=None, + source=None, type="Text", ) for r in mock_embedded_text_regions diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index bd5aa3e8..42bba9a2 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -28,9 +28,16 @@ def mock_image(): @pytest.fixture() def mock_initial_layout(): - text_block = layout.EmbeddedTextRegion(2, 4, 6, 8, text="A very repetitive narrative. " * 10) + text_block = layout.EmbeddedTextRegion( + 2, + 4, + 6, + 8, + text="A very repetitive narrative. " * 10, + source="Mock", + ) - title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title") + title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title", source="Mock") return [text_block, title_block] @@ -42,11 +49,20 @@ def mock_final_layout(): 4, 6, 8, + source="Mock", text="A very repetitive narrative. 
" * 10, type="NarrativeText", ) - title_block = layoutelement.LayoutElement(1, 2, 3, 4, text="A Catchy Title", type="Title") + title_block = layoutelement.LayoutElement( + 1, + 2, + 3, + 4, + source="Mock", + text="A Catchy Title", + type="Title", + ) return [text_block, title_block] @@ -709,8 +725,11 @@ def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-t assert element.text.startswith(test_snippets[i]) -@pytest.mark.parametrize("colors", ["red", None]) -def test_annotate(colors): +@pytest.mark.parametrize( + ("colors", "add_details", "threshold"), + [("red", False, 0.992), (None, False, 0.992), ("red", True, 0.8)], +) +def test_annotate(colors, add_details, threshold): def check_annotated_image(): annotated_array = np.array(annotated_image) for coords in [coords1, coords2]: @@ -722,9 +741,9 @@ def check_annotated_image(): assert all(annotated_array[y1:y2, x1, i] == expected) assert all(annotated_array[y1:y2, x2, i] == expected) # Make sure almost all the pixels are not changed - assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992 - assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992 - assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992 + assert ((annotated_array[:, :, 0] == 1).mean()) > threshold + assert ((annotated_array[:, :, 1] == 1).mean()) > threshold + assert ((annotated_array[:, :, 2] == 1).mean()) > threshold test_image_arr = np.ones((100, 100, 3), dtype="uint8") image = Image.fromarray(test_image_arr) @@ -735,15 +754,18 @@ def check_annotated_image(): rect2 = elements.Rectangle(*coords2) page.elements = [rect1, rect2] + annotated_image = page.annotate(colors=colors, add_details=add_details, sources=["all"]) + check_annotated_image() + # Scenario 1: where self.image exists - annotated_image = page.annotate(colors=colors) + annotated_image = page.annotate(colors=colors, add_details=add_details) check_annotated_image() # Scenario 2: where self.image is None, but self.image_path exists with patch.object(Image, "open", return_value=image): page.image = None page.image_path = "mock_path_to_image" - annotated_image = page.annotate(colors=colors) + annotated_image = page.annotate(colors=colors, add_details=add_details) check_annotated_image() @@ -775,32 +797,30 @@ def test_image_text_region(text, ocr_strategy, expected, mock_image): ) -@pytest.fixture() -def ordering_layout(): - elements = [ - layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"), - layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"), - layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"), - layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"), - layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"), - layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"), - layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"), - ] - return elements +class MockDetectionModel(layout.UnstructuredObjectDetectionModel): + def initialize(self, *args, **kwargs): + pass + + def predict(self, x): + return [ + layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"), + layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"), + layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"), + layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"), + layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"), + layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"), + 
layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"), + ] -def test_layout_order(mock_image, ordering_layout): +def test_layout_order(mock_image): with tempfile.TemporaryDirectory() as tmpdir: mock_image_path = os.path.join(tmpdir, "mock.jpg") mock_image.save(mock_image_path) - with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object( + with patch.object(layout, "get_model", lambda: MockDetectionModel()), patch.object( layout, "load_pdf", lambda *args, **kwargs: ([[]], [mock_image_path]), - ), patch.object( - layout, - "UnstructuredObjectDetectionModel", - object, ): doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf") page = doc.pages[0] @@ -960,3 +980,20 @@ def test_warning_if_chipper_and_low_dpi(caplog): mock_from_file.assert_called_once() assert caplog.records[0].levelname == "WARNING" assert "DPI >= 300" in caplog.records[0].msg + + +@pytest.mark.parametrize( + ("filename", "img_num", "should_complete"), + [("sample-docs/empty-document.pdf", 0, True), ("sample-docs/empty-document.pdf", 10, False)], +) +def test_get_image(filename, img_num, should_complete): + doc = layout.DocumentLayout.from_file(filename) + page = doc.pages[0] + try: + img = page._get_image(filename, img_num) + # transform img to numpy array + img = np.array(img) + # is a blank image with all pixels white + assert img.mean() == 255.0 + except ValueError: + assert not should_complete diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 9dfdb3d9..59727b62 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -17,12 +17,12 @@ def test_aggregate_ocr_text_by_block(): expected = "A Unified Toolkit" ocr_layout = [ - TextRegion(0, 0, 20, 20, "A"), - TextRegion(50, 50, 150, 150, "Unified"), - TextRegion(150, 150, 300, 250, "Toolkit"), - TextRegion(200, 250, 300, 350, "Deep"), + TextRegion(0, 0, 20, 20, source="OCR", text="A"), + TextRegion(50, 50, 150, 150, source="OCR", text="Unified"), + TextRegion(150, 150, 300, 250, source="OCR", text="Toolkit"), + TextRegion(200, 250, 300, 350, source="OCR", text="Deep"), ] - region = TextRegion(0, 0, 250, 350, "") + region = TextRegion(0, 0, 250, 350, text="") text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5) assert text == expected @@ -65,6 +65,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions): r.x2, r.y2, text=r.text, + source=None, type="UncategorizedText", ) for r in mock_ocr_regions @@ -94,6 +95,7 @@ def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_re r.x2, r.y2, text=r.text, + source=None, type="UncategorizedText", ) for r in mock_ocr_regions @@ -138,6 +140,7 @@ def test_layout_element_do_dict(mock_layout_element): "text": "Sample text", "type": "Text", "prob": None, + "source": None, } assert mock_layout_element.to_dict() == expected @@ -157,6 +160,14 @@ def test_layout_element_from_lp_textblock(): score=0.99, ) - expected = LayoutElement(100, 100, 300, 300, "Sample Text", "Text", 0.99) - + expected = LayoutElement( + 100, + 100, + 300, + 300, + text="Sample Text", + source="detectron2_lp", + type="Text", + prob=0.99, + ) assert LayoutElement.from_lp_textblock(mock_text_block) == expected diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py index f5e00855..4ae6c08a 100644 --- 
a/test_unstructured_inference/models/test_model.py +++ b/test_unstructured_inference/models/test_model.py @@ -58,7 +58,7 @@ def test_raises_uninitialized(): def test_model_initializes_once(): from unstructured_inference.inference import layout - with mock.patch.object(models, "UnstructuredDetectronONNXModel", MockModel), mock.patch.object( + with mock.patch.object(models, "UnstructuredYoloXModel", MockModel), mock.patch.object( models, "models", {}, @@ -72,3 +72,77 @@ def test_model_initializes_once(): assert ( doc.pages[0].elements[0].prob is None ) # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability + + +def test_deduplicate_detected_elements(): + import numpy as np + + from unstructured_inference.inference.elements import intersections + from unstructured_inference.inference.layout import DocumentLayout + from unstructured_inference.models.base import get_model + + model = get_model("yolox_quantized") + # model.confidence_threshold=0.5 + file = "sample-docs/example_table.jpg" + doc = DocumentLayout.from_image_file( + file, + model, + ocr_strategy="never", + supplement_with_ocr_elements=False, + ) + known_elements = [e for e in doc.pages[0].elements if e.type != "UncategorizedText"] + # Compute intersection matrix + intersections_mtx = intersections(*known_elements) + # Get rid off diagonal (cause an element will always intersect itself) + np.fill_diagonal(intersections_mtx, False) + # Now all the elements should be False, because any intersection remains + return not intersections_mtx.all() + + +def test_enhance_regions(): + from unstructured_inference.inference.elements import Rectangle + from unstructured_inference.models.base import get_model + + elements = [ + Rectangle(0, 0, 1, 1), + Rectangle(0.01, 0.01, 1.01, 1.01), + Rectangle(0.02, 0.02, 1.02, 1.02), + Rectangle(0.03, 0.03, 1.03, 1.03), + Rectangle(0.04, 0.04, 1.04, 1.04), + Rectangle(0.05, 0.05, 1.05, 1.05), + Rectangle(0.06, 0.06, 1.06, 1.06), + Rectangle(0.07, 0.07, 1.07, 1.07), + Rectangle(0.08, 0.08, 1.08, 1.08), + Rectangle(0.09, 0.09, 1.09, 1.09), + Rectangle(0.10, 0.10, 1.10, 1.10), + ] + model = get_model("yolox_tiny") + elements = model.enhance_regions(elements, 0.5) + assert len(elements) == 1 + assert (elements[0].x1, elements[0].y1, elements[0].x2, elements[0].x2) == (0, 0, 1.10, 1.10) + + +def test_clean_type(): + from unstructured_inference.inference.layout import LayoutElement + from unstructured_inference.models.base import get_model + + elements = [ + LayoutElement( + 0.6, + 0.6, + 0.65, + 0.65, + type="Table", + ), # One little table nested inside all the others + LayoutElement(0.5, 0.5, 0.7, 0.7, type="Table"), # One nested table + LayoutElement(0, 0, 1, 1, type="Table"), # Big table + LayoutElement(0.01, 0.01, 1.01, 1.01), + LayoutElement(0.02, 0.02, 1.02, 1.02), + LayoutElement(0.03, 0.03, 1.03, 1.03), + LayoutElement(0.04, 0.04, 1.04, 1.04), + LayoutElement(0.05, 0.05, 1.05, 1.05), + ] + model = get_model("yolox_tiny") + elements = model.clean_type(elements, type_to_clean="Table") + assert len(elements) == 1 + assert (elements[0].x1, elements[0].y1, elements[0].x2, elements[0].x2) == (0, 0, 1, 1) diff --git a/test_unstructured_inference/models/test_yolox.py b/test_unstructured_inference/models/test_yolox.py index 122892e5..317876eb 100644 --- a/test_unstructured_inference/models/test_yolox.py +++ b/test_unstructured_inference/models/test_yolox.py @@ -14,7 +14,9 @@ def test_layout_yolox_local_parsing_image(): # NOTE(benjamin) The example image should result in one page result 
assert len(document_layout.pages) == 1 # NOTE(benjamin) The example sent to the test contains 13 detections - assert len(document_layout.pages[0].elements) == 13 + types_known = ["Text", "Section-header", "Page-header"] + known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known] + assert len(known_regions) == 13 assert hasattr( document_layout.pages[0].elements[0], "prob", @@ -32,8 +34,9 @@ def test_layout_yolox_local_parsing_pdf(): content = str(document_layout) assert "libero fringilla" in content assert len(document_layout.pages) == 1 - # NOTE(benjamin) The example sent to the test contains 5 detections - assert len(document_layout.pages[0].elements) == 5 + # NOTE(benjamin) The example sent to the test contains 5 text detections + text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"] + assert len(text_elements) == 5 assert hasattr( document_layout.pages[0].elements[0], "prob", @@ -59,10 +62,10 @@ def test_layout_yolox_local_parsing_empty_pdf(): def test_layout_yolox_local_parsing_image_soft(): - filename = os.path.join("sample-docs", "test-image.jpg") + filename = os.path.join("sample-docs", "example_table.jpg") # NOTE(benjamin) keep_output = True create a file for each image in # localstorage for visualization of the result - document_layout = process_file_with_model(filename, model_name="yolox_tiny", is_image=True) + document_layout = process_file_with_model(filename, model_name="yolox_quantized", is_image=True) # NOTE(benjamin) The example image should result in one page result assert len(document_layout.pages) == 1 # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py index 56a35905..1c1be08c 100644 --- a/test_unstructured_inference/test_elements.py +++ b/test_unstructured_inference/test_elements.py @@ -7,6 +7,10 @@ from PIL import Image from unstructured_inference.inference import elements +from unstructured_inference.inference.layoutelement import ( + LocationlessLayoutElement, + separate, +) skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} @@ -115,7 +119,7 @@ def test_rectangle_area(monkeypatch): rect = elements.Rectangle(0, 0, 0, 0) mockheight.return_value = height mockwidth.return_value = width - assert rect.area() == width * height + assert rect.area == width * height def test_rectangle_iou(): @@ -125,16 +129,16 @@ def test_rectangle_iou(): rect2 = rand_rect(20) assert rect1.intersection_over_union(rect2) == rect2.intersection_over_union(rect1) if rect1.is_in(rect2): - assert rect1.intersection_over_union(rect2) == rect1.area() / rect2.area() + assert rect1.intersection_over_union(rect2) == rect1.area / rect2.area elif rect2.is_in(rect1): - assert rect1.intersection_over_union(rect2) == rect2.area() / rect1.area() + assert rect1.intersection_over_union(rect2) == rect2.area / rect1.area else: if rect1.intersection(rect2) is None: assert rect1.intersection_over_union(rect2) == 0.0 else: - intersection = rect1.intersection(rect2).area() + intersection = rect1.intersection(rect2).area assert rect1.intersection_over_union(rect2) == intersection / ( - rect1.area() + rect2.area() - intersection + rect1.area + rect2.area - intersection ) @@ -191,6 +195,53 @@ def test_intersection_over_min( ) +def test_grow_region_to_match_region(): + from unstructured_inference.inference.elements import Rectangle, grow_region_to_match_region + + a = Rectangle(1, 1, 2, 2) + b = 
Rectangle(1, 1, 5, 5) + grow_region_to_match_region(a, b) + assert a == Rectangle(1, 1, 5, 5) + + +def test_LocationlessLayoutElement(): + text = "Testing text" + type = "Type" + e = LocationlessLayoutElement(text, type) + assert e.to_dict() == {"text": text, "type": type} + + +@pytest.mark.parametrize( + ("rect1", "rect2", "expected"), + [ + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 5.1, 5.1), True), + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 5.2, 5.2), True), + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(7, 7, 10, 10), False), + ], +) +def test_is_almost_subregion_of(rect1, rect2, expected): + assert expected == rect2.is_almost_subregion_of(rect1) + + +@pytest.mark.parametrize( + ("rect1", "rect2"), + [ + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(3, 3, 6, 6)), + (elements.Rectangle(0, 0, 5, 5), elements.Rectangle(6, 6, 8, 8)), + (elements.Rectangle(3, 3, 7, 7), elements.Rectangle(2, 2, 4, 4)), + (elements.Rectangle(2, 2, 4, 11), elements.Rectangle(3, 3, 7, 10)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 3, 7, 10)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(2.5, 2.5, 3.5, 4.5)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 1, 4, 3.5)), + (elements.Rectangle(2, 2, 4, 4), elements.Rectangle(3, 1, 4.5, 3.5)), + ], +) +def test_separate(rect1, rect2): + separate(rect1, rect2) + + # assert not rect1.intersects(rect2) #TODO: fix this test + + @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") def test_ocr_paddle(monkeypatch, caplog): monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle") diff --git a/test_unstructured_inference/test_visualization.py b/test_unstructured_inference/test_visualization.py index c9a2978a..0524451f 100644 --- a/test_unstructured_inference/test_visualization.py +++ b/test_unstructured_inference/test_visualization.py @@ -5,32 +5,7 @@ from PIL import Image from unstructured_inference.inference.elements import Rectangle -from unstructured_inference.visualize import draw_bbox, draw_yolox_bounding_boxes, show_plot - - -@pytest.mark.parametrize( - ("y_coords", "x_coords"), - [ - (10, slice(10, 15)), - (10, slice(16, 50)), - (40, slice(1, 50)), - (slice(10, 40), 1), - (slice(10, 12), 50), - (slice(14, 16), 50), - (slice(19, 40), 50), - ], -) -def test_visualize(y_coords, x_coords): - test_image = np.ones((100, 100, 3)) - boxes = [[1, 10, 50, 40]] - annotated_img = draw_yolox_bounding_boxes( - test_image, - boxes, - scores=[0.8], - cls_ids=[0], - class_names=["thing"], - ) - assert annotated_img[y_coords, x_coords, 0].sum() == 0.0 +from unstructured_inference.visualize import draw_bbox, show_plot def test_draw_bbox(): diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 3b3ca6c4..a2b973a8 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.0" # pragma: no cover +__version__ = "0.6.1" # pragma: no cover diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 66cf7e26..1b965cda 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -102,6 +102,7 @@ def intersection(self, other: Rectangle) -> Optional[Rectangle]: return None return Rectangle(x1, y1, x2, y2) + @property def area(self) -> float: """Gives the area of the rectangle.""" return self.width * self.height @@ -111,8 +112,8 @@ def intersection_over_union(self, other: Rectangle) -> 
float: how similar the regions are. Returns 0 for disjoint rectangles, 1 for two identical rectangles -- area of intersection / area of union.""" intersection = self.intersection(other) - intersection_area = 0.0 if intersection is None else intersection.area() - union_area = self.area() + other.area() - intersection_area + intersection_area = 0.0 if intersection is None else intersection.area + union_area = self.area + other.area - intersection_area return safe_division(intersection_area, union_area) def intersection_over_minimum(self, other: Rectangle) -> float: @@ -120,8 +121,8 @@ def intersection_over_minimum(self, other: Rectangle) -> float: for identifying when one rectangle is almost-a-subset of the other. Returns 0 for disjoint rectangles, 1 when either is a subset of the other.""" intersection = self.intersection(other) - intersection_area = 0.0 if intersection is None else intersection.area() - min_area = min(self.area(), other.area()) + intersection_area = 0.0 if intersection is None else intersection.area + min_area = min(self.area, other.area) return safe_division(intersection_area, min_area) def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = 0.75) -> bool: @@ -129,9 +130,9 @@ def is_almost_subregion_of(self, other: Rectangle, subregion_threshold: float = comparing the intersection area over self area to some threshold, and checking whether self is the smaller rectangle.""" intersection = self.intersection(other) - intersection_area = 0.0 if intersection is None else intersection.area() - return (subregion_threshold < safe_division(intersection_area, self.area())) and ( - self.area() <= other.area() + intersection_area = 0.0 if intersection is None else intersection.area + return (subregion_threshold < safe_division(intersection_area, self.area)) and ( + self.area <= other.area ) @@ -148,6 +149,8 @@ def minimal_containing_region(*regions: Rectangle) -> Rectangle: def partition_groups_from_regions(regions: Collection[Rectangle]) -> List[List[Rectangle]]: """Partitions regions into groups of regions based on proximity. Returns list of lists of regions, each list corresponding with a group""" + if len(regions) == 0: + return [] padded_regions = [ r.vpad(r.height * inference_config.ELEMENTS_V_PADDING_COEF).hpad( r.height * inference_config.ELEMENTS_H_PADDING_COEF, @@ -194,6 +197,7 @@ def intersections(*rects: Rectangle): @dataclass class TextRegion(Rectangle): text: Optional[str] = None + source: Optional[str] = None def __str__(self) -> str: return str(self.text) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 5768e2a4..2cdba3bc 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -261,7 +261,9 @@ def get_elements_with_detection_model( # NOTE(mrobinson) - We'll want make this model inference step some kind of # remote call in the future. 
inferred_layout: List[LayoutElement] = self.detection_model(self.image) - + inferred_layout = UnstructuredObjectDetectionModel.deduplicate_detected_elements( + inferred_layout, + ) if self.ocr_mode == OCRMode.INDIVIDUAL_BLOCKS.value: ocr_layout = None elif self.ocr_mode == OCRMode.FULL_PAGE.value: @@ -339,6 +341,7 @@ def get_elements_with_detection_model( if inplace: self.elements = elements return None + return elements def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutElement]: @@ -400,8 +403,14 @@ def annotate( colors: Optional[Union[List[str], str]] = None, image_dpi: int = 200, annotation_data: Optional[dict[str, dict]] = None, + add_details: bool = False, + sources: List[str] = ["all"], ) -> Image.Image: - """Annotates the elements on the page image.""" + """Annotates the elements on the page image. + if add_details is True, and the elements contain type and source attributes, then + the type and source will be added to the image. + sources is a list of sources to annotate. If sources is ["all"], then all sources will be + annotated. Current sources allowed are "yolox","detectron2_onnx" and "detectron2_lp" """ if colors is None: colors = ["red" for _ in self.elements] if isinstance(colors, str): @@ -422,7 +431,9 @@ def annotate( if annotation_data is None: for el, color in zip(self.elements, colors): if isinstance(el, Rectangle): - img = draw_bbox(img, el, color=color) + required_source = getattr(el, "source", None) + if "all" in sources or required_source in sources: + img = draw_bbox(img, el, color=color, details=add_details) else: for attribute, style in annotation_data.items(): if hasattr(self, attribute) and getattr(self, attribute): @@ -430,7 +441,15 @@ def annotate( width = style["width"] for region in getattr(self, attribute): if isinstance(region, Rectangle): - img = draw_bbox(img, region, color=color, width=width) + required_source = getattr(el, "source", None) + if "all" in sources or required_source in sources: + img = draw_bbox( + img, + region, + color=color, + width=width, + details=add_details, + ) return img @@ -660,7 +679,7 @@ def load_pdf( text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text) - if text_region.area() > 0: + if text_region.area > 0: layout.append(text_region) layouts.append(layout) @@ -719,7 +738,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: (x1, y1, x2, y2) = l, t, l + w, t + h text = ocr_data["text"][i] if text: - text_region = TextRegion(x1, y1, x2, y2, text) + text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR") text_regions.append(text_region) return text_regions diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index c91bb5bc..887ecc33 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Collection, List, Optional, cast +from typing import Collection, List, Optional, Union, cast import numpy as np from layoutparser.elements.layout import TextBlock @@ -18,7 +18,6 @@ partition_groups_from_regions, region_bounding_boxes_are_almost_the_same, ) -from unstructured_inference.models import tables @dataclass @@ -54,6 +53,7 @@ def to_dict(self) -> dict: "text": self.text, "type": self.type, "prob": self.prob, + "source": self.source, } return out_dict @@ -63,7 +63,9 @@ def from_region(cls, region: Rectangle): x1, y1, x2, y2 = 
region.x1, region.y1, region.x2, region.y2 text = region.text if hasattr(region, "text") else None type = region.type if hasattr(region, "type") else None - return cls(x1, y1, x2, y2, text, type) + prob = region.prob if hasattr(region, "prob") else None + source = region.source if hasattr(region, "source") else None + return cls(x1, y1, x2, y2, text=text, source=source, type=type, prob=prob) @classmethod def from_lp_textblock(cls, textblock: TextBlock): @@ -71,12 +73,14 @@ def from_lp_textblock(cls, textblock: TextBlock): x1, y1, x2, y2 = textblock.coordinates text = textblock.text type = textblock.type - score = textblock.score - return cls(x1, y1, x2, y2, text, type, prob=score) + prob = textblock.score + return cls(x1, y1, x2, y2, text=text, source="detectron2_lp", type=type, prob=prob) def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str: """Extract the contents of a table.""" + from unstructured_inference.models import tables + tables.load_agent() if tables.tables_agent is None: raise RuntimeError("Unable to load table extraction agent.") @@ -159,6 +163,7 @@ def merge_inferred_layout_with_extracted_layout( el.y2, text=el.text, type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText", + source=el.source, ) for el in extracted_elements_to_add ] @@ -305,8 +310,9 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: max_y2 = max([tr.y2 for tr in regions]) merged_text = " ".join([tr.text for tr in regions if tr.text]) - - return TextRegion(min_x1, min_y1, max_x2, max_y2, merged_text) + sources = [*{tr.source for tr in regions}] + source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore + return TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: @@ -326,12 +332,53 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE r.x2, r.y2, text=r.text, + source=None, type="UncategorizedText", ) for r in merged_regions ] +def separate(region_a: Union[LayoutElement, Rectangle], region_b: Union[LayoutElement, Rectangle]): + """Reduce leftmost rectangle to don't overlap with the other""" + + def reduce(keep: Rectangle, reduce: Rectangle): + # Asume intersection + + # Other is down + if reduce.y2 > keep.y2 and reduce.x1 < keep.x2: + # other is down-right + if reduce.x2 > keep.x2 and reduce.y2 > keep.y2: + reduce.x1 = keep.x2 * 1.01 + reduce.y1 = keep.y2 * 1.01 + return + # other is down-left + if reduce.x1 < keep.x1 and reduce.y1 < keep.y2: + reduce.y1 = keep.y2 + return + # other is centered + reduce.y1 = keep.y2 + else: # other is up + # other is up-right + if reduce.x2 > keep.x2 and reduce.y1 < keep.y1: + reduce.y2 = keep.y1 + return + # other is left + if reduce.x1 < keep.x1 and reduce.y1 < keep.y1: + reduce.y2 = keep.y1 + return + # other is centered + reduce.y2 = keep.y1 + + if not region_a.intersects(region_b): + return + else: + if region_a.area > region_b.area: + reduce(keep=region_a, reduce=region_b) + else: + reduce(keep=region_b, reduce=region_a) + + # NOTE(alan): The right way to do this is probably to rewrite LayoutElement as well as the different # Region types to not subclass Rectangle, and instead have an optional bbox property that is a # Rectangle. I or someone else will have to get to that later. 
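As an illustration of the overlap-handling logic added in this patch, the sketch below shows how the new `separate` helper behaves on two overlapping regions. It assumes only the `Rectangle` and `separate` signatures introduced above; the coordinates are invented for illustration, and, as the TODO in `test_separate` notes, full separation is not guaranteed for every overlap pattern.

```python
from unstructured_inference.inference.elements import Rectangle
from unstructured_inference.inference.layoutelement import separate

# A large detected region and a smaller one overlapping its bottom-right corner.
big = Rectangle(0, 0, 10, 10)
small = Rectangle(8, 8, 12, 12)

# separate() keeps the larger rectangle intact and shrinks the smaller one so the
# two regions no longer overlap for this down-right configuration.
separate(big, small)

print(big.intersects(small))  # False: the smaller box was trimmed out of the overlap
print(small.x1, small.y1)     # now starts just past the kept box's bottom-right corner
```

In `enhance_regions`, this trimming is applied only when the intersection-over-minimum score of two boxes is small; larger overlaps are instead resolved by growing one region to cover the other.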
diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py index b0b2898e..a8323f1b 100644 --- a/unstructured_inference/models/base.py +++ b/unstructured_inference/models/base.py @@ -23,7 +23,7 @@ UnstructuredYoloXModel, ) -DEFAULT_MODEL = "detectron2_onnx" +DEFAULT_MODEL = "yolox_quantized" models: Dict[str, UnstructuredModel] = {} diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index b37f3d72..7b9df081 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -158,6 +158,7 @@ def postprocess( text=None, type=detected_class, prob=conf, + source="detectron2_onnx", ) regions.append(region) diff --git a/unstructured_inference/models/unstructuredmodel.py b/unstructured_inference/models/unstructuredmodel.py index 8d61b49a..634f2dd6 100644 --- a/unstructured_inference/models/unstructuredmodel.py +++ b/unstructured_inference/models/unstructuredmodel.py @@ -1,10 +1,18 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, List +from typing import TYPE_CHECKING, Any, List, cast +import numpy as np from PIL.Image import Image +from unstructured_inference.inference.elements import ( + grow_region_to_match_region, + intersections, + partition_groups_from_regions, +) +from unstructured_inference.inference.layoutelement import separate + if TYPE_CHECKING: from unstructured_inference.inference.layoutelement import ( LayoutElement, @@ -55,6 +63,125 @@ def __call__(self, x: Image) -> List[LayoutElement]: """Inference using function call interface.""" return super().__call__(x) + @staticmethod + def enhance_regions( + elements: List[LayoutElement], + iom_to_merge: float = 0.3, + ) -> List[LayoutElement]: + """This function traverses all the elements and either deletes nested elements, + or merges or splits them depending on the iom score for both regions""" + intersections_mtx = intersections(*elements) + + for i, row in enumerate(intersections_mtx): + first = elements[i] + if first: + # We get only the elements which intersected + indices_to_check = np.where(row)[0] + # Delete the first element, since it will always intersect with itself + indices_to_check = indices_to_check[indices_to_check != i] + if len(indices_to_check) == 0: + continue + if len(indices_to_check) > 1: # sort by iom + iom_to_check = [ + (j, first.intersection_over_minimum(elements[j])) + for j in indices_to_check + if elements[j] is not None + ] + iom_to_check.sort( + key=lambda x: x[1], + reverse=True, + ) # sort elements by iom, so we first check the greatest + indices_to_check = [x[0] for x in iom_to_check if x[0] != i] # type:ignore + for j in indices_to_check: + if elements[j] is None or elements[i] is None: + continue + second = elements[j] + intersection = first.intersection( + second, + ) # we know it does, but need the region + first_inside_second = first.is_in(second) + second_inside_first = second.is_in(first) + + if first_inside_second and not second_inside_first: + elements[i] = None # type:ignore + elif second_inside_first and not first_inside_second: + # delete second element + elements[j] = None # type:ignore + elif intersection: + iom = first.intersection_over_minimum(second) + if iom < iom_to_merge: # small + separate(first, second) + # The rectangle could become too small, which is a + # good size to delete? 
+ else: # big + # merge + if first.area > second.area: + grow_region_to_match_region(first, second) + elements[j] = None # type:ignore + else: + grow_region_to_match_region(second, first) + elements[i] = None # type:ignore + + elements = [e for e in elements if e is not None] + return elements + + @staticmethod + def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]: + """After this function, the list of elements will not contain any element inside + of the type specified""" + target_elements = [e for e in elements if e.type == type_to_clean] + other_elements = [e for e in elements if e.type != type_to_clean] + if len(target_elements) == 0 or len(other_elements) == 0: + return elements + + # Sort elements from biggest to smallest + target_elements.sort(key=lambda e: e.area, reverse=True) + other_elements.sort(key=lambda e: e.area, reverse=True) + + # First check if targets contains each other + for element in target_elements: # Just handles containment or little overlap + contains = [ + e for e in target_elements if e.is_almost_subregion_of(element) and e != element + ] + for contained in contains: + target_elements.remove(contained) + # Then check if remaining elements intersect with targets + other_elements = filter( + lambda e: not any(e.is_almost_subregion_of(target) for target in target_elements), + other_elements, + ) # type:ignore + + final_elements = list(other_elements) + final_elements.extend(target_elements) + # Note(benjamin): could use bisect.insort, + # but need to add < operator for + # LayoutElement in python <3.10 + final_elements.sort(key=lambda e: e.y1) + return final_elements + + @staticmethod + def deduplicate_detected_elements( + elements: List[LayoutElement], + min_text_size: int = 15, + ) -> List[LayoutElement]: + """Deletes overlapping elements in a list of elements.""" + + if len(elements) <= 1: + return elements + + cleaned_elements: List[LayoutElement] = [] + # TODO: Delete nested elements with low or None probability + # TODO: Keep most confident + # TODO: Better to grow horizontally than vertically? 
+ groups_tmp = partition_groups_from_regions(elements) + groups = cast(List[List["LayoutElement"]], groups_tmp) + for g in groups: + all_types = {e.type for e in g} + for type in all_types: + g = UnstructuredObjectDetectionModel.clean_type(g, type) + cleaned_elements.extend(g) + return cleaned_elements + class UnstructuredElementExtractionModel(UnstructuredModel): """Wrapper class for object extraction models used by unstructured.""" diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index c4615cc1..f5103698 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -18,7 +18,6 @@ from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel from unstructured_inference.utils import LazyDict, LazyEvaluateInfo -from unstructured_inference.visualize import draw_yolox_bounding_boxes YOLOX_LABEL_MAP = { 0: "Caption", @@ -111,7 +110,6 @@ def image_processing( input_shape = (1024, 768) origin_img = np.array(image) img, ratio = preprocess(origin_img, input_shape) - # TODO (benjamin): We should use models.get_model() but currenly returns Detectron model session = self.model ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]} @@ -143,7 +141,16 @@ def image_processing( # being (x1,y1) the top left and (x2,y2) the bottom right x1, y1, x2, y2, prob, class_id = det.tolist() detected_class = self.layout_classes[int(class_id)] - region = LayoutElement(x1, y1, x2, y2, text=None, type=detected_class, prob=prob) + region = LayoutElement( + x1, + y1, + x2, + y2, + text=None, + type=detected_class, + prob=prob, + source="yolox", + ) regions.append(region) @@ -153,21 +160,6 @@ def image_processing( return page_layout - def annotate_image(self, image_fn, dets, out_fn): - """Draw bounding boxes and prediction metadata.""" - origin_img = np.array(Image.open(image_fn)) - final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] - - annotated_image = draw_yolox_bounding_boxes( - origin_img, - final_boxes, - final_scores, - final_cls_inds, - conf=0.3, - class_names=self.layout_classes, - ) - cv2.imwrite(out_fn, annotated_image) - # Note: preprocess function was named preproc on original source diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py index f21daf8f..c5a70091 100644 --- a/unstructured_inference/utils.py +++ b/unstructured_inference/utils.py @@ -1,12 +1,13 @@ import os from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Union +from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterable, Iterator, Union import cv2 import numpy as np from PIL import Image from unstructured_inference.constants import AnnotationResult +from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.visualize import show_plot if TYPE_CHECKING: @@ -125,6 +126,17 @@ def annotate_layout_elements( show_plot(img, desired_width=plot_desired_width) +def tag(elements: Iterable[LayoutElement]): + """Asign an numeric id to the elements in the list. 
+ Useful for debugging""" + colors = ["red", "blue", "green", "magenta", "brown"] + for i, e in enumerate(elements): + e.text = f"-{i}-:{e.text}" + # currently not a property + e.id = i # type:ignore + e.color = colors[i % len(colors)] # type:ignore + + def pad_image_with_background_color( image: Image.Image, pad: int = 10, diff --git a/unstructured_inference/visualize.py b/unstructured_inference/visualize.py index b1c47fba..8cba46ac 100644 --- a/unstructured_inference/visualize.py +++ b/unstructured_inference/visualize.py @@ -1,68 +1,43 @@ # Copyright (c) Megvii Inc. All rights reserved. # Unstructured modified the original source code found at # https://github.com/Megvii-BaseDetection/YOLOX/blob/ac379df3c97d1835ebd319afad0c031c36d03f36/yolox/utils/visualize.py +import typing from typing import Optional, Union -import cv2 import matplotlib.pyplot as plt import numpy as np +from PIL import ImageFont from PIL.Image import Image from PIL.ImageDraw import ImageDraw from unstructured_inference.inference.elements import Rectangle -def draw_bbox(image: Image, rect: Rectangle, color: str = "red", width=1) -> Image: +@typing.no_type_check +def draw_bbox( + image: Image, + rect: Rectangle, + color: str = "red", + width=1, + details: bool = False, +) -> Image: """Draws bounding box in image""" - img = image.copy() - draw = ImageDraw(img) - topleft, _, bottomright, _ = rect.coordinates - draw.rectangle((topleft, bottomright), outline=color, width=width) - return img - - -# NOTE: in original files from YoloX 'draw_yolox_bounding_boxes' function is named "vis" -# TODO(alan): Need type hints here -def draw_yolox_bounding_boxes(img, boxes, scores, cls_ids, conf=0.5, class_names=None): - """ - This function draws bounding boxes over the img argument, using - boxes from detections from YoloX. - img is a numpy array from cv2.imread() - Scores refers to the probability of each detection. - cls_ids are the class of each detection - conf is the confidence required to draw the bounding box - class_names is a list, where class_names[cls_ids[i]] should be the name - for the i-th bounding box. - """ - for i in range(len(boxes)): - box = boxes[i] - cls_id = int(cls_ids[i]) - score = scores[i] - if score < conf: - continue - x0 = int(box[0]) - y0 = int(box[1]) - x1 = int(box[2]) - y1 = int(box[3]) - - color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() - text = f"{class_names[cls_id]}:{score * 100:.1f}%" - txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) - font = cv2.FONT_HERSHEY_SIMPLEX - - txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] - cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) - - txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() - cv2.rectangle( - img, - (x0, y0 + 1), - (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), - txt_bk_color, - -1, - ) - cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) - + try: + img = image.copy() + draw = ImageDraw(img) + topleft, _, bottomright, _ = rect.coordinates + c = getattr(rect, "color", color) + if details: + source = getattr(rect, "source", "Unknown") + type = getattr(rect, "type", "") + kbd = ImageFont.truetype("Keyboard.ttf", 20) + draw.text(topleft, text=f"{type} {source}", fill=c, font=kbd) + draw.rectangle((topleft, bottomright), outline=c, width=width) + except OSError: + print("Failed to find font file. 
Skipping details.") + img = draw_bbox(image, rect, color, width) + except Exception as e: + print(f"Failed to draw bounding box: {e}") return img From 8c6d6693c3d80c11f19e5a4f4138822a6479406f Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 22 Sep 2023 13:37:59 -0500 Subject: [PATCH 04/11] feat: make table transformer parameters configurable (#224) - refactor `tables.py` so that the structure element confidence threshold values are loaded from `inference_config` - refactor intersection over box area threshold in `objects_to_structure` to config intead of using hardwired value of 0.5 (default is still 0.5) --- CHANGELOG.md | 4 +++ unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 36 +++++++++++++++++++++++++ unstructured_inference/models/tables.py | 33 +++++++++++++++-------- 4 files changed, 63 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5898e932..cc9e79b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.2 + +* move the confidence threshold for table transformer to config + ## 0.6.1 * YoloX_quantized is now the default model. This models detects most diverse types and detect tables better than previous model. diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index a2b973a8..f42820dd 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.1" # pragma: no cover +__version__ = "0.6.2" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py index 5f1f90f3..e8f287e9 100644 --- a/unstructured_inference/config.py +++ b/unstructured_inference/config.py @@ -46,6 +46,42 @@ def TABLE_IMAGE_BACKGROUND_PAD(self) -> int: """ return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0) + @property + def TT_TABLE_CONF(self) -> float: + """confidence threshold for table identified by table transformer""" + return self._get_float("TT_TABLE_CONF", 0.5) + + @property + def TABLE_COLUMN_CONF(self) -> float: + """confidence threshold for column identified by table transformer""" + return self._get_float("TABLE_COLUMN_CONF", 0.5) + + @property + def TABLE_ROW_CONF(self) -> float: + """confidence threshold for column identified by table transformer""" + return self._get_float("TABLE_ROW_CONF", 0.5) + + @property + def TABLE_COLUMN_HEADER_CONF(self) -> float: + """confidence threshold for column header identified by table transformer""" + return self._get_float("TABLE_COLUMN_HEADER_CONF", 0.5) + + @property + def TABLE_PROJECTED_ROW_HEADER_CONF(self) -> float: + """confidence threshold for projected row header identified by table transformer""" + return self._get_float("TABLE_PROJECTED_ROW_HEADER_CONF", 0.5) + + @property + def TABLE_SPANNING_CELL_CONF(self) -> float: + """confidence threshold for table spanning cells identified by table transformer""" + return self._get_float("TABLE_SPANNING_CELL_CONF", 0.5) + + @property + def TABLE_IOB_THRESHOLD(self) -> float: + """minimum intersection over box area ratio for a box to be considered part of a larger box + it intersects""" + return self._get_float("TABLE_IOB_THRESHOLD", 0.5) + @property def LAYOUT_SAME_REGION_THRESHOLD(self) -> float: """threshold for two layouts' bounding boxes to be considered as the same region diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index bd1bc4ce..7d48c41b 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ 
-177,12 +177,13 @@ def get_class_map(data_type: str): structure_class_thresholds = { - "table": 0.5, - "table column": 0.5, - "table row": 0.5, - "table column header": 0.5, - "table projected row header": 0.5, - "table spanning cell": 0.5, + "table": inference_config.TT_TABLE_CONF, + "table column": inference_config.TABLE_COLUMN_CONF, + "table row": inference_config.TABLE_ROW_CONF, + "table column header": inference_config.TABLE_COLUMN_HEADER_CONF, + "table projected row header": inference_config.TABLE_PROJECTED_ROW_HEADER_CONF, + "table spanning cell": inference_config.TABLE_SPANNING_CELL_CONF, + # FIXME (yao) this parameter doesn't seem to be used at all in inference? Can we remove it "no object": 10, } @@ -282,8 +283,16 @@ def objects_to_structures(objects, tokens, class_thresholds): table_structures = [] for table in tables: - table_objects = [obj for obj in objects if iob(obj["bbox"], table["bbox"]) >= 0.5] - table_tokens = [token for token in tokens if iob(token["bbox"], table["bbox"]) >= 0.5] + table_objects = [ + obj + for obj in objects + if iob(obj["bbox"], table["bbox"]) >= inference_config.TABLE_IOB_THRESHOLD + ] + table_tokens = [ + token + for token in tokens + if iob(token["bbox"], table["bbox"]) >= inference_config.TABLE_IOB_THRESHOLD + ] structure = {} @@ -302,7 +311,7 @@ def objects_to_structures(objects, tokens, class_thresholds): for obj in rows: obj["column header"] = False for header_obj in column_headers: - if iob(obj["bbox"], header_obj["bbox"]) >= 0.5: + if iob(obj["bbox"], header_obj["bbox"]) >= inference_config.TABLE_IOB_THRESHOLD: obj["column header"] = True # Refine table structures @@ -478,7 +487,7 @@ def structure_to_cells(table_structure, tokens): spanning_cell_rect = Rect(list(spanning_cell["bbox"])) if ( spanning_cell_rect.intersect(cell_rect).get_area() / cell_rect.get_area() - ) > 0.5: + ) > inference_config.TABLE_IOB_THRESHOLD: cell["subcell"] = True break @@ -499,7 +508,9 @@ def structure_to_cells(table_structure, tokens): for subcell in subcells: subcell_rect = Rect(list(subcell["bbox"])) subcell_rect_area = subcell_rect.get_area() - if (subcell_rect.intersect(spanning_cell_rect).get_area() / subcell_rect_area) > 0.5: + if ( + subcell_rect.intersect(spanning_cell_rect).get_area() / subcell_rect_area + ) > inference_config.TABLE_IOB_THRESHOLD: if cell_rect is None: cell_rect = Rect(list(subcell["bbox"])) else: From 35ebea7968fe6a30cb3606b0486f8dfe6d833a54 Mon Sep 17 00:00:00 2001 From: Yao You Date: Fri, 22 Sep 2023 14:21:55 -0500 Subject: [PATCH 05/11] feat: add pre commit hook (#220) - add config yaml (copied from `unstructured` repo) - helps with dev's Quality of Life --- .pre-commit-config.yaml | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..88da6a35 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,38 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v4.3.0" + hooks: + - id: check-added-large-files + - id: check-toml + - id: check-yaml + - id: check-json + - id: check-xml + - id: end-of-file-fixer + exclude: \.json$ + include: \.py$ + - id: trailing-whitespace + - id: mixed-line-ending + + - repo: https://github.com/psf/black + rev: 22.10.0 + hooks: + - id: black + args: ["--line-length=100"] + language_version: python3 + + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: "v0.0.230" + hooks: + - id: ruff + args: + [ + "--fix", + 
"--select=I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402", + "--ignore=PT011,PT012,SIM117", + ] + + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + language_version: python3 From cb2aff2e31572f44147f69b7458eea548cd54a4f Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 25 Sep 2023 12:55:16 -0500 Subject: [PATCH 06/11] fix: padded boxes are not rescaled/shifted correctly (#229) --- CHANGELOG.md | 8 +- .../models/test_tables.py | 73 +++++++++++++++---- unstructured_inference/__version__.py | 2 +- unstructured_inference/models/tables.py | 4 +- 4 files changed, 67 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc9e79b6..3d70b588 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.3 + +* fix a bug where padded table structure bounding boxes are not shifted back into the original image coordinates correctly + ## 0.6.2 * move the confidence threshold for table transformer to config @@ -5,8 +9,8 @@ ## 0.6.1 * YoloX_quantized is now the default model. This models detects most diverse types and detect tables better than previous model. -* Since detection models tend to nest elements inside others(specifically in Tables), an algorithm has been added for reducing this - behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps +* Since detection models tend to nest elements inside others(specifically in Tables), an algorithm has been added for reducing this + behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps reduce duplicated content. * Add `source` property to our elements, so you can know where the information was generated (OCR or detection model) diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index 585f17f1..fd95bcc4 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -1,6 +1,9 @@ import os +import numpy as np import pytest +import torch +from PIL import Image from transformers.models.table_transformer.modeling_table_transformer import ( TableTransformerDecoder, ) @@ -11,6 +14,18 @@ skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} +@pytest.fixture() +def table_transformer(): + table_model = tables.UnstructuredTableTransformerModel() + table_model.initialize(model="microsoft/table-transformer-structure-recognition") + return table_model + + +@pytest.fixture() +def example_image(): + return Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") + + @pytest.mark.parametrize( "model_path", [ @@ -328,13 +343,8 @@ def test_align_rows(rows, bbox, output): assert postprocess.align_rows(rows, bbox) == output -def test_table_prediction_tesseract(): - table_model = tables.UnstructuredTableTransformerModel() - from PIL import Image - - table_model.initialize(model="microsoft/table-transformer-structure-recognition") - img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") - prediction = table_model.predict(img) +def test_table_prediction_tesseract(table_transformer, example_image): + prediction = table_transformer.predict(example_image) # assert rows spans two rows are detected assert '
' in prediction # one of the safest rows to detect should be present @@ -351,28 +361,24 @@ def test_table_prediction_tesseract(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI") -def test_table_prediction_paddle(monkeypatch): +def test_table_prediction_paddle(monkeypatch, example_image): monkeypatch.setenv("TABLE_OCR", "paddle") table_model = tables.UnstructuredTableTransformerModel() - from PIL import Image table_model.initialize(model="microsoft/table-transformer-structure-recognition") - img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") - prediction = table_model.predict(img) + prediction = table_model.predict(example_image) # Note(yuming): lossen paddle table prediction output test since performance issue # and results are different in different platforms (i.e., gpu vs cpu) assert len(prediction) -def test_table_prediction_invalid_table_ocr(monkeypatch): +def test_table_prediction_invalid_table_ocr(monkeypatch, example_image): monkeypatch.setenv("TABLE_OCR", "invalid_table_ocr") with pytest.raises(ValueError): table_model = tables.UnstructuredTableTransformerModel() - from PIL import Image table_model.initialize(model="microsoft/table-transformer-structure-recognition") - img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB") - _ = table_model.predict(img) + _ = table_model.predict(example_image) def test_intersect(): @@ -581,3 +587,40 @@ def test_cells_to_html(): "cols
sub cell 1sub cell 2
" ) assert tables.cells_to_html(cells) == expected + + +def test_padded_results_has_right_dimensions(table_transformer, example_image): + str_class_name2idx = tables.get_class_map("structure") + # a simpler mapping so we keep all structure in the returned objs below for test + str_class_idx2name = {v: "table cell" for v in str_class_name2idx.values()} + # pad size is no more than 10% of the original image so we can setup test below easier + pad = int(min(example_image.size) / 10) + + structure = table_transformer.get_structure(example_image, pad_for_structure_detection=pad) + # boxes deteced OUTSIDE of the original image; this shouldn't happen but we want to make sure + # the code handles it as expected + structure["pred_boxes"][0][0, :2] = 0.5 + structure["pred_boxes"][0][0, 2:] = 1.0 + # mock a box we know are safly inside the original image with known positions + width, height = example_image.size + padded_width = width + pad * 2 + padded_height = height + pad * 2 + original = [1, 3, 101, 53] + structure["pred_boxes"][0][1, :] = torch.tensor( + [ + (51 + pad) / padded_width, + (28 + pad) / padded_height, + 100 / padded_width, + 50 / padded_height, + ], + ) + objs = tables.outputs_to_objects(structure, example_image.size, str_class_idx2name) + np.testing.assert_almost_equal(objs[0]["bbox"], [-pad, -pad, width + pad, height + pad], 4) + np.testing.assert_almost_equal(objs[1]["bbox"], original, 4) + # a more strict test would be to constrain the actual detected boxes to be within the original + # image but that requires the table transformer to behave in certain ways and do not + # actually test the padding math; so here we use the relaxed condition + for obj in objs[2:]: + x1, y1, x2, y2 = obj["bbox"] + assert max(x1, x2) < width + pad + assert max(y1, y2) < height + pad diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index f42820dd..1f6c2e4b 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.2" # pragma: no cover +__version__ = "0.6.3" # pragma: no cover diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index 7d48c41b..d4885296 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -220,11 +220,11 @@ def outputs_to_objects(outputs, img_size, class_idx2name): pred_bboxes = outputs["pred_boxes"].detach().cpu()[0] pad = outputs.get("pad_for_structure_detection", 0) - scale_size = (img_size[0] + pad, img_size[1] + pad) + scale_size = (img_size[0] + pad * 2, img_size[1] + pad * 2) pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, scale_size)] # unshift the padding; padding effectively shifted the bounding boxes of structures in the # original image with half of the total pad - shift_size = pad / 2 + shift_size = pad objects = [] for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes): From c4d3e8b0bf4321caba817efd6e3384a573461aff Mon Sep 17 00:00:00 2001 From: Yao You Date: Mon, 25 Sep 2023 15:33:05 -0500 Subject: [PATCH 07/11] feat: add autoscaling for table images (#210) Auto scale table images so that the text height is optimum for `tesseract` OCR inference. 
This functionality will scaling images where the estimated mean text height based on the `inference_config` setup: table images with text height below `inference_config.TESSERACT_MIN_TEXT_HEIGHT` or above `inference_config.TESSERACT_MAX_TEXT_HEIGHT` are scaled so that the text height is at `inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT`. This PR resolves [CORE-1863](https://unstructured-ai.atlassian.net/browse/CORE-1863) ## test - this PR adds a unit test to confirm auto scale is triggered - test the tokens computed without zoom and with zoom with the attached image: with zoom the tokens should include the correct text "Japanese" in the table on the page. Without zoom (call get_tokens using main) we won't see this token and instead you might find a token that look like "Inpanere". For this specific document it is best to set `TESSERACT_MIN_TEXT_HEIGHT` to 12. ![layout-parser-paper-with-table](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/7963bba0-67cb-48ee-b338-52b1c2620fc0) [CORE-1863]: https://unstructured-ai.atlassian.net/browse/CORE-1863?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ --- CHANGELOG.md | 5 ++ requirements/base.txt | 2 +- requirements/dev.txt | 4 +- requirements/test.in | 1 + requirements/test.txt | 6 ++- .../models/test_tables.py | 16 ++++++ unstructured_inference/__version__.py | 2 +- unstructured_inference/config.py | 30 ++++++++++- unstructured_inference/models/tables.py | 54 +++++++++++++++++-- unstructured_inference/models/tesseract.py | 4 ++ 10 files changed, 113 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d70b588..49b29b4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.4 + +* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task +* add the new image auto scaling parameters to `config.py` + ## 0.6.3 * fix a bug where padded table structure bounding boxes are not shifted back into the original image coordinates correctly diff --git a/requirements/base.txt b/requirements/base.txt index 38ea4933..10836a40 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -44,7 +44,7 @@ humanfriendly==10.0 # via coloredlogs idna==3.4 # via requests -importlib-resources==6.0.1 +importlib-resources==6.1.0 # via matplotlib iopath==0.1.10 # via layoutparser diff --git a/requirements/dev.txt b/requirements/dev.txt index ff65ca27..f397245b 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -101,7 +101,7 @@ importlib-metadata==6.8.0 # jupyterlab # jupyterlab-server # nbconvert -importlib-resources==6.0.1 +importlib-resources==6.1.0 # via # -c requirements/base.txt # jsonschema @@ -139,7 +139,7 @@ json5==0.9.14 # via jupyterlab-server jsonpointer==2.4 # via jsonschema -jsonschema[format-nongpl]==4.19.0 +jsonschema[format-nongpl]==4.19.1 # via # jupyter-events # jupyterlab-server diff --git a/requirements/test.in b/requirements/test.in index f3957593..d3846b25 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -12,6 +12,7 @@ flake8 flake8-docstrings mypy pytest-cov +pytest-mock pdf2image>=1.16.2 huggingface_hub>=0.11.1 ruff diff --git a/requirements/test.txt b/requirements/test.txt index ce238d69..195f680b 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -97,9 +97,13 @@ pydocstyle==6.3.0 pyflakes==3.1.0 # via flake8 pytest==7.4.2 - # via pytest-cov + # via + # pytest-cov + # pytest-mock pytest-cov==4.1.0 # via -r 
requirements/test.in +pytest-mock==3.11.1 + # via -r requirements/test.in pyyaml==6.0.1 # via # -c requirements/base.txt diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py index fd95bcc4..c6d3371a 100644 --- a/test_unstructured_inference/models/test_tables.py +++ b/test_unstructured_inference/models/test_tables.py @@ -1,4 +1,5 @@ import os +from pathlib import Path import numpy as np import pytest @@ -589,6 +590,21 @@ def test_cells_to_html(): assert tables.cells_to_html(cells) == expected +def test_auto_zoom(mocker): + spy = mocker.spy(tables, "zoom_image") + model = tables.UnstructuredTableTransformerModel() + model.initialize("microsoft/table-transformer-structure-recognition") + image = Image.open( + Path(os.path.dirname(os.path.abspath(__file__))) + / ".." + / ".." + / "sample-docs" + / "layout-parser-paper-fast.jpg", + ) + model.get_tokens(image) + assert spy.call_count == 1 + + def test_padded_results_has_right_dimensions(table_transformer, example_image): str_class_name2idx = tables.get_class_map("structure") # a simpler mapping so we keep all structure in the returned objs below for test diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 1f6c2e4b..4bf914ad 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.3" # pragma: no cover +__version__ = "0.6.4" # pragma: no cover diff --git a/unstructured_inference/config.py b/unstructured_inference/config.py index e8f287e9..aca2550c 100644 --- a/unstructured_inference/config.py +++ b/unstructured_inference/config.py @@ -44,7 +44,35 @@ def TABLE_IMAGE_BACKGROUND_PAD(self) -> int: The padding adds NO image data around an identified table bounding box; it simply adds white background around the image """ - return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0) + return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 20) + + @property + def TESSERACT_MIN_TEXT_HEIGHT(self) -> int: + """minimum text height acceptable from tesseract OCR results + + if estimated text height from tesseract OCR results is lower than this value the image is + scaled up to be processed again + """ + return self._get_int("TESSERACT_MIN_TEXT_HEIGHT", 12) + + @property + def TESSERACT_MAX_TEXT_HEIGHT(self) -> int: + """maximum text height acceptable from tesseract OCR results + + if estimated text height from tesseract OCR results is higher than this value the image is + scaled down to be processed again + """ + return self._get_int("TESSERACT_MAX_TEXT_HEIGHT", 100) + + @property + def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int: + """optimum text height for tesseract OCR""" + return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20) + + @property + def TESSERACT_TEXT_HEIGHT_QUANTILE(self) -> float: + """the quantile to check for text height""" + return self._get_float("TESSERACT_TEXT_HEIGHT_QUANTILE", 0.5) @property def TT_TABLE_CONF(self) -> float: diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py index d4885296..4a68e3d2 100644 --- a/unstructured_inference/models/tables.py +++ b/unstructured_inference/models/tables.py @@ -7,6 +7,7 @@ from pathlib import Path from typing import List, Optional, Union +import cv2 import numpy as np import pandas as pd import pytesseract @@ -17,6 +18,9 @@ from unstructured_inference.config import inference_config from unstructured_inference.logger import logger from unstructured_inference.models.table_postprocess 
import Rect +from unstructured_inference.models.tesseract import ( + TESSERACT_TEXT_HEIGHT, +) from unstructured_inference.models.unstructuredmodel import UnstructuredModel from unstructured_inference.utils import pad_image_with_background_color @@ -79,23 +83,45 @@ def get_tokens(self, x: Image): ymax = max([i[1] for i in line[0]]) tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]}) else: + zoom = 1 + logger.info("Processing table OCR with tesseract...") ocr_df: pd.DataFrame = pytesseract.image_to_data( x, output_type="data.frame", ) - ocr_df = ocr_df.dropna() + # tesseract performance degrades when the text height is out of the preferred zone so we + # zoom the image (in or out depending on estimated text height) for optimum OCR results + # but this needs to be evaluated based on actual use case as the optimum scaling also + # depend on type of characters (font, language, etc); be careful about this + # functionality + text_height = ocr_df[TESSERACT_TEXT_HEIGHT].quantile( + inference_config.TESSERACT_TEXT_HEIGHT_QUANTILE, + ) + if ( + text_height < inference_config.TESSERACT_MIN_TEXT_HEIGHT + or text_height > inference_config.TESSERACT_MAX_TEXT_HEIGHT + ): + # rounding avoids unnecessary precision and potential numerical issues assocaited + # with numbers very close to 1 inside cv2 image processing + zoom = np.round(inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1) + ocr_df = pytesseract.image_to_data( + zoom_image(x, zoom), + output_type="data.frame", + ) + ocr_df = ocr_df.dropna() + tokens = [] for idtx in ocr_df.itertuples(): tokens.append( { "bbox": [ - idtx.left, - idtx.top, - idtx.left + idtx.width, - idtx.top + idtx.height, + idtx.left / zoom, + idtx.top / zoom, + (idtx.left + idtx.width) / zoom, + (idtx.top + idtx.height) / zoom, ], "text": idtx.text, }, @@ -688,3 +714,21 @@ def cells_to_html(cells): tcell.text = cell["cell text"] return str(ET.tostring(table, encoding="unicode", short_empty_elements=False)) + + +def zoom_image(image: Image, zoom: float) -> Image: + """scale an image based on the zoom factor using cv2; the scaled image is post processed by + dilation then erosion to improve edge sharpness for OCR tasks""" + new_image = cv2.resize( + cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR), + None, + fx=zoom, + fy=zoom, + interpolation=cv2.INTER_CUBIC, + ) + + kernel = np.ones((1, 1), np.uint8) + new_image = cv2.dilate(new_image, kernel, iterations=1) + new_image = cv2.erode(new_image, kernel, iterations=1) + + return Image.fromarray(new_image) diff --git a/unstructured_inference/models/tesseract.py b/unstructured_inference/models/tesseract.py index 56bf8e5a..e6f599cc 100644 --- a/unstructured_inference/models/tesseract.py +++ b/unstructured_inference/models/tesseract.py @@ -16,6 +16,10 @@ os.environ["OMP_THREAD_LIMIT"] = "1" +# this field is defined by pytesseract/unstructured.pytesseract +TESSERACT_TEXT_HEIGHT = "height" + + def load_agent(languages: str = "eng"): """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once. From f4236c8b271a4d0bca3a9411a405001b64f604b6 Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Tue, 26 Sep 2023 11:05:15 -0600 Subject: [PATCH 08/11] Fix/pdf miner source property (#228) This PR adds three possible values for `source` field: * `pdfminer` as source for elements directly obtained from PDFs. * `OCR-tesseract` and `OCR-paddle` for elements obtained with the respective OCR engines. 
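For example, a caller could use these tags to filter a parsed page by provenance. The sketch below is illustrative only; the `from_file` arguments simply mirror the new `test_ocr_source` test added in this PR, and the filtering lines are example usage rather than code from this change:

```python
# Illustrative usage only: split page elements by where they came from.
from unstructured_inference.constants import Source
from unstructured_inference.inference.layout import DocumentLayout
from unstructured_inference.models.base import get_model

doc = DocumentLayout.from_file(
    "sample-docs/loremipsum-flat.pdf",
    get_model("yolox_tiny"),
    ocr_strategy="force",
)
elements = doc.pages[0].elements
pdfminer_elements = [el for el in elements if el.source == Source.PDFMINER]
ocr_elements = [
    el for el in elements if el.source in (Source.OCR_TESSERACT, Source.OCR_PADDLE)
]
```
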
All those new values are stored in a new class `Source` in unstructured_inference>constants.py This would help users filter certain elements depending on how were obtained. --- CHANGELOG.md | 5 +++++ .../inference/test_layout.py | 17 ++++++++++++++++- .../inference/test_layout_element.py | 4 ++-- unstructured_inference/__version__.py | 2 +- unstructured_inference/constants.py | 10 ++++++++++ unstructured_inference/inference/elements.py | 3 ++- unstructured_inference/inference/layout.py | 15 +++++++++++---- .../inference/layoutelement.py | 16 +++++++++++----- unstructured_inference/models/detectron2onnx.py | 3 ++- unstructured_inference/models/yolox.py | 3 ++- 10 files changed, 62 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 49b29b4f..7748867f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.6.5-dev0 + +* Fix `source` property for elements generated by pdfminer. +* Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR. + ## 0.6.4 * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 42bba9a2..b2b665bd 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -10,9 +10,10 @@ from PIL import Image import unstructured_inference.models.base as models -from unstructured_inference.constants import OCRMode +from unstructured_inference.constants import OCRMode, Source from unstructured_inference.inference import elements, layout, layoutelement from unstructured_inference.models import chipper, detectron2, tesseract +from unstructured_inference.models.base import get_model from unstructured_inference.models.unstructuredmodel import ( UnstructuredElementExtractionModel, UnstructuredObjectDetectionModel, @@ -117,6 +118,19 @@ def detect(self, *args): assert elements.ocr(text_block, image=image) == "" +def test_ocr_source(): + file = "sample-docs/loremipsum-flat.pdf" + model = get_model("yolox_tiny") + doc = layout.DocumentLayout.from_file( + file, + model, + ocr_mode=OCRMode.FULL_PAGE.value, + supplement_with_ocr_elements=True, + ocr_strategy="force", + ) + assert Source.OCR_TESSERACT in {e.source for e in doc.pages[0].elements} + + class MockLayoutModel: def __init__(self, layout): self.layout_return = layout @@ -678,6 +692,7 @@ def test_ocr_image(region, objects, ocr_strategy, expected): @pytest.mark.parametrize("filename", ["loremipsum.pdf", "IRS-form-1987.pdf"]) def test_load_pdf(filename): layouts, images = layout.load_pdf(f"sample-docs/{filename}") + assert Source.PDFMINER in {e.source for e in layouts[0]} assert len(layouts) for lo in layouts: assert len(lo) diff --git a/test_unstructured_inference/inference/test_layout_element.py b/test_unstructured_inference/inference/test_layout_element.py index 59727b62..0991a364 100644 --- a/test_unstructured_inference/inference/test_layout_element.py +++ b/test_unstructured_inference/inference/test_layout_element.py @@ -2,7 +2,7 @@ from layoutparser.elements import TextBlock from layoutparser.elements.layout_elements import Rectangle as LPRectangle -from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR, Source from unstructured_inference.inference.elements import TextRegion from 
unstructured_inference.inference.layoutelement import ( LayoutElement, @@ -166,7 +166,7 @@ def test_layout_element_from_lp_textblock(): 300, 300, text="Sample Text", - source="detectron2_lp", + source=Source.DETECTRON2_LP, type="Text", prob=0.99, ) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 4bf914ad..530da768 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.4" # pragma: no cover +__version__ = "0.6.5-dev0" # pragma: no cover diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index c6c20299..78c46379 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -11,5 +11,15 @@ class AnnotationResult(Enum): PLOT = "plot" +class Source(Enum): + YOLOX = "yolox" + DETECTRON2_ONNX = "detectron2_onnx" + DETECTRON2_LP = "detectron2_lp" + OCR_TESSERACT = "OCR-tesseract" + OCR_PADDLE = "OCR-paddle" + PDFMINER = "pdfminer" + MERGED = "merged" + + SUBREGION_THRESHOLD_FOR_OCR = 0.5 FULL_PAGE_REGION_THRESHOLD = 0.99 diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py index 1b965cda..67f78216 100644 --- a/unstructured_inference/inference/elements.py +++ b/unstructured_inference/inference/elements.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from unstructured_inference.config import inference_config +from unstructured_inference.constants import Source from unstructured_inference.logger import logger from unstructured_inference.math import safe_division from unstructured_inference.models import tesseract @@ -197,7 +198,7 @@ def intersections(*rects: Rectangle): @dataclass class TextRegion(Rectangle): text: Optional[str] = None - source: Optional[str] = None + source: Optional[Source] = None def __str__(self) -> str: return str(self.text) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 2cdba3bc..1ef60c30 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -13,7 +13,7 @@ from PIL import Image, ImageSequence from pytesseract import Output -from unstructured_inference.constants import OCRMode +from unstructured_inference.constants import OCRMode, Source from unstructured_inference.inference.elements import ( EmbeddedTextRegion, ImageTextRegion, @@ -677,7 +677,14 @@ def load_pdf( else: continue - text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text) + text_region = element_class( + x1 * coef, + y1 * coef, + x2 * coef, + y2 * coef, + text=_text, + source=Source.PDFMINER, + ) if text_region.area > 0: layout.append(text_region) @@ -738,7 +745,7 @@ def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]: (x1, y1, x2, y2) = l, t, l + w, t + h text = ocr_data["text"][i] if text: - text_region = TextRegion(x1, y1, x2, y2, text=text, source="OCR") + text_region = TextRegion(x1, y1, x2, y2, text=text, source=Source.OCR_TESSERACT) text_regions.append(text_region) return text_regions @@ -774,7 +781,7 @@ def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]: y2 = max([i[1] for i in line[0]]) text = line[1][0] if text: - text_region = TextRegion(x1, y1, x2, y2, text) + text_region = TextRegion(x1, y1, x2, y2, text, source=Source.OCR_PADDLE) text_regions.append(text_region) return text_regions diff --git a/unstructured_inference/inference/layoutelement.py 
b/unstructured_inference/inference/layoutelement.py index 887ecc33..f909c93a 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -9,7 +9,11 @@ from PIL import Image from unstructured_inference.config import inference_config -from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR +from unstructured_inference.constants import ( + FULL_PAGE_REGION_THRESHOLD, + SUBREGION_THRESHOLD_FOR_OCR, + Source, +) from unstructured_inference.inference.elements import ( ImageTextRegion, Rectangle, @@ -74,7 +78,7 @@ def from_lp_textblock(cls, textblock: TextBlock): text = textblock.text type = textblock.type prob = textblock.score - return cls(x1, y1, x2, y2, text=text, source="detectron2_lp", type=type, prob=prob) + return cls(x1, y1, x2, y2, text=text, source=Source.DETECTRON2_LP, type=type, prob=prob) def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str: @@ -311,8 +315,10 @@ def merge_text_regions(regions: List[TextRegion]) -> TextRegion: merged_text = " ".join([tr.text for tr in regions if tr.text]) sources = [*{tr.source for tr in regions}] - source = sources.pop() if len(sources) == 1 else "merged:".join(sources) # type:ignore - return TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) + source = sources.pop() if len(sources) == 1 else Source.MERGED + element = TextRegion(min_x1, min_y1, max_x2, max_y2, source=source, text=merged_text) + setattr(element, "merged_sources", sources) + return element def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutElement]: @@ -332,7 +338,7 @@ def get_elements_from_ocr_regions(ocr_regions: List[TextRegion]) -> List[LayoutE r.x2, r.y2, text=r.text, - source=None, + source=r.source, type="UncategorizedText", ) for r in merged_regions diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py index 7b9df081..e8b75d1c 100644 --- a/unstructured_inference/models/detectron2onnx.py +++ b/unstructured_inference/models/detectron2onnx.py @@ -9,6 +9,7 @@ from onnxruntime.quantization import QuantType, quantize_dynamic from PIL import Image +from unstructured_inference.constants import Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger, logger_onnx from unstructured_inference.models.unstructuredmodel import ( @@ -158,7 +159,7 @@ def postprocess( text=None, type=detected_class, prob=conf, - source="detectron2_onnx", + source=Source.DETECTRON2_ONNX, ) regions.append(region) diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index f5103698..e239a9b6 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -14,6 +14,7 @@ from onnxruntime.quantization import QuantType, quantize_dynamic from PIL import Image +from unstructured_inference.constants import Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel @@ -149,7 +150,7 @@ def image_processing( text=None, type=detected_class, prob=prob, - source="yolox", + source=Source.YOLOX, ) regions.append(region) From 00b493631bb9b20f2f8ee6b043577c0f02372e7d Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 26 Sep 2023 11:26:04 -0700 Subject: [PATCH 09/11] Feat/219 keep 
extracted image elements (#225) update `merge_inferred_layout_with_extracted_layout` to keep extracted image elements --- CHANGELOG.md | 1 + examples/image-extraction/requirements.txt | 1 - .../README.md | 0 .../embedded-image-extraction.py | 0 examples/image_extraction/requirements.txt | 3 +++ unstructured_inference/inference/layout.py | 2 +- .../inference/layoutelement.py | 23 ++++++++++++++----- 7 files changed, 22 insertions(+), 8 deletions(-) delete mode 100644 examples/image-extraction/requirements.txt rename examples/{image-extraction => image_extraction}/README.md (100%) rename examples/{image-extraction => image_extraction}/embedded-image-extraction.py (100%) create mode 100644 examples/image_extraction/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index 7748867f..da608a15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## 0.6.4 +* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task * add the new image auto scaling parameters to `config.py` diff --git a/examples/image-extraction/requirements.txt b/examples/image-extraction/requirements.txt deleted file mode 100644 index 0d7e9b7d..00000000 --- a/examples/image-extraction/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unstructured-inference \ No newline at end of file diff --git a/examples/image-extraction/README.md b/examples/image_extraction/README.md similarity index 100% rename from examples/image-extraction/README.md rename to examples/image_extraction/README.md diff --git a/examples/image-extraction/embedded-image-extraction.py b/examples/image_extraction/embedded-image-extraction.py similarity index 100% rename from examples/image-extraction/embedded-image-extraction.py rename to examples/image_extraction/embedded-image-extraction.py diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt new file mode 100644 index 00000000..351b4120 --- /dev/null +++ b/examples/image_extraction/requirements.txt @@ -0,0 +1,3 @@ +unstructured-inference +pymupdf +pypdf2 \ No newline at end of file diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 1ef60c30..d1fb11c3 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -441,7 +441,7 @@ def annotate( width = style["width"] for region in getattr(self, attribute): if isinstance(region, Rectangle): - required_source = getattr(el, "source", None) + required_source = getattr(region, "source", None) if "all" in sources or required_source in sources: img = draw_bbox( img, diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index f909c93a..f3f3343f 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -148,13 +148,24 @@ def merge_inferred_layout_with_extracted_layout( ) if same_bbox: # Looks like these represent the same region - grow_region_to_match_region(inferred_region, extracted_region) - inferred_region.text = extracted_region.text - region_matched = True - elif extracted_is_subregion_of_inferred and inferred_is_text and extracted_is_image: - grow_region_to_match_region(inferred_region, extracted_region) - region_matched = True + if extracted_is_image: + # keep extracted region, remove inferred region + 
inferred_regions_to_remove.append(inferred_region) + else: + # keep inferred region, remove extracted region + grow_region_to_match_region(inferred_region, extracted_region) + inferred_region.text = extracted_region.text + region_matched = True + elif extracted_is_subregion_of_inferred and inferred_is_text: + if extracted_is_image: + # keep both extracted and inferred regions + region_matched = False + else: + # keep inferred region, remove extracted region + grow_region_to_match_region(inferred_region, extracted_region) + region_matched = True elif either_region_is_subregion_of_other and inferred_region.type != "Table": + # keep extracted region, remove inferred region inferred_regions_to_remove.append(inferred_region) if not region_matched: extracted_elements_to_add.append(extracted_region) From 12ca9d91c6b1a066d94fc2858b904a62bf3ee814 Mon Sep 17 00:00:00 2001 From: Christine Straub Date: Tue, 26 Sep 2023 17:37:23 -0700 Subject: [PATCH 10/11] chore: changelog fix, cut release 0.6.5 (#230) --- CHANGELOG.md | 4 ++-- unstructured_inference/__version__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da608a15..96a632f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ -## 0.6.5-dev0 +## 0.6.5 +* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * Fix `source` property for elements generated by pdfminer. * Add 'OCR-tesseract' and 'OCR-paddle' as sources for elements generated by OCR. ## 0.6.4 -* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task * add the new image auto scaling parameters to `config.py` diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 530da768..36225a4c 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5-dev0" # pragma: no cover +__version__ = "0.6.5" # pragma: no cover From cf15726a99db843bc6dcab4849577eca51d08af3 Mon Sep 17 00:00:00 2001 From: Yuming Long <63475068+yuming-long@users.noreply.github.com> Date: Wed, 27 Sep 2023 19:46:19 -0400 Subject: [PATCH 11/11] chore: stop passing language code from tesseract mapping to paddle (#226) ### Summary A user is flagging the assertion error for paddle language code: ``` AssertionError: param lang must in dict_keys(['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'latin', 'arabic', 'cyrillic', 'devanagari']), but got eng ``` and tried setting the `ocr_languages` param to 'en' (the correct lang code for english in paddle) but also didn't work. The reason is that the `ocr_languages` uses the mapping for tesseract code which will convert `en` to `eng` since thats the correct lang code for english in tesseract. The quick workaround here is stop passing the lang code to paddle and let it use default `en`, and this will be addressed once we have the lang code mapping for paddle. 
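As a rough illustration of what that future fix could look like (the entries below are hypothetical examples, not a vetted or complete table):

```python
# Hypothetical tesseract -> paddle language-code mapping; entries are examples
# only and would need to be verified against both engines' supported codes.
TESSERACT_TO_PADDLE_LANG = {
    "eng": "en",
    "chi_sim": "ch",
    "chi_tra": "chinese_cht",
    "kor": "korean",
    "jpn": "japan",
    "ara": "arabic",
}


def to_paddle_lang(tesseract_code: str, default: str = "en") -> str:
    """Map a tesseract language code to a paddle code, falling back to English."""
    return TESSERACT_TO_PADDLE_LANG.get(tesseract_code, default)
```
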
### Test looks like user used this branch and got the lang parameter working from [linked comments](https://github.com/Unstructured-IO/unstructured-api/issues/247#issuecomment-1731923667) :) on api repo: ``` pip install paddlepaddle pip install "unstructured.PaddleOCR" export ENTIRE_PAGE_OCR=paddle make run-web-app ``` * check error before this change: ``` curl -X 'POST' 'http://localhost:8000/general/v0/general' -H 'accept: application/json' -F 'files=@sample-docs/english-and-korean.png' -F 'ocr_languages=en' | jq -C . | less -R ``` will see the error: ``` { "detail": "param lang must in dict_keys(['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'latin', 'arabic', 'cyrillic', 'devanagari']), but got eng" } ``` also in logger you will see `INFO Loading paddle with CPU on language=eng...` since tesseract mapping converts `en` to `eng`. * check after this change: Checkout to this branch and install inference repo into your env (the same env thats running api) with `pip install -e .` Rerun `make run-web-app` Run the curl command again, you won't get the result on m1 chip since paddle doesn't work on it but from the logger info you can see `2023-09-27 12:48:48,120 unstructured_inference INFO Loading paddle with CPU on language=en...`, which means the lang parameter is using default `en` (logger info is coming from [this line](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/paddle_ocr.py#L22)). --------- Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com> --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- unstructured_inference/inference/layout.py | 7 +++---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96a632f3..2277974d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.6 + +* Stop passing ocr_languages parameter into paddle to avoid invalid paddle language code error, this will be fixed until +we have the mapping from standard language code to paddle language code. ## 0.6.5 * Add functionality to keep extracted image elements while merging inferred layout with extracted layout diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 36225a4c..37b46218 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.6.5" # pragma: no cover +__version__ = "0.6.6" # pragma: no cover diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index d1fb11c3..447e7154 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -275,12 +275,11 @@ def get_elements_with_detection_model( ) if entrie_page_ocr == "paddle": - logger.info("Processing entrie page OCR with paddle...") + logger.info("Processing entire page OCR with paddle...") from unstructured_inference.models import paddle_ocr - # TODO(yuming): paddle only support one language at once, - # change ocr to tesseract if passed in multilanguages. - ocr_data = paddle_ocr.load_agent(language=self.ocr_languages).ocr( + # TODO(yuming): pass ocr language to paddle when we have language mapping for paddle + ocr_data = paddle_ocr.load_agent().ocr( np.array(self.image), cls=True, )
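
Taken together, the knobs introduced across this release series are all read from the environment by `inference_config`, so a deployment can tune them without code changes. A minimal sketch, with arbitrary example values:

```python
import os

from unstructured_inference.config import inference_config

# Illustrative only: each config property reads os.environ when it is accessed,
# so values set any time before the first read take effect.
os.environ["TABLE_IMAGE_CROP_PAD"] = "20"        # extra pixels kept around detected tables
os.environ["TT_TABLE_CONF"] = "0.6"              # table transformer "table" confidence threshold
os.environ["TESSERACT_MIN_TEXT_HEIGHT"] = "15"   # below this, table crops are scaled up before OCR

assert inference_config.TABLE_IMAGE_CROP_PAD == 20
assert inference_config.TT_TABLE_CONF == 0.6
assert inference_config.TESSERACT_MIN_TEXT_HEIGHT == 15
```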