Feat/219 keep extracted image elements (#225)

update `merge_inferred_layout_with_extracted_layout` to keep extracted image elements
Unstructured-IO · Sep 26, 2023 · 00b4936 · 00b4936
1 parent f4236c8
commit 00b4936
Show file tree

Hide file tree

Showing 7 changed files with 22 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 
 ## 0.6.4
 
+* Add functionality to keep extracted image elements when merging the inferred layout with the extracted layout
 * add a function to automatically scale table crop images based on text height so that the text height is optimal for the `tesseract` OCR task
 * add the new image auto scaling parameters to `config.py`
 

diff --git a/examples/image-extraction/requirements.txt b/examples/image-extraction/requirements.txt
diff --git a/examples/image-extraction/README.md → examples/image_extraction/README.md b/examples/image-extraction/README.md → examples/image_extraction/README.md
diff --git a/...e-extraction/embedded-image-extraction.py → ...e_extraction/embedded-image-extraction.py b/...e-extraction/embedded-image-extraction.py → ...e_extraction/embedded-image-extraction.py
diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt
@@ -0,0 +1,3 @@
+unstructured-inference
+pymupdf
+pypdf2
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -441,7 +441,7 @@ def annotate(
                     width = style["width"]
                     for region in getattr(self, attribute):
                         if isinstance(region, Rectangle):
-                            required_source = getattr(el, "source", None)
+                            required_source = getattr(region, "source", None)
                             if "all" in sources or required_source in sources:
                                 img = draw_bbox(
                                     img,

diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -148,13 +148,24 @@ def merge_inferred_layout_with_extracted_layout(
                 )
                 if same_bbox:
                     # Looks like these represent the same region
-                    grow_region_to_match_region(inferred_region, extracted_region)
-                    inferred_region.text = extracted_region.text
-                    region_matched = True
-                elif extracted_is_subregion_of_inferred and inferred_is_text and extracted_is_image:
-                    grow_region_to_match_region(inferred_region, extracted_region)
-                    region_matched = True
+                    if extracted_is_image:
+                        # keep extracted region, remove inferred region
+                        inferred_regions_to_remove.append(inferred_region)
+                    else:
+                        # keep inferred region, remove extracted region
+                        grow_region_to_match_region(inferred_region, extracted_region)
+                        inferred_region.text = extracted_region.text
+                        region_matched = True
+                elif extracted_is_subregion_of_inferred and inferred_is_text:
+                    if extracted_is_image:
+                        # keep both extracted and inferred regions
+                        region_matched = False
+                    else:
+                        # keep inferred region, remove extracted region
+                        grow_region_to_match_region(inferred_region, extracted_region)
+                        region_matched = True
                 elif either_region_is_subregion_of_other and inferred_region.type != "Table":
+                    # keep extracted region, remove inferred region
                     inferred_regions_to_remove.append(inferred_region)
         if not region_matched:
             extracted_elements_to_add.append(extracted_region)