diff --git a/CHANGELOG.md b/CHANGELOG.md index 7748867f..da608a15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## 0.6.4 +* Add functionality to keep extracted image elements while merging inferred layout with extracted layout * add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task * add the new image auto scaling parameters to `config.py` diff --git a/examples/image-extraction/requirements.txt b/examples/image-extraction/requirements.txt deleted file mode 100644 index 0d7e9b7d..00000000 --- a/examples/image-extraction/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -unstructured-inference \ No newline at end of file diff --git a/examples/image-extraction/README.md b/examples/image_extraction/README.md similarity index 100% rename from examples/image-extraction/README.md rename to examples/image_extraction/README.md diff --git a/examples/image-extraction/embedded-image-extraction.py b/examples/image_extraction/embedded-image-extraction.py similarity index 100% rename from examples/image-extraction/embedded-image-extraction.py rename to examples/image_extraction/embedded-image-extraction.py diff --git a/examples/image_extraction/requirements.txt b/examples/image_extraction/requirements.txt new file mode 100644 index 00000000..351b4120 --- /dev/null +++ b/examples/image_extraction/requirements.txt @@ -0,0 +1,3 @@ +unstructured-inference +pymupdf +pypdf2 \ No newline at end of file diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 1ef60c30..d1fb11c3 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -441,7 +441,7 @@ def annotate( width = style["width"] for region in getattr(self, attribute): if isinstance(region, Rectangle): - required_source = getattr(el, "source", None) + required_source = getattr(region, "source", None) if "all" in sources or required_source in sources: img = draw_bbox( img, diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index f909c93a..f3f3343f 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -148,13 +148,24 @@ def merge_inferred_layout_with_extracted_layout( ) if same_bbox: # Looks like these represent the same region - grow_region_to_match_region(inferred_region, extracted_region) - inferred_region.text = extracted_region.text - region_matched = True - elif extracted_is_subregion_of_inferred and inferred_is_text and extracted_is_image: - grow_region_to_match_region(inferred_region, extracted_region) - region_matched = True + if extracted_is_image: + # keep extracted region, remove inferred region + inferred_regions_to_remove.append(inferred_region) + else: + # keep inferred region, remove extracted region + grow_region_to_match_region(inferred_region, extracted_region) + inferred_region.text = extracted_region.text + region_matched = True + elif extracted_is_subregion_of_inferred and inferred_is_text: + if extracted_is_image: + # keep both extracted and inferred regions + region_matched = False + else: + # keep inferred region, remove extracted region + grow_region_to_match_region(inferred_region, extracted_region) + region_matched = True elif either_region_is_subregion_of_other and inferred_region.type != "Table": + # keep extracted region, remove inferred region inferred_regions_to_remove.append(inferred_region) if not region_matched: extracted_elements_to_add.append(extracted_region)