Skip to content

Commit

Permalink
Feat/219 keep extracted image elements (#225)
Browse files Browse the repository at this point in the history
update `merge_inferred_layout_with_extracted_layout` to keep extracted image elements
  • Loading branch information
christinestraub authored Sep 26, 2023
1 parent f4236c8 commit 00b4936
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

## 0.6.4

* Add functionality to keep extracted image elements while merging inferred layout with extracted layout
* Add a function to automatically scale table crop images based on text height, so that the text height is optimal for the `tesseract` OCR task
* Add the new image auto-scaling parameters to `config.py`

Expand Down
1 change: 0 additions & 1 deletion examples/image-extraction/requirements.txt

This file was deleted.

File renamed without changes.
3 changes: 3 additions & 0 deletions examples/image_extraction/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
unstructured-inference
pymupdf
pypdf2
2 changes: 1 addition & 1 deletion unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def annotate(
width = style["width"]
for region in getattr(self, attribute):
if isinstance(region, Rectangle):
required_source = getattr(el, "source", None)
required_source = getattr(region, "source", None)
if "all" in sources or required_source in sources:
img = draw_bbox(
img,
Expand Down
23 changes: 17 additions & 6 deletions unstructured_inference/inference/layoutelement.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,24 @@ def merge_inferred_layout_with_extracted_layout(
)
if same_bbox:
# Looks like these represent the same region
grow_region_to_match_region(inferred_region, extracted_region)
inferred_region.text = extracted_region.text
region_matched = True
elif extracted_is_subregion_of_inferred and inferred_is_text and extracted_is_image:
grow_region_to_match_region(inferred_region, extracted_region)
region_matched = True
if extracted_is_image:
# keep extracted region, remove inferred region
inferred_regions_to_remove.append(inferred_region)
else:
# keep inferred region, remove extracted region
grow_region_to_match_region(inferred_region, extracted_region)
inferred_region.text = extracted_region.text
region_matched = True
elif extracted_is_subregion_of_inferred and inferred_is_text:
if extracted_is_image:
# keep both extracted and inferred regions
region_matched = False
else:
# keep inferred region, remove extracted region
grow_region_to_match_region(inferred_region, extracted_region)
region_matched = True
elif either_region_is_subregion_of_other and inferred_region.type != "Table":
# keep extracted region, remove inferred region
inferred_regions_to_remove.append(inferred_region)
if not region_matched:
extracted_elements_to_add.append(extracted_region)
Expand Down

0 comments on commit 00b4936

Please sign in to comment.