Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: method to catch and classify overlapping bounding boxes #1803

Merged
merged 27 commits into from
Oct 25, 2023
Merged
Changes from 4 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
174d983
feat: method to catch and classify overlapping
LaverdeS Oct 19, 2023
1ff28cd
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 20, 2023
37a1077
feat: catching 6 cases of overlapping bboxes
LaverdeS Oct 20, 2023
cd97282
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 20, 2023
ac406d2
chore: tidy
LaverdeS Oct 21, 2023
c7b36cc
chore: shorten catch_overlapping_and_nested_bboxes method
LaverdeS Oct 23, 2023
dac2e42
chore: typo in docstrings
LaverdeS Oct 23, 2023
ca07bf3
fix: max ngram n in calculate_largest_ngram_percentage
LaverdeS Oct 23, 2023
85fd0fc
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 23, 2023
a580ab1
chore: add typing
LaverdeS Oct 23, 2023
d8c4f44
chore: add testing for case with and without overlap
LaverdeS Oct 23, 2023
b85225d
chore: update CHANGELOG
LaverdeS Oct 23, 2023
d95fa0b
chore: tidy
LaverdeS Oct 23, 2023
78057ae
chore: better logic for is_parent_box
LaverdeS Oct 23, 2023
7d69d9b
chore: tidier lines
LaverdeS Oct 23, 2023
6c6317a
chore: tidy
LaverdeS Oct 23, 2023
d234fd3
chore: tidy
LaverdeS Oct 23, 2023
a700881
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 23, 2023
a74d158
chore: refactor ngrams() method
LaverdeS Oct 23, 2023
26cf06c
chore: tidy
LaverdeS Oct 23, 2023
32dcc41
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 24, 2023
c820900
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 24, 2023
6f6eca7
feat: add nested_error_tolerance_px and sm_overlap_threshold params
LaverdeS Oct 24, 2023
0dba67f
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 24, 2023
861e496
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 24, 2023
2abfcc1
chore: more tests to cover all cases and use params
LaverdeS Oct 24, 2023
5c0c6d5
Merge branch 'main' into sebastian/catch_overlapping_bboxes
LaverdeS Oct 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 278 additions & 0 deletions unstructured/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
from datetime import datetime
from functools import wraps
from itertools import combinations
from typing import (
Any,
Callable,
Expand Down Expand Up @@ -280,3 +281,280 @@ def scarf_analytics():
)
except Exception:
pass


def ngrams(s, n):
    """Return every run of ``n`` consecutive items of ``s`` as a tuple, in order.

    ``s`` can be any sliceable sequence: a string yields character n-grams and a
    list of words yields word n-grams. The result is empty when ``n`` is larger
    than ``len(s)``.
    """

    window_starts = range(len(s) - n + 1)
    return [tuple(s[start : start + n]) for start in window_starts]


def calculate_shared_ngram_percentage(string_A, string_B, n):
    """Calculate the percentage of word n-grams of string_A that also occur in string_B.

    Both strings are split on whitespace and turned into word n-grams of size ``n``.
    The percentage is the number of *distinct* shared n-grams divided by the total
    number of n-grams in string_A (duplicates included), times 100.

    Returns:
        A ``(percentage, common_ngrams)`` tuple, where ``common_ngrams`` is the set of
        n-gram tuples present in both strings. When string_A yields no n-grams the
        result is ``(0, set())``.
    """

    string_A_ngrams = ngrams(string_A.split(), n)

    if not string_A_ngrams:
        # Keep the same (percentage, shared n-grams) shape as the normal path so that
        # callers unpacking the result do not fail on a bare number.
        return 0, set()

    common_ngrams = set(string_A_ngrams) & set(ngrams(string_B.split(), n))
    percentage = (len(common_ngrams) / len(string_A_ngrams)) * 100
    return percentage, common_ngrams


def calculate_largest_ngram_percentage(string_A, string_B):
    """Find the largest word n-gram size at which the two strings share any n-gram.

    The string with fewer words is used as the reference (on a tie, string_B is).
    Starting at ``n = (reference word count) - 1`` and decreasing down to 1,
    ``calculate_shared_ngram_percentage`` is evaluated until it reports a non-zero
    percentage.

    Returns:
        A ``(percentage, shared_ngrams, n_str)`` tuple: the percentage rounded to two
        decimals, the set of shared n-grams, and the n-gram size that produced the
        match as a string. When no n-gram of size >= 1 is shared the result is
        ``(0.0, set(), "0")``.
    """

    words_A = string_A.split()
    words_B = string_B.split()
    if len(words_A) < len(words_B):
        largest_n = len(words_A) - 1
    else:
        largest_n = len(words_B) - 1
        string_A, string_B = string_B, string_A

    # Stop at n == 1: for n == 0 every window is the empty tuple, which would always
    # "match" and report shared text even when the strings have no word in common.
    for n in range(largest_n, 0, -1):
        ngram_percentage, shared_ngrams = calculate_shared_ngram_percentage(
            string_A,
            string_B,
            n,
        )
        if ngram_percentage:
            # Report the size that actually matched, not the size the search started at.
            return round(ngram_percentage, 2), shared_ngrams, str(n)

    return 0.0, set(), "0"


def is_parent_box(
    parent_target,
    child_target,
    add=0,
) -> bool:
    """Tell whether child_target lies inside parent_target.

    parent_target must hold four numbers, documented as
    [x_bottom_left, y_bottom_left, x_top_right, y_top_right]; any other length
    yields False. child_target is either a box in the same format or a 2-item
    (x, y) point. 'add' is a pixel tolerance: when non-zero, the parent region is
    grown by that amount on every side before the comparison. The caller's
    parent_target is not modified."""

    if len(parent_target) != 4:
        return False

    x_min, y_min, x_max, y_max = parent_target
    if add:
        x_min, y_min = x_min - add, y_min - add
        x_max, y_max = x_max + add, y_max + add

    child_size = len(child_target)
    if child_size == 4:
        starts_inside = child_target[0] >= x_min and child_target[1] >= y_min
        ends_inside = child_target[2] <= x_max and child_target[3] <= y_max
        return bool(starts_inside and ends_inside)

    if child_size == 2:  # Needed for polygon regions, this might need revision
        x_inside = x_min <= child_target[0] <= x_max
        y_inside = y_min <= child_target[1] <= y_max
        return bool(x_inside and y_inside)

    return False


def calculate_overlap_percentage(box1, box2, intersection_ratio_method="total"):
    """Calculate how much two axis-aligned boxes overlap, as a percentage.

    Each box is a sequence of (x, y) corner points; only ``box[0]`` and ``box[2]``
    are read, and they are treated as the two opposite corners
    (x_bottom_left, y_bottom_left) and (x_top_right, y_top_right).

    intersection_ratio_method selects the denominator of the ratio:
        "parent":  area of the larger box.
        "partial": area of the smaller box.
        anything else (default "total"): area of the union of both boxes.

    Returns:
        A ``(overlap_percentage, max_area, min_area, total_area)`` tuple. The
        percentage is rounded to two decimals; ``total_area`` is the plain sum of
        both box areas. When the selected denominator is zero the percentage is 0
        and the areas are still returned.
    """
    x1, y1 = box1[0]
    x2, y2 = box1[2]
    x3, y3 = box2[0]
    x4, y4 = box2[2]
    area_box1 = (x2 - x1) * (y2 - y1)
    area_box2 = (x4 - x3) * (y4 - y3)
    x_intersection1 = max(x1, x3)
    y_intersection1 = max(y1, y3)
    x_intersection2 = min(x2, x4)
    y_intersection2 = min(y2, y4)
    # Clamp each side at 0 so disjoint boxes give an empty intersection.
    intersection_area = max(0, x_intersection2 - x_intersection1) * max(
        0,
        y_intersection2 - y_intersection1,
    )
    max_area = max(area_box1, area_box2)
    min_area = min(area_box1, area_box2)
    total_area = area_box1 + area_box2

    if intersection_ratio_method == "parent":
        zero_denominator = max_area == 0
        denominator = max_area
    elif intersection_ratio_method == "partial":
        zero_denominator = min_area == 0
        denominator = min_area
    else:
        zero_denominator = total_area == 0
        denominator = total_area - intersection_area

    if zero_denominator:
        # Degenerate boxes: keep the 4-tuple shape that callers unpack instead of
        # returning a bare number.
        return 0, max_area, min_area, total_area

    overlap_percentage = (intersection_area / denominator) * 100
    return round(overlap_percentage, 2), max_area, min_area, total_area


def catch_overlapping_bboxes(
    elements,
    nested_error_tolerance_px=5,
    sm_overlap_threshold=10.0,
) -> tuple:
    """Catch overlapping and nested bounding boxes cases across a list of elements.

    Elements are grouped by page and every pair of elements on the same page is
    compared. Each pair whose boxes overlap is classified as one of:
    nested, small partial overlap, partial overlap with empty content, partial
    overlap with duplicate text, partial overlap without sharing text, or partial
    overlap sharing a percentage of the text.

    Args:
        elements: sequence of objects exposing ``category``, ``text``,
            ``metadata.page_number`` (1-based) and
            ``metadata.coordinates.to_dict()["points"]``. Only ``points[0]`` and
            ``points[2]`` are read, as two opposite (x, y) corners of the box.
        nested_error_tolerance_px: pixels by which a box may stick out of another
            one and still be reported as nested in it.
        sm_overlap_threshold: partial overlaps whose percentage (relative to the
            smaller box) is below this value are reported as "Small partial overlap"
            without looking at the text.

    Returns:
        ``(overlapping_flag, overlapping_cases)``: whether any overlap was found, and
        one report dict per overlapping pair, in page order.
    """

    if not elements:
        return False, []

    # Size the per-page buckets from the highest page number rather than trusting the
    # last element to be on the last page.
    num_pages = max(element.metadata.page_number for element in elements)
    pages = [[] for _ in range(num_pages)]
    for ix, element in enumerate(elements):
        # Keep index and category as separate fields so they never have to be parsed
        # back out of a formatted label.
        pages[element.metadata.page_number - 1].append(
            (
                ix,
                f"{element.category}".strip(),
                element.metadata.coordinates.to_dict()["points"],
                element.text,
            ),
        )

    overlapping_cases = []
    for page_records in pages:
        for first, second in combinations(page_records, 2):
            case = _classify_bbox_pair(
                first,
                second,
                nested_error_tolerance_px,
                sm_overlap_threshold,
            )
            if case is not None:
                overlapping_cases.append(case)

    return bool(overlapping_cases), overlapping_cases


def _classify_bbox_pair(first, second, nested_error_tolerance_px, sm_overlap_threshold):
    """Build the report dict for one pair of same-page records, or None if they do not overlap."""

    ix1, type1, box1, text1 = first
    ix2, type2, box2, text2 = second
    x_bottom_left_1, y_bottom_left_1 = box1[0]
    x_top_right_1, y_top_right_1 = box1[2]
    x_bottom_left_2, y_bottom_left_2 = box2[0]
    x_top_right_2, y_top_right_2 = box2[2]

    horizontal_overlap = x_bottom_left_1 < x_top_right_2 and x_top_right_1 > x_bottom_left_2
    vertical_overlap = y_bottom_left_1 < y_top_right_2 and y_top_right_1 > y_bottom_left_2
    if not (horizontal_overlap and vertical_overlap):
        return None

    label1 = f"{type1}(ix={ix1})"
    label2 = f"{type2}(ix={ix2})"
    box1_corners = [x_bottom_left_1, y_bottom_left_1, x_top_right_1, y_top_right_1]
    box2_corners = [x_bottom_left_2, y_bottom_left_2, x_top_right_2, y_top_right_2]
    # The areas do not depend on the ratio method, so this call also provides them.
    overlap_percentage_total, max_area, min_area, total_area = calculate_overlap_percentage(
        box1,
        box2,
        intersection_ratio_method="total",
    )
    largest_ngram_percentage = None

    if is_parent_box(box1_corners, box2_corners, add=nested_error_tolerance_px):
        overlapping_elements = [label1, label2]
        overlapping_case = f"nested {type2} in {type1}"
        overlap_percentage = 100

    elif is_parent_box(box2_corners, box1_corners, add=nested_error_tolerance_px):
        # The containing element is listed first.
        overlapping_elements = [label2, label1]
        overlapping_case = f"nested {type1} in {type2}"
        overlap_percentage = 100

    else:
        overlap_percentage, max_area, min_area, total_area = calculate_overlap_percentage(
            box1,
            box2,
            intersection_ratio_method="partial",
        )
        (
            overlapping_elements,
            overlapping_case,
            largest_ngram_percentage,
        ) = _classify_partial_overlap(
            (label1, type1, text1),
            (label2, type2, text2),
            overlap_percentage,
            sm_overlap_threshold,
        )

    return {
        "overlapping_elements": overlapping_elements,
        "overlapping_case": overlapping_case,
        "overlap_percentage": f"{overlap_percentage}%",
        "metadata": {
            "largest_ngram_percentage": largest_ngram_percentage,
            "overlap_percentage_total": f"{overlap_percentage_total}%",
            "max_area": f"{round(max_area, 2)}pxˆ2",
            "min_area": f"{round(min_area, 2)}pxˆ2",
            "total_area": f"{round(total_area, 2)}pxˆ2",
        },
    }


def _classify_partial_overlap(first, second, overlap_percentage, sm_overlap_threshold):
    """Name a non-nested overlap; return (elements, case, largest_ngram_percentage)."""

    label1, type1, text1 = first
    label2, type2, text2 = second

    if overlap_percentage < sm_overlap_threshold:
        return [label1, label2], "Small partial overlap", None

    if not text1:
        return [label1, label2], f"partial overlap with empty content in {type1}", None

    if not text2:
        # The element with the empty text is listed first.
        return [label2, label1], f"partial overlap with empty content in {type2}", None

    if text1 in text2 or text2 in text1:
        return [label1, label2], "partial overlap with duplicate text", None

    largest_ngram_percentage, _, largest_n = calculate_largest_ngram_percentage(text1, text2)
    if not largest_ngram_percentage:
        return (
            [label1, label2],
            "partial overlap without sharing text",
            largest_ngram_percentage,
        )

    # The percentage is relative to the text with fewer words.
    reference_type = type1 if len(text1.split()) < len(text2.split()) else type2
    overlapping_case = (
        f"partial overlap sharing {largest_ngram_percentage}% of the text from "
        f"{reference_type} ({largest_n}-gram)"
    )
    return [label1, label2], overlapping_case, largest_ngram_percentage