fix: consolidate regex_metadata and adjust offsets
The implementation of adjusting regex-metadata match-offsets assumed
the wrong data type, so while it passed the tests, in production it
dropped all regex_metadata except that of the first element in each section.

In fairness, this never actually happened because the overchunking fixed
in the previous commit made any element that had regex matches show up
in its own single-element chunk.

Reimplement for regex-metadata of type `Dict[str, List[RegexMetadata]]`
rather than `List[RegexMetadata]`.
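
For illustration, a minimal sketch of the mismatch (hypothetical values; `RegexMetadata` is modeled here as a TypedDict with the `start`/`end` offsets the chunking code adjusts plus a `text` field, which is an assumption about the real class):

```python
from typing import Dict, List, TypedDict


class RegexMetadata(TypedDict):
    """One regex match within an element's text (fields assumed for this sketch)."""

    text: str
    start: int
    end: int


# Actual shape: matches are grouped by the name of the regex that produced them.
actual: Dict[str, List[RegexMetadata]] = {
    "mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
}

# Shape the old code assumed: a flat list of matches. Because the real value is
# a dict, the old `isinstance(value, list)` check never matched, so the
# consolidation-and-offset-adjustment branch was skipped entirely for every
# element after the first.
assumed: List[RegexMetadata] = [{"text": "MS-107", "start": 18, "end": 24}]
```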
scanny committed Oct 18, 2023
1 parent d74dd26 commit 49c4e21
Showing 2 changed files with 18 additions and 22 deletions.
9 changes: 0 additions & 9 deletions test_unstructured/chunking/test_title.py
@@ -64,10 +64,6 @@ def test_split_elements_by_title_and_table():
     ]


-@pytest.mark.xfail(reason="regex_metadata was wrong type", raises=AssertionError, strict=True)
-# -- `ElementMetadata.regex_metadata` is `Dict[str, List[RegexMetadata]]`, not `List[RegexMetadata]`
-# -- when this is fixed, this test fails by isolating a chunk for "Today is a bad day", which is
-# -- where the regex-metadata appears.
 def test_chunk_by_title():
     elements: List[Element] = [
         Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
@@ -218,11 +214,6 @@ def test_chunk_by_title_does_not_break_on_regex_metadata_change():
     ]


-@pytest.mark.xfail(
-    reason="bug: regex_metadata of second and later section elements is discarded",
-    raises=AssertionError,
-    strict=True,
-)
 def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
     """ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
31 changes: 18 additions & 13 deletions unstructured/chunking/title.py
@@ -16,7 +16,6 @@
     CompositeElement,
     Element,
     ElementMetadata,
-    RegexMetadata,
     Table,
     TableChunk,
     Text,
@@ -120,33 +119,39 @@ def chunk_by_title(
         text = ""
         metadata = first_element.metadata
         start_char = 0
-        for element in section:
+        for element_idx, element in enumerate(section):
             # -- concatenate all element text in section into `text` --
             if isinstance(element, Text):
                 # -- add a blank line between "squashed" elements --
                 text += "\n\n" if text else ""
                 start_char = len(text)
                 text += element.text

-            # -- "chunk" metadata should include union of list-items in all its elements. Also,
-            # -- metadata like regex_metadata that records start and/or end positions of related
-            # -- text need those offsets adjusted.
+            # -- "chunk" metadata should include union of list-items in all its elements --
             for attr, value in vars(element.metadata).items():
                 if isinstance(value, list):
                     value = cast(List[Any], value)
                     # -- get existing (list) value from chunk_metadata --
                     _value = getattr(metadata, attr, []) or []
-
-                    # TODO: this mutates the original, work on a copy instead.
-                    if attr == "regex_metadata":
-                        value = cast(List[RegexMetadata], value)
-                        for item in value:
-                            item["start"] += start_char
-                            item["end"] += start_char
-
                     _value.extend(item for item in value if item not in _value)
                     setattr(metadata, attr, _value)

+            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
+            element_regex_metadata = element.metadata.regex_metadata
+            # -- skip the first element because it is "already consolidated" and otherwise this
+            # -- would duplicate it.
+            if element_regex_metadata and element_idx > 0:
+                if metadata.regex_metadata is None:
+                    metadata.regex_metadata = {}
+                chunk_regex_metadata = metadata.regex_metadata
+                for regex_name, matches in element_regex_metadata.items():
+                    for m in matches:
+                        m["start"] += start_char
+                        m["end"] += start_char
+                    chunk_matches = chunk_regex_metadata.get(regex_name, [])
+                    chunk_matches.extend(matches)
+                    chunk_regex_metadata[regex_name] = chunk_matches
+
         # Check if text exceeds max_characters
         if len(text) > max_characters:
             # Chunk the text from the end to the beginning
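
For reference, a self-contained sketch of the consolidation step (stand-in types with the same assumed `RegexMetadata` fields as the sketch above, not the real `unstructured` classes). Unlike the diff, which skips the first element because chunk metadata begins as a reference to the first element's metadata, this version merges every element fresh and copies each match rather than mutating it, sidestepping the concern raised in the removed TODO:

```python
from typing import Dict, List, TypedDict


class RegexMetadata(TypedDict):
    """One regex match within an element's text (fields assumed for this sketch)."""

    text: str
    start: int
    end: int


def consolidate(
    per_element: List[Dict[str, List[RegexMetadata]]],
    start_chars: List[int],
) -> Dict[str, List[RegexMetadata]]:
    """Merge per-element regex matches, shifting offsets into chunk-text coordinates.

    `start_chars[i]` is the offset of element i's text within the chunk text.
    """
    chunk: Dict[str, List[RegexMetadata]] = {}
    for element_regex_metadata, start_char in zip(per_element, start_chars):
        for regex_name, matches in element_regex_metadata.items():
            # -- copy each match with adjusted offsets instead of mutating it --
            chunk.setdefault(regex_name, []).extend(
                {"text": m["text"], "start": m["start"] + start_char, "end": m["end"] + start_char}
                for m in matches
            )
    return chunk


# Second element's text starts at chunk offset 16 (past the first element's
# text and the "\n\n" separator):
print(
    consolidate(
        [
            {"dolphin": [{"text": "dolphin", "start": 2, "end": 9}]},
            {"dolphin": [{"text": "dolphin", "start": 5, "end": 12}]},
        ],
        [0, 16],
    )
)
# {'dolphin': [{'text': 'dolphin', 'start': 2, 'end': 9},
#              {'text': 'dolphin', 'start': 21, 'end': 28}]}
```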
