fix: consolidate regex_metadata and adjust offsets
The implementation of adjusting regex-metadata match-offsets assumed
the wrong data type, so while it passed the tests, in production it
dropped all regex_metadata except that of the first element in each section.

In fairness, this never actually happened because the overchunking fixed
in the previous commit made any element that had regex matches show up
in its own single-element chunk.

Reimplement for regex-metadata of type `Dict[str, List[RegexMetadata]]`
rather than `List[RegexMetadata]`.
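
For illustration, a minimal sketch of the mismatch (hypothetical values; `RegexMetadata` is modeled here as a TypedDict with the `start`/`end` offsets the chunking code adjusts plus a `text` field, which is an assumption about the real class):

```python
from typing import Dict, List, TypedDict


class RegexMetadata(TypedDict):
    """One regex match within an element's text (fields assumed for this sketch)."""

    text: str
    start: int
    end: int


# Actual shape: matches are grouped by the name of the regex that produced them.
actual: Dict[str, List[RegexMetadata]] = {
    "mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
}

# Shape the old code assumed: a flat list of matches. Because the real value is
# a dict, the old `isinstance(value, list)` check never matched, so the
# consolidation-and-offset-adjustment branch was skipped entirely for every
# element after the first.
assumed: List[RegexMetadata] = [{"text": "MS-107", "start": 18, "end": 24}]
```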
scanny committed Oct 18, 2023
1 parent d74dd26 commit 49c4e21
Showing 2 changed files with 18 additions and 22 deletions.
9 changes: 0 additions & 9 deletions test_unstructured/chunking/test_title.py
@@ -64,10 +64,6 @@ def test_split_elements_by_title_and_table():
     ]


-@pytest.mark.xfail(reason="regex_metadata was wrong type", raises=AssertionError, strict=True)
-# -- `ElementMetadata.regex_metadata` is `Dict[str, List[RegexMetadata]]`, not `List[RegexMetadata]`
-# -- when this is fixed, this test fails by isolating a chunk for "Today is a bad day", which is
-# -- where the regex-metadata appears.
 def test_chunk_by_title():
     elements: List[Element] = [
         Title("A Great Day", metadata=ElementMetadata(emphasized_text_contents=["Day"])),
@@ -218,11 +214,6 @@ def test_chunk_by_title_does_not_break_on_regex_metadata_change():
     ]


-@pytest.mark.xfail(
-    reason="bug: regex_metadata of second and later section elements is discarded",
-    raises=AssertionError,
-    strict=True,
-)
 def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
     """ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
31 changes: 18 additions & 13 deletions unstructured/chunking/title.py
@@ -16,7 +16,6 @@
     CompositeElement,
     Element,
     ElementMetadata,
-    RegexMetadata,
     Table,
     TableChunk,
     Text,
@@ -120,33 +119,39 @@ def chunk_by_title(
         text = ""
         metadata = first_element.metadata
         start_char = 0
-        for element in section:
+        for element_idx, element in enumerate(section):
             # -- concatenate all element text in section into `text` --
             if isinstance(element, Text):
                 # -- add a blank line between "squashed" elements --
                 text += "\n\n" if text else ""
                 start_char = len(text)
                 text += element.text

-            # -- "chunk" metadata should include union of list-items in all its elements. Also,
-            # -- metadata like regex_metadata that records start and/or end positions of related
-            # -- text need those offsets adjusted.
+            # -- "chunk" metadata should include union of list-items in all its elements --
             for attr, value in vars(element.metadata).items():
                 if isinstance(value, list):
                     value = cast(List[Any], value)
                     # -- get existing (list) value from chunk_metadata --
                     _value = getattr(metadata, attr, []) or []
-
-                    # TODO: this mutates the original, work on a copy instead.
-                    if attr == "regex_metadata":
-                        value = cast(List[RegexMetadata], value)
-                        for item in value:
-                            item["start"] += start_char
-                            item["end"] += start_char
-
                     _value.extend(item for item in value if item not in _value)
                     setattr(metadata, attr, _value)

+            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
+            element_regex_metadata = element.metadata.regex_metadata
+            # -- skip the first element because it is "already consolidated" and otherwise this
+            # -- would duplicate it.
+            if element_regex_metadata and element_idx > 0:
+                if metadata.regex_metadata is None:
+                    metadata.regex_metadata = {}
+                chunk_regex_metadata = metadata.regex_metadata
+                for regex_name, matches in element_regex_metadata.items():
+                    for m in matches:
+                        m["start"] += start_char
+                        m["end"] += start_char
+                    chunk_matches = chunk_regex_metadata.get(regex_name, [])
+                    chunk_matches.extend(matches)
+                    chunk_regex_metadata[regex_name] = chunk_matches
+
         # Check if text exceeds max_characters
         if len(text) > max_characters:
             # Chunk the text from the end to the beginning
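
For reference, a self-contained sketch of the consolidation step (stand-in types with the same assumed `RegexMetadata` fields as the sketch above, not the real `unstructured` classes). Unlike the diff, which skips the first element because chunk metadata begins as a reference to the first element's metadata, this version merges every element fresh and copies each match rather than mutating it, sidestepping the concern raised in the removed TODO:

```python
from typing import Dict, List, TypedDict


class RegexMetadata(TypedDict):
    """One regex match within an element's text (fields assumed for this sketch)."""

    text: str
    start: int
    end: int


def consolidate(
    per_element: List[Dict[str, List[RegexMetadata]]],
    start_chars: List[int],
) -> Dict[str, List[RegexMetadata]]:
    """Merge per-element regex matches, shifting offsets into chunk-text coordinates.

    `start_chars[i]` is the offset of element i's text within the chunk text.
    """
    chunk: Dict[str, List[RegexMetadata]] = {}
    for element_regex_metadata, start_char in zip(per_element, start_chars):
        for regex_name, matches in element_regex_metadata.items():
            # -- copy each match with adjusted offsets instead of mutating it --
            chunk.setdefault(regex_name, []).extend(
                {"text": m["text"], "start": m["start"] + start_char, "end": m["end"] + start_char}
                for m in matches
            )
    return chunk


# Second element's text starts at chunk offset 16 (past the first element's
# text and the "\n\n" separator):
print(
    consolidate(
        [
            {"dolphin": [{"text": "dolphin", "start": 2, "end": 9}]},
            {"dolphin": [{"text": "dolphin", "start": 5, "end": 12}]},
        ],
        [0, 16],
    )
)
# {'dolphin': [{'text': 'dolphin', 'start': 2, 'end': 9},
#              {'text': 'dolphin', 'start': 21, 'end': 28}]}
```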
