fix: Better logic for setting category_depth metadata for Title elements (#1517)

This PR promotes the `category_depth` metadata for `Title` elements from `None` to 0 whenever `Headline` and/or `Subheadline` types (which are also mapped to `Title` elements, with depths 1 and 2 respectively) are present. An additional test has been added to `test_common.py` to check the improvement. More tests showing how this logic fixes the behaviour can be found in an adapted version of the Colab notebook [here](https://colab.research.google.com/drive/1LoScFJBYUhkM6X7pMp8cDaJLC_VoxGci?usp=sharing). --------- Co-authored-by: qued <[email protected]>
Unstructured-IO · Oct 5, 2023 · e90a979 · e90a979
1 parent e34396b
commit e90a979
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.10.20-dev1
+## 0.10.20-dev2
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 
 ### Fixes
 
+* **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth` = None even when `Headline` and/or `Subheadline` elements are present on the same page. Fix: all `Title` elements with `category_depth` = None are set to a depth of 0 instead iff `Headline` and/or `Subheadline` element types are present. Importance: `Title` elements should be equivalent to HTML `H1` when nested headings are present; otherwise, the `category_depth` metadata can be ambiguous across the elements of a page.
 * **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order which benefits downstream applications. While element ordering from `xy-cut` is usually mostly correct when ordering multi-column documents, sometimes elements from a RHS column will appear before elements in a LHS column. Fix: add swapped `xy-cut` ordering by sorting by X coordinate first and then Y coordinate.
 
 ## 0.10.19

diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py
@@ -35,6 +35,18 @@ def elements(self):
                 type="Headline",
                 text="Charlie Brown and the Great Pumpkin",
             ),
+            LocationlessLayoutElement(
+                type="Subheadline",
+                text="The Beginning",
+            ),
+            LocationlessLayoutElement(
+                type="Text",
+                text="This time Charlie Brown had it really tricky...",
+            ),
+            LocationlessLayoutElement(
+                type="Title",
+                text="Another book title in the same page",
+            ),
         ]
 
 
@@ -405,3 +417,12 @@ def test_set_element_hierarchy_custom_rule_set():
     assert (
         elements[5].metadata.parent_id == elements[4].id
     ), "FigureCaption should be child of Title 2"
+
+
+def test_document_to_element_list_sets_category_depth_titles():
+    layout_with_hierarchies = MockDocumentLayout()
+    elements = document_to_element_list(layout_with_hierarchies)
+    assert elements[0].metadata.category_depth == 1
+    assert elements[1].metadata.category_depth == 2
+    assert elements[2].metadata.category_depth is None
+    assert elements[3].metadata.category_depth == 0
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.20-dev1"  # pragma: no cover
+__version__ = "0.10.20-dev2"  # pragma: no cover
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
@@ -31,6 +31,7 @@
     ListItem,
     PageBreak,
     Text,
+    Title,
 )
 from unstructured.logger import logger
 from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
@@ -561,7 +562,6 @@ def document_to_element_list(
                 infer_list_items=infer_list_items,
                 source_format=source_format if source_format else "html",
             )
-
             if isinstance(element, List):
                 for el in element:
                     if last_modification_date:
@@ -575,6 +575,14 @@ def document_to_element_list(
                 element.metadata.text_as_html = (
                     layout_element.text_as_html if hasattr(layout_element, "text_as_html") else None
                 )
+                try:
+                    if (
+                        isinstance(element, Title) and element.metadata.category_depth is None
+                    ) and any(el.type in ["Headline", "Subheadline"] for el in page.elements):
+                        element.metadata.category_depth = 0
+                except AttributeError:
+                    logger.info("HTML element instance has no attribute type")
+
                 page_elements.append(element)
             coordinates = (
                 element.metadata.coordinates.points if element.metadata.coordinates else None
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.10.20-dev1" # pragma: no cover
		__version__ = "0.10.20-dev2" # pragma: no cover