diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fc6dd7880..7d5d1843b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.20-dev1 +## 0.10.20-dev2 ### Enhancements @@ -10,6 +10,7 @@ ### Fixes +* **Fixes category_depth None value for Title elements** Problem: `Title` elements from `chipper` get `category_depth` = None even when `Headline` and/or `Subheadline` elements are present on the same page. Fix: all `Title` elements with `category_depth` = None should be set to have a depth of 0 instead iff there are `Headline` and/or `Subheadline` element-types present. Importance: `Title` elements should be equivalent to HTML `H1` when nested headings are present; otherwise, `category_depth` metadata can be ambiguous across elements on a page. * **Tweak `xy-cut` ordering output to be more column friendly** This results in the order of elements more closely reflecting natural reading order which benefits downstream applications. While element ordering from `xy-cut` is usually mostly correct when ordering multi-column documents, sometimes elements from a RHS column will appear before elements in a LHS column. Fix: add swapped `xy-cut` ordering by sorting by X coordinate first and then Y coordinate. 
## 0.10.19 diff --git a/test_unstructured/partition/test_common.py b/test_unstructured/partition/test_common.py index 65e47af160..e9751d0f3d 100644 --- a/test_unstructured/partition/test_common.py +++ b/test_unstructured/partition/test_common.py @@ -35,6 +35,18 @@ def elements(self): type="Headline", text="Charlie Brown and the Great Pumpkin", ), + LocationlessLayoutElement( + type="Subheadline", + text="The Beginning", + ), + LocationlessLayoutElement( + type="Text", + text="This time Charlie Brown had it really tricky...", + ), + LocationlessLayoutElement( + type="Title", + text="Another book title in the same page", + ), ] @@ -405,3 +417,12 @@ def test_set_element_hierarchy_custom_rule_set(): assert ( elements[5].metadata.parent_id == elements[4].id ), "FigureCaption should be child of Title 2" + + +def test_document_to_element_list_sets_category_depth_titles(): + layout_with_hierarchies = MockDocumentLayout() + elements = document_to_element_list(layout_with_hierarchies) + assert elements[0].metadata.category_depth == 1 + assert elements[1].metadata.category_depth == 2 + assert elements[2].metadata.category_depth is None + assert elements[3].metadata.category_depth == 0 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7497a6b185..680eaf3a9a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.20-dev1" # pragma: no cover +__version__ = "0.10.20-dev2" # pragma: no cover diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 319be80608..faf1005fa8 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -31,6 +31,7 @@ ListItem, PageBreak, Text, + Title, ) from unstructured.logger import logger from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE @@ -561,7 +562,6 @@ def document_to_element_list( infer_list_items=infer_list_items, source_format=source_format if source_format else "html", ) - if 
isinstance(element, List): for el in element: if last_modification_date: @@ -575,6 +575,14 @@ def document_to_element_list( element.metadata.text_as_html = ( layout_element.text_as_html if hasattr(layout_element, "text_as_html") else None ) + try: + if ( + isinstance(element, Title) and element.metadata.category_depth is None + ) and any(el.type in ["Headline", "Subheadline"] for el in page.elements): + element.metadata.category_depth = 0 + except AttributeError: + logger.info("HTML element instance has no attribute type") + page_elements.append(element) coordinates = ( element.metadata.coordinates.points if element.metadata.coordinates else None