Unstructured-IO · newelh · Oct 3, 2023 · Sep 27, 2023 · Sep 27, 2023 · Sep 27, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## 0.10.19-dev0
+
+### Enhancements
+
+* **Detect text in HTML Heading Tags as Titles** This increases the accuracy of hierarchies in HTML documents and provides more accurate element categorization. Text inside an HTML heading tag is categorized as a title unless it is a list item, an address, or narrative text.
+
+
 ## 0.10.18
 
 ### Enhancements

diff --git a/test_unstructured/partition/test_html_partition.py b/test_unstructured/partition/test_html_partition.py
@@ -8,7 +8,7 @@
 
 from unstructured.chunking.title import chunk_by_title
 from unstructured.cleaners.core import clean_extra_whitespace
-from unstructured.documents.elements import ListItem, NarrativeText, Table, Title
+from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title
 from unstructured.documents.html import HTMLTitle
 from unstructured.partition.html import partition_html
 from unstructured.partition.json import partition_json
@@ -645,3 +645,25 @@ def test_add_chunking_strategy_on_partition_html(
     chunks = chunk_by_title(elements)
     assert chunk_elements != elements
     assert chunk_elements == chunks
+
+
+def test_html_heading_title_detection():
+    """Text in a heading tag is a Title unless it is an email address or a list item."""
+    html_text = """
+    <p>This is a section of narrative text, it's long, flows and has meaning</p>
+    <h1>This is a section of narrative text, it's long, flows and has meaning</h1>
+    <h2>A heading that is at the second level</h2>
+    <h3>Finally, the third heading</h3>
+    <h2>December 1-17, 2017</h2>
+    <h3>email@example.com</h3>
+    <h3><li>- bulleted item</li></h3>
+    """
+    assert partition_html(text=html_text) == [
+        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
+        Title("This is a section of narrative text, it's long, flows and has meaning"),
+        Title("A heading that is at the second level"),
+        Title("Finally, the third heading"),
+        Title("December 1-17, 2017"),
+        EmailAddress("email@example.com"),  # email detection outranks the heading tag
+        ListItem("- bulleted item"),  # so does list-item detection
+    ]
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.18"  # pragma: no cover
+__version__ = "0.10.19-dev0"  # pragma: no cover
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
@@ -389,7 +389,7 @@ def _text_to_element(
             links=links,
             emphasized_texts=emphasized_texts,
         )
-    elif is_possible_title(text):
+    elif is_heading_tag(tag) or is_possible_title(text):
         return HTMLTitle(
             text,
             tag=tag,
@@ -431,6 +431,11 @@ def is_narrative_tag(text: str, tag: str) -> bool:
     return tag not in HEADING_TAGS and is_possible_narrative_text(text)
 
 
+def is_heading_tag(tag: str) -> bool:
+    """Return True if `tag` is one of HEADING_TAGS, i.e. the text sits inside a heading tag."""
+    return tag in HEADING_TAGS
+
+
 def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> str:
     """Extracts text from a text tag element."""
     text = ""
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.10.18"  # pragma: no cover
		+__version__ = "0.10.19-dev0"  # pragma: no cover