Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Detect all text in HTML Heading tags as titles #1556

Merged
merged 16 commits into from
Oct 3, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## 0.10.19-dev0

### Enhancements

* **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item or an address (including an email address), categorize it as a title, even when the text would otherwise read as narrative text.


## 0.10.18

### Enhancements
Expand Down
24 changes: 23 additions & 1 deletion test_unstructured/partition/test_html_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import ListItem, NarrativeText, Table, Title
from unstructured.documents.elements import EmailAddress, ListItem, NarrativeText, Table, Title
from unstructured.documents.html import HTMLTitle
from unstructured.partition.html import partition_html
from unstructured.partition.json import partition_json
Expand Down Expand Up @@ -645,3 +645,25 @@ def test_add_chunking_strategy_on_partition_html(
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks


def test_html_heading_title_detection():
    """Check how text inside HTML heading tags is categorized by ``partition_html``.

    The fixture pairs a ``<p>`` with an ``<h1>`` holding the same sentence, so the only
    difference between the first two expected elements is the enclosing tag. The remaining
    headings cover plain heading text, a date range, an email address, and a bulleted list
    item; the last two are expected to keep their more specific element types.
    """
    html_text = """
    <p>This is a section of narrative text, it's long, flows and has meaning</p>
    <h1>This is a section of narrative text, it's long, flows and has meaning</h1>
    <h2>A heading that is at the second level</h2>
    <h3>Finally, the third heading</h3>
    <h2>December 1-17, 2017</h2>
    <h3>user@example.com</h3>
    <h3><li>- bulleted item</li></h3>
    """
    elements = partition_html(text=html_text)
    assert elements == [
        # Same sentence twice: narrative inside <p>, title inside <h1>.
        NarrativeText("This is a section of narrative text, it's long, flows and has meaning"),
        Title("This is a section of narrative text, it's long, flows and has meaning"),
        Title("A heading that is at the second level"),
        Title("Finally, the third heading"),
        Title("December 1-17, 2017"),
        # The heading tag does not override these more specific categories.
        EmailAddress("user@example.com"),
        ListItem("- bulleted item"),
    ]
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.18" # pragma: no cover
__version__ = "0.10.19-dev0" # pragma: no cover
7 changes: 6 additions & 1 deletion unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ def _text_to_element(
links=links,
emphasized_texts=emphasized_texts,
)
elif is_possible_title(text):
elif is_heading_tag(tag) or is_possible_title(text):
return HTMLTitle(
text,
tag=tag,
Expand Down Expand Up @@ -431,6 +431,11 @@ def is_narrative_tag(text: str, tag: str) -> bool:
return tag not in HEADING_TAGS and is_possible_narrative_text(text)


def is_heading_tag(tag: str) -> bool:
    """Return True when ``tag`` is a member of ``HEADING_TAGS``, False otherwise."""
    tag_is_heading = tag in HEADING_TAGS
    return tag_is_heading


def _construct_text(tag_elem: etree.Element, include_tail_text: bool = True) -> str:
"""Extracts text from a text tag element."""
text = ""
Expand Down