From 89bd2faaf759d3cb6d6cb64578e9d089ee6acbe6 Mon Sep 17 00:00:00 2001 From: unifyh <18213435+unifyh@users.noreply.github.com> Date: Tue, 3 Oct 2023 12:17:51 +0800 Subject: [PATCH] fix: Fix various cases of HTML text missing after partition (#1587) Fix 4 cases of text missing after partition: 1. Text immediately after `<body>` ```html missing1
hello
``` 2. Text inside container and immediately after `<br/>` ```html
hello
missing2
``` 3. Text immediately after a text opening tag, if said tag contains `<br/>` ```html

missing3
hello

``` 4. Text inside `<body>` if it is the only content (different cause from case 1) ```html <body>missing4</body> ``` Also fix problem causing `test_unstructured/documents/test_html.py::test_exclude_tag_types` to not work as intended. This will close GitHub Issue#1543 --- CHANGELOG.md | 5 +++ test_unstructured/documents/test_html.py | 40 ++++++++++++++++++++++-- unstructured/documents/html.py | 12 +++++-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 009b31a377..8426b0fdd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ ### Fixes +* **Fix various cases of HTML text missing after partition** + Problem: Under certain circumstances, text immediately after some HTML tags will be missing from partition result. + Fix: Updated code to deal with these cases. + Importance: This will ensure the correctness when partitioning HTML and Markdown documents. + ## 0.10.18 diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py index d6d236f08f..02f6d6bc72 100644 --- a/test_unstructured/documents/test_html.py +++ b/test_unstructured/documents/test_html.py @@ -17,6 +17,7 @@ from unstructured.documents.html import ( HEADING_TAGS, LIST_ITEM_TAGS, + SECTION_TAGS, TABLE_TAGS, TEXT_TAGS, HTMLDocument, @@ -41,8 +42,15 @@ TAGS = TAGS.replace(">", "").split("<")[1:] -INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + ["div"] -EXCLUDED_TAGS = "tag", [tag for tag in TAGS if tag not in INCLUDED_TAGS] +VOID_TAGS = "

" +VOID_TAGS = VOID_TAGS.replace(">", "").split("<")[1:] + +INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + SECTION_TAGS +EXCLUDED_TAGS = [ + tag + for tag in TAGS + if tag not in (INCLUDED_TAGS + TABLE_TAGS + VOID_TAGS + ["html", "head", "body"]) +] @pytest.fixture() @@ -685,3 +693,31 @@ def test_sample_doc_with_emoji(): # NOTE(robinson) - unclear why right now, but the output is the emoji on the test runners # and the byte string representation when running locally on mac assert doc.elements[0].text in ["Hello again ð\x9f\x98\x80", "Hello again 😀"] + + +def test_only_plain_text_in_body(): + raw_html = "Hello" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + + +def test_plain_text_before_anything_in_body(): + raw_html = "Hello

World

" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + assert doc.elements[1].text == "World" + + +def test_line_break_in_container(): + raw_html = "
Hello
World
" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + assert doc.elements[1].text == "World" + + +@pytest.mark.parametrize("tag", TEXT_TAGS) +def test_line_break_in_text_tag(tag): + raw_html = f"<{tag}>Hello
World" + doc = HTMLDocument.from_string(raw_html) + assert doc.elements[0].text == "Hello" + assert doc.elements[1].text == "World" diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 3bfdb1e680..1fbbcbcdfa 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -417,7 +417,7 @@ def _is_container_with_text(tag_elem: etree.Element) -> bool:
Please read my message!
""" - if tag_elem.tag not in SECTION_TAGS or len(tag_elem) == 0: + if tag_elem.tag not in SECTION_TAGS + ["body"] or len(tag_elem) == 0: return False if tag_elem.text is None or tag_elem.text.strip() == "": @@ -451,6 +451,12 @@ def _has_break_tags(tag_elem: etree._Element) -> bool: # pyright: ignore[report def _unfurl_break_tags(tag_elem: etree.Element) -> List[etree.Element]: unfurled = [] + + if tag_elem.text: + _tag_elem = etree.Element(tag_elem.tag) + _tag_elem.text = tag_elem.text + unfurled.append(_tag_elem) + children = tag_elem.getchildren() for child in children: if not _has_break_tags(child): @@ -474,13 +480,13 @@ def _is_text_tag(tag_elem: etree.Element, max_predecessor_len: int = 5) -> bool: if len(tag_elem) > max_predecessor_len + empty_elems_len: return False - if tag_elem.tag in TEXT_TAGS + HEADING_TAGS: + if tag_elem.tag in TEXT_TAGS + HEADING_TAGS + TEXTBREAK_TAGS: return True # NOTE(robinson) - This indicates that a div tag has no children. If that's the # case and the tag has text, its potential a text tag children = tag_elem.getchildren() - if tag_elem.tag in SECTION_TAGS and len(children) == 0: + if tag_elem.tag in SECTION_TAGS + ["body"] and len(children) == 0: return True if _has_adjacent_bulleted_spans(tag_elem, children):