From 89bd2faaf759d3cb6d6cb64578e9d089ee6acbe6 Mon Sep 17 00:00:00 2001
From: unifyh <18213435+unifyh@users.noreply.github.com>
Date: Tue, 3 Oct 2023 12:17:51 +0800
Subject: [PATCH] fix: Fix various cases of HTML text missing after partition
(#1587)
Fix 4 cases of text missing after partition:
1. Text immediately after `<body>`
```html
<body>missing1<p>hello</p></body>
```
2. Text inside container and immediately after `<br/>`
```html
<div>hello<br/>missing2</div>
```
3. Text immediately after a text opening tag, if said tag contains
`<br/>`
```html
<p>missing3<br/>hello</p>
```
4. Text inside `<body>` if it is the only content (different cause from
case 1)
```html
<body>missing4</body>
```
Also fix problem causing
`test_unstructured/documents/test_html.py::test_exclude_tag_types` to
not work as intended.
This will close GitHub issue #1543.
---
CHANGELOG.md | 5 +++
test_unstructured/documents/test_html.py | 40 ++++++++++++++++++++++--
unstructured/documents/html.py | 12 +++++--
3 files changed, 52 insertions(+), 5 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 009b31a377..8426b0fdd4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,11 @@
### Fixes
+* **Fix various cases of HTML text missing after partition**
+ Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
+ Fix: Updated code to deal with these cases.
+ Importance: This will ensure the correctness when partitioning HTML and Markdown documents.
+
## 0.10.18
diff --git a/test_unstructured/documents/test_html.py b/test_unstructured/documents/test_html.py
index d6d236f08f..02f6d6bc72 100644
--- a/test_unstructured/documents/test_html.py
+++ b/test_unstructured/documents/test_html.py
@@ -17,6 +17,7 @@
from unstructured.documents.html import (
HEADING_TAGS,
LIST_ITEM_TAGS,
+ SECTION_TAGS,
TABLE_TAGS,
TEXT_TAGS,
HTMLDocument,
@@ -41,8 +42,15 @@
TAGS = TAGS.replace(">", "").split("<")[1:]
-INCLUDED_TAGS = TEXT_TAGS + HEADING_TAGS + LIST_ITEM_TAGS + ["div"]
-EXCLUDED_TAGS = "tag", [tag for tag in TAGS if tag not in INCLUDED_TAGS]
+VOID_TAGS = "