diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa832741fb..68d9d0ff58 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev4
+## 0.16.12-dev5
### Enhancements
@@ -11,6 +11,7 @@
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
+- **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
## 0.16.11
diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py
index 9a44a08f05..d703bcfb86 100644
--- a/test_unstructured/metrics/test_element_type.py
+++ b/test_unstructured/metrics/test_element_type.py
@@ -19,7 +19,7 @@
"fake-email.txt",
{
("NarrativeText", None): 1,
- ("Title", 0): 1,
+ ("UncategorizedText", None): 1,
("ListItem", 1): 2,
},
),
@@ -50,7 +50,7 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
(
"fake-email.txt",
{
- ("Title", 0): 1,
+ ("UncategorizedText", None): 1,
("ListItem", 1): 2,
("NarrativeText", None): 2,
},
diff --git a/test_unstructured/partition/html/test_parser.py b/test_unstructured/partition/html/test_parser.py
index 32dc975ec1..6c5b407e79 100644
--- a/test_unstructured/partition/html/test_parser.py
+++ b/test_unstructured/partition/html/test_parser.py
@@ -384,9 +384,8 @@ def it_generates_the_document_elements_from_the_Flow_element(self):
elements = div.iter_elements()
e = next(elements)
- assert e == Title("Text of div with hierarchical phrasing content before first block item")
+ assert e == Text("Text of div with hierarchical phrasing content before first block item")
assert e.metadata.to_dict() == {
- "category_depth": 0,
"emphasized_text_contents": ["with", "hierarchical", "phrasing"],
"emphasized_text_tags": ["b", "bi", "b"],
}
@@ -394,19 +393,17 @@ def it_generates_the_document_elements_from_the_Flow_element(self):
assert e == NarrativeText("Click here to see the blurb for this block item.")
assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]}
e = next(elements)
- assert e == Title("tail of block item with hierarchical phrasing content")
+ assert e == Text("tail of block item with hierarchical phrasing content")
assert e.metadata.to_dict() == {
- "category_depth": 0,
"emphasized_text_contents": ["with", "hierarchical", "phrasing"],
"emphasized_text_tags": ["b", "bi", "b"],
}
e = next(elements)
- assert e == Title("second block item")
- assert e.metadata.to_dict() == {"category_depth": 0}
+ assert e == Text("second block item")
+ assert e.metadata.to_dict() == {}
e = next(elements)
- assert e == Title("tail of block item with hierarchical phrasing content")
+ assert e == Text("tail of block item with hierarchical phrasing content")
assert e.metadata.to_dict() == {
- "category_depth": 0,
"emphasized_text_contents": ["with", "hierarchical"],
"emphasized_text_tags": ["b", "bi"],
}
@@ -664,7 +661,7 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
("html_text", "expected_value"),
[
# -- Phrasing with nested block but no text or tail produces only element for block --
- ("aaa
", [Title("aaa")]),
+ ("aaa
", [Text("aaa")]),
# -- Phrasing with text produces annotated text-segment for the text --
(
"aaabbb
",
@@ -672,14 +669,14 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
TextSegment(
"aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
),
- Title("bbb"),
+ Text("bbb"),
],
),
# -- Phrasing with tail produces annotated text-segment for the tail --
(
"aaa
bbb",
[
- Title("aaa"),
+ Text("aaa"),
TextSegment(
"bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"}
),
@@ -692,7 +689,7 @@ def it_generates_text_segments_for_its_text_and_children_and_tail(
TextSegment(
"aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
),
- Title("bbb"),
+ Text("bbb"),
TextSegment(
"ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
),
@@ -776,15 +773,15 @@ def it_generates_text_segments_for_its_children_and_their_tails(
# -- a phrasing element with no block children produces no elements --
("", "", []),
# -- a child block element produces an element --
- ("aaa
", "", [Title("aaa")]),
+ ("aaa
", "", [Text("aaa")]),
# -- a child block element with a tail also produces a text-segment for the tail --
- ("aaa
bbb", "", [Title("aaa"), TextSegment("bbb", {})]),
+ ("aaa
bbb", "", [Text("aaa"), TextSegment("bbb", {})]),
# -- and also text-segments for phrasing following the tail --
(
"aaa
bbbcccddd",
"",
[
- Title("aaa"),
+ Text("aaa"),
TextSegment("bbb", {}),
TextSegment("ccc", {}),
TextSegment("ddd", {}),
@@ -798,7 +795,7 @@ def it_generates_text_segments_for_its_children_and_their_tails(
TextSegment(
"aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"}
),
- Title("bbb"),
+ Text("bbb"),
TextSegment(
"ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"}
),
@@ -872,7 +869,7 @@ def and_it_generates_elements_for_its_block_children(
[
TextSegment("aaa", {}),
TextSegment("bbb", {}),
- Title("ccc"),
+ Text("ccc"),
TextSegment("ddd", {}),
TextSegment("eee", {}),
],
@@ -996,7 +993,7 @@ def it_generates_enclosed_block_items_as_separate_elements(self):
"link_urls": ["http://eie.io"],
},
),
- Title("one with"),
+ Text("one with"),
TextSegment(
" the Force.",
{
diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py
index fd84a676c9..0ca830eecf 100644
--- a/test_unstructured/partition/html/test_partition.py
+++ b/test_unstructured/partition/html/test_partition.py
@@ -72,7 +72,7 @@ def test_partition_html_accepts_a_file_path(tmp_path: pathlib.Path):
assert elements == [
Title("A Great and Glorious Section"),
NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"),
- Title("Another Magnificent paragraph"),
+ Text("Another Magnificent paragraph"),
NarrativeText("The prior element is a title based on its capitalization patterns!"),
Table("I'm in a table"),
Title("A New Beginning"),
@@ -201,7 +201,7 @@ def test_partition_html_processes_chinese_chracters():
def test_emoji_appears_with_emoji_utf8_code():
assert partition_html(text='
Hello 😀
') == [
- Title("Hello 😀")
+ Text("Hello 😀")
]
@@ -575,10 +575,10 @@ def test_pre_tag_parsing_respects_order():
"The Big Blue Bear
\n"
)
) == [
- Title("The Big Brown Bear"),
+ Text("The Big Brown Bear"),
NarrativeText("The big brown bear is growling."),
NarrativeText("The big brown bear is sleeping."),
- Title("The Big Blue Bear"),
+ Text("The Big Blue Bear"),
]
@@ -604,7 +604,7 @@ def test_partition_html_br_tag_parsing():
assert elements == [
Title("Header 1"),
- Title("Text"),
+ Text("Text"),
Title("Header 2"),
Text(
" Param1 = Y\nParam2 = 1\nParam3 = 2\nParam4 = A\n \nParam5 = A,B,C,D,E\n"
@@ -640,7 +640,7 @@ def test_partition_html_tag_tail_parsing():
elements = partition_html(text=html_text)
- assert elements == [Title("Head"), Title("Nested"), Title("Tail")]
+ assert elements == [Text("Head"), Text("Nested"), Text("Tail")]
# -- parsing edge cases --------------------------------------------------------------------------
@@ -731,11 +731,11 @@ def test_containers_with_text_are_processed():
assert elements == [
Text("Hi All,"),
NarrativeText("Get excited for our first annual family day!"),
- Title("Best."),
+ Text("Best."),
Text("--"),
- Title("Dino the Datasaur"),
- Title("Unstructured Technologies"),
- Title("Data Scientist"),
+ Text("Dino the Datasaur"),
+ Text("Unstructured Technologies"),
+ Text("Data Scientist"),
Address("Doylestown, PA 18901"),
NarrativeText("See you there!"),
]
@@ -786,7 +786,7 @@ def test_html_grabs_bulleted_text_in_paras():
def test_joins_tag_text_correctly():
elements = partition_html(text="Hello again peet magical
")
- assert elements == [Title("Hello again peet magical")]
+ assert elements == [Text("Hello again peet magical")]
def test_sample_doc_with_emoji():
@@ -796,17 +796,17 @@ def test_sample_doc_with_emoji():
def test_only_text_and_no_elements_in_body():
elements = partition_html(text="Hello")
- assert elements == [Title("Hello")]
+ assert elements == [Text("Hello")]
def test_text_before_elements_in_body():
elements = partition_html(text="HelloWorld
")
- assert elements == [Title("Hello"), Title("World")]
+ assert elements == [Text("Hello"), Text("World")]
def test_line_break_in_container():
elements = partition_html(text="Hello
World
")
- assert elements == [Title("Hello World")]
+ assert elements == [Text("Hello World")]
@pytest.mark.parametrize("tag", ["del", "form", "noscript"])
@@ -963,7 +963,7 @@ def test_partition_html_grabs_emphasized_texts():
assert e.metadata.emphasized_text_contents is None
assert e.metadata.emphasized_text_tags is None
e = elements[4]
- assert e == Title("A lone span text!")
+ assert e == Text("A lone span text!")
assert e.metadata.emphasized_text_contents is None
assert e.metadata.emphasized_text_tags is None
@@ -1078,7 +1078,7 @@ def test_partition_html_grabs_links():
assert e.metadata.link_urls is None
assert e.metadata.link_texts is None
e = elements[4]
- assert e == Title("A lone link!")
+ assert e == Text("A lone link!")
assert e.metadata.link_urls == ["/loner"]
assert e.metadata.link_texts == ["A lone link!"]
diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh
index 8634b330f7..22eb807fa6 100755
--- a/test_unstructured_ingest/test-ingest-src.sh
+++ b/test_unstructured_ingest/test-ingest-src.sh
@@ -18,7 +18,8 @@ EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR}
export OMP_THREAD_LIMIT=1
all_tests=(
- 's3.sh'
+ # NOTE(scanny): This test is disabled because it routinely flakes on OCR differences
+ # 's3.sh'
's3-minio.sh'
'astradb.sh'
'azure.sh'
@@ -76,7 +77,8 @@ full_python_matrix_tests=(
'local-single-file.sh'
'local-single-file-with-encoding.sh'
'local-single-file-with-pdf-infer-table-structure.sh'
- 's3.sh'
+ # NOTE(scanny): This test is disabled because it routinely flakes on OCR differences
+ # 's3.sh'
'google-drive.sh'
'gcs.sh'
'azure.sh'
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index a85bcf341c..07eda39112 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.12-dev4" # pragma: no cover
+__version__ = "0.16.12-dev5" # pragma: no cover
diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py
index dca984b013..26f86693dc 100644
--- a/unstructured/partition/html/parser.py
+++ b/unstructured/partition/html/parser.py
@@ -99,7 +99,6 @@
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
- is_possible_title,
is_us_city_state_zip,
)
from unstructured.utils import lazyproperty
@@ -885,18 +884,6 @@ def derive_element_type_from_text(text: str) -> type[Text] | None:
if is_possible_narrative_text(text):
return NarrativeText
- # NOTE (scanny): Classifying short paragraphs as titles produces noise much more frequently
- # than it does value. A `Title` element is very consequential in its effect on chunking and
- # document hierarchy. Classifying any small paragraph as a heading is frequently wrong and
- # throws off these important downstream processes much more than missing the occasional
- # heading does. If we want to infer headings, I think we have to be much more intelligent
- # about it and consider what elements came before and after to see if the text _behaves_ like
- # a heading, maybe whether it is bold and how many text elements follow it before the next
- # title and how long since the prior title, whether `h1..h6` are used elsewhere in the
- # document, etc.
- if is_possible_title(text):
- return Title
-
return Text