From 27951372294404ede050ac91a7d0d10ab62ceea3 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 17 Dec 2024 17:48:37 -0800 Subject: [PATCH] fix: html incorrectly categorizing text Fixes #3666 --- CHANGELOG.md | 3 +- .../metrics/test_element_type.py | 4 +-- .../partition/html/test_parser.py | 33 +++++++++---------- .../partition/html/test_partition.py | 32 +++++++++--------- test_unstructured_ingest/test-ingest-src.sh | 6 ++-- unstructured/__version__.py | 2 +- unstructured/partition/html/parser.py | 13 -------- 7 files changed, 40 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa832741fb..68d9d0ff58 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev4 +## 0.16.12-dev5 ### Enhancements @@ -11,6 +11,7 @@ - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. - **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. +- **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`. 
## 0.16.11 diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index 9a44a08f05..d703bcfb86 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -19,7 +19,7 @@ "fake-email.txt", { ("NarrativeText", None): 1, - ("Title", 0): 1, + ("UncategorizedText", None): 1, ("ListItem", 1): 2, }, ), @@ -50,7 +50,7 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in ( "fake-email.txt", { - ("Title", 0): 1, + ("UncategorizedText", None): 1, ("ListItem", 1): 2, ("NarrativeText", None): 2, }, diff --git a/test_unstructured/partition/html/test_parser.py b/test_unstructured/partition/html/test_parser.py index 32dc975ec1..6c5b407e79 100644 --- a/test_unstructured/partition/html/test_parser.py +++ b/test_unstructured/partition/html/test_parser.py @@ -384,9 +384,8 @@ def it_generates_the_document_elements_from_the_Flow_element(self): elements = div.iter_elements() e = next(elements) - assert e == Title("Text of div with hierarchical phrasing content before first block item") + assert e == Text("Text of div with hierarchical phrasing content before first block item") assert e.metadata.to_dict() == { - "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical", "phrasing"], "emphasized_text_tags": ["b", "bi", "b"], } @@ -394,19 +393,17 @@ def it_generates_the_document_elements_from_the_Flow_element(self): assert e == NarrativeText("Click here to see the blurb for this block item.") assert e.metadata.to_dict() == {"link_texts": ["here"], "link_urls": ["http://blurb.io"]} e = next(elements) - assert e == Title("tail of block item with hierarchical phrasing content") + assert e == Text("tail of block item with hierarchical phrasing content") assert e.metadata.to_dict() == { - "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical", "phrasing"], "emphasized_text_tags": ["b", "bi", "b"], } e = next(elements) - 
assert e == Title("second block item") - assert e.metadata.to_dict() == {"category_depth": 0} + assert e == Text("second block item") + assert e.metadata.to_dict() == {} e = next(elements) - assert e == Title("tail of block item with hierarchical phrasing content") + assert e == Text("tail of block item with hierarchical phrasing content") assert e.metadata.to_dict() == { - "category_depth": 0, "emphasized_text_contents": ["with", "hierarchical"], "emphasized_text_tags": ["b", "bi"], } @@ -664,7 +661,7 @@ def it_generates_text_segments_for_its_text_and_children_and_tail( ("html_text", "expected_value"), [ # -- Phrasing with nested block but no text or tail produces only element for block -- - ("

aaa

", [Title("aaa")]), + ("

aaa

", [Text("aaa")]), # -- Phrasing with text produces annotated text-segment for the text -- ( "aaa

bbb

", @@ -672,14 +669,14 @@ def it_generates_text_segments_for_its_text_and_children_and_tail( TextSegment( "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} ), - Title("bbb"), + Text("bbb"), ], ), # -- Phrasing with tail produces annotated text-segment for the tail -- ( "

aaa

bbb
", [ - Title("aaa"), + Text("aaa"), TextSegment( "bbb", {"emphasized_text_contents": "bbb", "emphasized_text_tags": "b"} ), @@ -692,7 +689,7 @@ def it_generates_text_segments_for_its_text_and_children_and_tail( TextSegment( "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} ), - Title("bbb"), + Text("bbb"), TextSegment( "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"} ), @@ -776,15 +773,15 @@ def it_generates_text_segments_for_its_children_and_their_tails( # -- a phrasing element with no block children produces no elements -- ("", "", []), # -- a child block element produces an element -- - ("

aaa

", "", [Title("aaa")]), + ("

aaa

", "", [Text("aaa")]), # -- a child block element with a tail also produces a text-segment for the tail -- - ("

aaa

bbb
", "", [Title("aaa"), TextSegment("bbb", {})]), + ("

aaa

bbb
", "", [Text("aaa"), TextSegment("bbb", {})]), # -- and also text-segments for phrasing following the tail -- ( "

aaa

bbbcccddd
", "", [ - Title("aaa"), + Text("aaa"), TextSegment("bbb", {}), TextSegment("ccc", {}), TextSegment("ddd", {}), @@ -798,7 +795,7 @@ def it_generates_text_segments_for_its_children_and_their_tails( TextSegment( "aaa", {"emphasized_text_contents": "aaa", "emphasized_text_tags": "b"} ), - Title("bbb"), + Text("bbb"), TextSegment( "ccc", {"emphasized_text_contents": "ccc", "emphasized_text_tags": "b"} ), @@ -872,7 +869,7 @@ def and_it_generates_elements_for_its_block_children( [ TextSegment("aaa", {}), TextSegment("bbb", {}), - Title("ccc"), + Text("ccc"), TextSegment("ddd", {}), TextSegment("eee", {}), ], @@ -996,7 +993,7 @@ def it_generates_enclosed_block_items_as_separate_elements(self): "link_urls": ["http://eie.io"], }, ), - Title("one with"), + Text("one with"), TextSegment( " the Force.", { diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py index fd84a676c9..0ca830eecf 100644 --- a/test_unstructured/partition/html/test_partition.py +++ b/test_unstructured/partition/html/test_partition.py @@ -72,7 +72,7 @@ def test_partition_html_accepts_a_file_path(tmp_path: pathlib.Path): assert elements == [ Title("A Great and Glorious Section"), NarrativeText("Dear Leader is the best. He is such a wonderful engineer!"), - Title("Another Magnificent paragraph"), + Text("Another Magnificent paragraph"), NarrativeText("The prior element is a title based on its capitalization patterns!"), Table("I'm in a table"), Title("A New Beginning"), @@ -201,7 +201,7 @@ def test_partition_html_processes_chinese_chracters(): def test_emoji_appears_with_emoji_utf8_code(): assert partition_html(text='

Hello 😀

') == [ - Title("Hello 😀") + Text("Hello 😀") ] @@ -575,10 +575,10 @@ def test_pre_tag_parsing_respects_order(): "
The Big Blue Bear
\n" ) ) == [ - Title("The Big Brown Bear"), + Text("The Big Brown Bear"), NarrativeText("The big brown bear is growling."), NarrativeText("The big brown bear is sleeping."), - Title("The Big Blue Bear"), + Text("The Big Blue Bear"), ] @@ -604,7 +604,7 @@ def test_partition_html_br_tag_parsing(): assert elements == [ Title("Header 1"), - Title("Text"), + Text("Text"), Title("Header 2"), Text( " Param1 = Y\nParam2 = 1\nParam3 = 2\nParam4 = A\n \nParam5 = A,B,C,D,E\n" @@ -640,7 +640,7 @@ def test_partition_html_tag_tail_parsing(): elements = partition_html(text=html_text) - assert elements == [Title("Head"), Title("Nested"), Title("Tail")] + assert elements == [Text("Head"), Text("Nested"), Text("Tail")] # -- parsing edge cases -------------------------------------------------------------------------- @@ -731,11 +731,11 @@ def test_containers_with_text_are_processed(): assert elements == [ Text("Hi All,"), NarrativeText("Get excited for our first annual family day!"), - Title("Best."), + Text("Best."), Text("--"), - Title("Dino the Datasaur"), - Title("Unstructured Technologies"), - Title("Data Scientist"), + Text("Dino the Datasaur"), + Text("Unstructured Technologies"), + Text("Data Scientist"), Address("Doylestown, PA 18901"), NarrativeText("See you there!"), ] @@ -786,7 +786,7 @@ def test_html_grabs_bulleted_text_in_paras(): def test_joins_tag_text_correctly(): elements = partition_html(text="

Hello again peet magical

") - assert elements == [Title("Hello again peet magical")] + assert elements == [Text("Hello again peet magical")] def test_sample_doc_with_emoji(): @@ -796,17 +796,17 @@ def test_sample_doc_with_emoji(): def test_only_text_and_no_elements_in_body(): elements = partition_html(text="Hello") - assert elements == [Title("Hello")] + assert elements == [Text("Hello")] def test_text_before_elements_in_body(): elements = partition_html(text="Hello

World

") - assert elements == [Title("Hello"), Title("World")] + assert elements == [Text("Hello"), Text("World")] def test_line_break_in_container(): elements = partition_html(text="
Hello
World
") - assert elements == [Title("Hello World")] + assert elements == [Text("Hello World")] @pytest.mark.parametrize("tag", ["del", "form", "noscript"]) @@ -963,7 +963,7 @@ def test_partition_html_grabs_emphasized_texts(): assert e.metadata.emphasized_text_contents is None assert e.metadata.emphasized_text_tags is None e = elements[4] - assert e == Title("A lone span text!") + assert e == Text("A lone span text!") assert e.metadata.emphasized_text_contents is None assert e.metadata.emphasized_text_tags is None @@ -1078,7 +1078,7 @@ def test_partition_html_grabs_links(): assert e.metadata.link_urls is None assert e.metadata.link_texts is None e = elements[4] - assert e == Title("A lone link!") + assert e == Text("A lone link!") assert e.metadata.link_urls == ["/loner"] assert e.metadata.link_texts == ["A lone link!"] diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index 8634b330f7..22eb807fa6 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -18,7 +18,8 @@ EVAL_OUTPUT_ROOT=${EVAL_OUTPUT_ROOT:-$SCRIPT_DIR} export OMP_THREAD_LIMIT=1 all_tests=( - 's3.sh' + # NOTE(scanny): This test is disabled because it routinely flakes on OCR differences + # 's3.sh' 's3-minio.sh' 'astradb.sh' 'azure.sh' @@ -76,7 +77,8 @@ full_python_matrix_tests=( 'local-single-file.sh' 'local-single-file-with-encoding.sh' 'local-single-file-with-pdf-infer-table-structure.sh' - 's3.sh' + # NOTE(scanny): This test is disabled because it routinely flakes on OCR differences + # 's3.sh' 'google-drive.sh' 'gcs.sh' 'azure.sh' diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a85bcf341c..07eda39112 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev4" # pragma: no cover +__version__ = "0.16.12-dev5" # pragma: no cover diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py 
index dca984b013..26f86693dc 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -99,7 +99,6 @@ is_bulleted_text, is_email_address, is_possible_narrative_text, - is_possible_title, is_us_city_state_zip, ) from unstructured.utils import lazyproperty @@ -885,18 +884,6 @@ def derive_element_type_from_text(text: str) -> type[Text] | None: if is_possible_narrative_text(text): return NarrativeText - # NOTE (scanny): Classifying short paragraphs as titles produces noise much more frequently - # than it does value. A `Title` element is very consequential in its effect on chunking and - # document hierarchy. Classifying any small paragraph as a heading is frequently wrong and - # throws off these important downstream processes much more than missing the occasional - # heading does. If we want to infer headings, I think we have to be much more intelligent - # about it and consider what elements came before and after to see if the text _behaves_ like - # a heading, maybe whether it is bold and how many text elements follow it before the next - # title and how long since the prior title, whether `h1..h6` are used elsewhere in the - # document, etc. - if is_possible_title(text): - return Title - return Text