From e5a3459fe199330ee19218c29a81a40cfec9dbc7 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 17 Dec 2024 09:58:33 -0800 Subject: [PATCH] fix: improve false-positive Title in Chinese text --- CHANGELOG.md | 3 ++- test_unstructured/partition/test_doc.py | 2 +- test_unstructured/partition/test_docx.py | 16 ++++++------ unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 31 +++++++++++------------- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d13d859802..aa832741fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev3 +## 0.16.12-dev4 ### Enhancements @@ -10,6 +10,7 @@ - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. +- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. ## 0.16.11 diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index 7c8c4d3ef5..e2698a3f71 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -275,7 +275,7 @@ def expected_elements() -> list[Element]: Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 1330b4a79a..34a27cfde3 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -627,7 +627,7 @@ def expected_elements() -> list[Text]: Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), @@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str: opts_args["file_path"] = example_doc_path("page-breaks.docx") opts = DocxPartitionerOptions(**opts_args) expected = [ - # NOTE(scanny) - -- page 1 -- + # -- page 1 -- NarrativeText( "First page, tab here:\t" "followed by line-break here:\n" @@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str: "and hard page-break here>>" ), PageBreak(""), - # NOTE(scanny) - -- page 2 -- + # -- page 2 -- NarrativeText( "<> <>"), NarrativeText("<>"), PageBreak(""), - # NOTE(scanny) - -- page 4 -- + # -- page 4 -- PageBreak(""), - # NOTE(scanny) - -- page 5 -- + # -- page 5 -- NarrativeText("<> ' ), PageBreak(""), - # NOTE(scanny) - -- page 6 -- - Title("< Iterator[Eleme ) ) - # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and - # do not contribute to the document-element stream. + # -- blank paragraphs are commonly used for spacing between paragraphs and do not + # -- contribute to the document-element stream if not text.strip(): return metadata = self._paragraph_metadata(paragraph) - # NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a - # bullet-character if present. + # -- a list-item gets some special treatment, mutating the text to remove a + # -- bullet-character if present if self._is_list_item(paragraph): clean_text = clean_bullets(text).strip() if clean_text: @@ -431,19 +430,19 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme ) return - # NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible + # -- determine element-type from an explicit Word paragraph-style if possible -- TextSubCls = self._style_based_element_type(paragraph) if TextSubCls: yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) return - # NOTE(scanny) - try to recognize the element type by parsing its text + # -- try to recognize the element type by parsing its text -- TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) if TextSubCls: yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) return - # NOTE(scanny) - if all that fails we give it the default `Text` element-type + # -- if all that fails we give it the default `Text` element-type -- yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) def _convert_table_to_html(self, table: DocxTable) -> str: @@ -576,20 +575,20 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP page_break = paragraph.rendered_page_breaks[0] - # NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break + # -- preceding-fragment is None when first paragraph content is a page-break -- preceding_paragraph_fragment = page_break.preceding_paragraph_fragment if preceding_paragraph_fragment: yield preceding_paragraph_fragment yield page_break - # NOTE(scanny) - following-fragment is None when page-break is last paragraph content. - # This is probably quite rare (Word moves these to the start of the next paragraph) but - # easier to check for it than prove it can't happen. + # -- following-fragment is None when page-break is last paragraph content. This is + # -- probably quite rare (Word moves these to the start of the next paragraph) but + # -- easier to check for it than prove it can't happen. following_paragraph_fragment = page_break.following_paragraph_fragment - # NOTE(scanny) - the paragraph fragment following a page-break can itself contain - # another page-break. This would also be quite rare, but it can happen so we just - # recurse into the second fragment the same way we handled the original paragraph. + # -- the paragraph fragment following a page-break can itself contain another + # -- page-break; this would also be quite rare, but it can happen so we just recurse + # -- into the second fragment the same way we handled the original paragraph if following_paragraph_fragment: yield from iter_paragraph_items(following_paragraph_fragment) @@ -901,8 +900,6 @@ def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Type[T return EmailAddress if is_possible_narrative_text(text): return NarrativeText - if is_possible_title(text): - return Title return None