fix: reduce false-positive Title elements in Chinese text

Unstructured-IO · Dec 17, 2024 · e5a3459
1 parent 9a9bf4c
commit e5a3459
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 28 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev3
+## 0.16.12-dev4
 
 ### Enhancements
 
@@ -10,6 +10,7 @@
 
 - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
 - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
+- **Improve element-type mapping for Chinese text.** Fixes a bug where Chinese text would produce large numbers of false-positive `Title` elements.
 
 ## 0.16.11
 

diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py
@@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
         Title("These are a few of my favorite things:"),
         ListItem("Parrots"),
         ListItem("Hockey"),
-        Title("Analysis"),
+        Text("Analysis"),
         NarrativeText("This is my first thought. This is my second thought."),
         NarrativeText("This is my third thought."),
         Text("2023"),

diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py
@@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
         Title("These are a few of my favorite things:"),
         ListItem("Parrots"),
         ListItem("Hockey"),
-        Title("Analysis"),
+        Text("Analysis"),
         NarrativeText("This is my first thought. This is my second thought."),
         NarrativeText("This is my third thought."),
         Text("2023"),
@@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str:
         opts_args["file_path"] = example_doc_path("page-breaks.docx")
         opts = DocxPartitionerOptions(**opts_args)
         expected = [
-            # NOTE(scanny) - -- page 1 --
+            # -- page 1 --
             NarrativeText(
                 "First page, tab here:\t"
                 "followed by line-break here:\n"
@@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str:
                 "and hard page-break here>>"
             ),
             PageBreak(""),
-            # NOTE(scanny) - -- page 2 --
+            # -- page 2 --
             NarrativeText(
                 "<<Text on second page. The font is big so it breaks onto third page--"
                 "------------------here-->> <<but break falls inside link so text stays"
                 " together."
             ),
             PageBreak(""),
-            # NOTE(scanny) - -- page 3 --
+            # -- page 3 --
             NarrativeText("Continuous section break here>>"),
             NarrativeText("<<followed by text on same page"),
             NarrativeText("Odd-page section break here>>"),
             PageBreak(""),
-            # NOTE(scanny) - -- page 4 --
+            # -- page 4 --
             PageBreak(""),
-            # NOTE(scanny) - -- page 5 --
+            # -- page 5 --
             NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
             NarrativeText(
                 'Then text gets big again so a "natural" rendered page break happens again here>> '
             ),
             PageBreak(""),
-            # NOTE(scanny) - -- page 6 --
-            Title("<<and then more text proceeds."),
+            # -- page 6 --
+            Text("<<and then more text proceeds."),
         ]
 
         elements = _DocxPartitioner.iter_document_elements(opts)

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.12-dev3"  # pragma: no cover
+__version__ = "0.16.12-dev4"  # pragma: no cover
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -48,7 +48,6 @@
     is_bulleted_text,
     is_email_address,
     is_possible_narrative_text,
-    is_possible_title,
     is_us_city_state_zip,
 )
 from unstructured.partition.utils.constants import PartitionStrategy
@@ -412,15 +411,15 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
             )
         )
 
-        # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and
-        # do not contribute to the document-element stream.
+        # -- blank paragraphs are commonly used for spacing between paragraphs and do not
+        # -- contribute to the document-element stream
         if not text.strip():
             return
 
         metadata = self._paragraph_metadata(paragraph)
 
-        # NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a
-        # bullet-character if present.
+        # -- a list-item gets some special treatment, mutating the text to remove a
+        # -- bullet-character if present
         if self._is_list_item(paragraph):
             clean_text = clean_bullets(text).strip()
             if clean_text:
@@ -431,19 +430,19 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
                 )
             return
 
-        # NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible
+        # -- determine element-type from an explicit Word paragraph-style if possible --
         TextSubCls = self._style_based_element_type(paragraph)
         if TextSubCls:
             yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
             return
 
-        # NOTE(scanny) - try to recognize the element type by parsing its text
+        # -- try to recognize the element type by parsing its text --
         TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
         if TextSubCls:
             yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
             return
 
-        # NOTE(scanny) - if all that fails we give it the default `Text` element-type
+        # -- if all that fails we give it the default `Text` element-type --
         yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
 
     def _convert_table_to_html(self, table: DocxTable) -> str:
@@ -576,20 +575,20 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP
 
             page_break = paragraph.rendered_page_breaks[0]
 
-            # NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break
+            # -- preceding-fragment is None when first paragraph content is a page-break --
             preceding_paragraph_fragment = page_break.preceding_paragraph_fragment
             if preceding_paragraph_fragment:
                 yield preceding_paragraph_fragment
 
             yield page_break
 
-            # NOTE(scanny) - following-fragment is None when page-break is last paragraph content.
-            # This is probably quite rare (Word moves these to the start of the next paragraph) but
-            # easier to check for it than prove it can't happen.
+            # -- following-fragment is None when page-break is last paragraph content. This is
+            # -- probably quite rare (Word moves these to the start of the next paragraph) but
+            # -- easier to check for it than prove it can't happen.
             following_paragraph_fragment = page_break.following_paragraph_fragment
-            # NOTE(scanny) - the paragraph fragment following a page-break can itself contain
-            # another page-break. This would also be quite rare, but it can happen so we just
-            # recurse into the second fragment the same way we handled the original paragraph.
+            # -- the paragraph fragment following a page-break can itself contain another
+            # -- page-break; this would also be quite rare, but it can happen so we just recurse
+            # -- into the second fragment the same way we handled the original paragraph
             if following_paragraph_fragment:
                 yield from iter_paragraph_items(following_paragraph_fragment)
 
@@ -901,8 +900,6 @@ def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Type[T
             return EmailAddress
         if is_possible_narrative_text(text):
             return NarrativeText
-        if is_possible_title(text):
-            return Title
 
         return None
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.16.12-dev3" # pragma: no cover
		+__version__ = "0.16.12-dev4" # pragma: no cover