Skip to content

Commit

Permalink
fix: reduce false-positive Title elements in Chinese text
Browse files Browse the repository at this point in the history
  • Loading branch information
scanny committed Dec 17, 2024
1 parent 9a9bf4c commit e5a3459
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 28 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.12-dev3
## 0.16.12-dev4

### Enhancements

Expand All @@ -10,6 +10,7 @@

- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes a bug where Chinese text would produce large numbers of false-positive `Title` elements.

## 0.16.11

Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down
16 changes: 8 additions & 8 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down Expand Up @@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str:
opts_args["file_path"] = example_doc_path("page-breaks.docx")
opts = DocxPartitionerOptions(**opts_args)
expected = [
# NOTE(scanny) - -- page 1 --
# -- page 1 --
NarrativeText(
"First page, tab here:\t"
"followed by line-break here:\n"
Expand All @@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str:
"and hard page-break here>>"
),
PageBreak(""),
# NOTE(scanny) - -- page 2 --
# -- page 2 --
NarrativeText(
"<<Text on second page. The font is big so it breaks onto third page--"
"------------------here-->> <<but break falls inside link so text stays"
" together."
),
PageBreak(""),
# NOTE(scanny) - -- page 3 --
# -- page 3 --
NarrativeText("Continuous section break here>>"),
NarrativeText("<<followed by text on same page"),
NarrativeText("Odd-page section break here>>"),
PageBreak(""),
# NOTE(scanny) - -- page 4 --
# -- page 4 --
PageBreak(""),
# NOTE(scanny) - -- page 5 --
# -- page 5 --
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
NarrativeText(
'Then text gets big again so a "natural" rendered page break happens again here>> '
),
PageBreak(""),
# NOTE(scanny) - -- page 6 --
Title("<<and then more text proceeds."),
# -- page 6 --
Text("<<and then more text proceeds."),
]

elements = _DocxPartitioner.iter_document_elements(opts)
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.12-dev3" # pragma: no cover
__version__ = "0.16.12-dev4" # pragma: no cover
31 changes: 14 additions & 17 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
from unstructured.partition.utils.constants import PartitionStrategy
Expand Down Expand Up @@ -412,15 +411,15 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
)
)

# NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and
# do not contribute to the document-element stream.
# -- blank paragraphs are commonly used for spacing between paragraphs and do not
# -- contribute to the document-element stream
if not text.strip():
return

metadata = self._paragraph_metadata(paragraph)

# NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a
# bullet-character if present.
# -- a list-item gets some special treatment, mutating the text to remove a
# -- bullet-character if present
if self._is_list_item(paragraph):
clean_text = clean_bullets(text).strip()
if clean_text:
Expand All @@ -431,19 +430,19 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme
)
return

# NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible
# -- determine element-type from an explicit Word paragraph-style if possible --
TextSubCls = self._style_based_element_type(paragraph)
if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return

# NOTE(scanny) - try to recognize the element type by parsing its text
# -- try to recognize the element type by parsing its text --
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return

# NOTE(scanny) - if all that fails we give it the default `Text` element-type
# -- if all that fails we give it the default `Text` element-type --
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)

def _convert_table_to_html(self, table: DocxTable) -> str:
Expand Down Expand Up @@ -576,20 +575,20 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP

page_break = paragraph.rendered_page_breaks[0]

# NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break
# -- preceding-fragment is None when first paragraph content is a page-break --
preceding_paragraph_fragment = page_break.preceding_paragraph_fragment
if preceding_paragraph_fragment:
yield preceding_paragraph_fragment

yield page_break

# NOTE(scanny) - following-fragment is None when page-break is last paragraph content.
# This is probably quite rare (Word moves these to the start of the next paragraph) but
# easier to check for it than prove it can't happen.
# -- following-fragment is None when page-break is last paragraph content. This is
# -- probably quite rare (Word moves these to the start of the next paragraph) but
# -- easier to check for it than prove it can't happen.
following_paragraph_fragment = page_break.following_paragraph_fragment
# NOTE(scanny) - the paragraph fragment following a page-break can itself contain
# another page-break. This would also be quite rare, but it can happen so we just
# recurse into the second fragment the same way we handled the original paragraph.
# -- the paragraph fragment following a page-break can itself contain another
# -- page-break; this would also be quite rare, but it can happen so we just recurse
# -- into the second fragment the same way we handled the original paragraph
if following_paragraph_fragment:
yield from iter_paragraph_items(following_paragraph_fragment)

Expand Down Expand Up @@ -901,8 +900,6 @@ def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Type[T
return EmailAddress
if is_possible_narrative_text(text):
return NarrativeText
if is_possible_title(text):
return Title

return None

Expand Down

0 comments on commit e5a3459

Please sign in to comment.