Skip to content

Commit

Permalink
fix: reduce false-positive Title elements on Chinese text (#3836)
Browse files Browse the repository at this point in the history
**Summary**
Improve element-type mapping for Chinese text. Fixes a bug where Chinese
text would produce large numbers of false-positive `Title` elements.

Fixes #3084

---------

Co-authored-by: scanny <[email protected]>
Co-authored-by: ryannikolaidis <[email protected]>
  • Loading branch information
3 people authored Dec 18, 2024
1 parent 9a9bf4c commit 9ece0b5
Show file tree
Hide file tree
Showing 15 changed files with 856 additions and 861 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.12-dev3
## 0.16.12-dev4

### Enhancements

Expand All @@ -10,6 +10,7 @@

- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes a bug where Chinese text would produce large numbers of false-positive `Title` elements.

## 0.16.11

Expand Down
10 changes: 4 additions & 6 deletions test_unstructured/metrics/test_element_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
"handbook-1p.docx",
{
("Header", None): 1,
("Title", 0): 1,
("Title", 1): 1,
("Title", 2): 1,
("UncategorizedText", 0): 6,
("ListItem", 3): 3,
("NarrativeText", 4): 7,
("NarrativeText", 0): 7,
("Footer", None): 1,
},
(0.43, 0.07, 0.65),
(0.78, 0.72, 0.81),
),
(
"handbook-1p.docx",
{
("Header", None): 1,
("Title", 0): 6,
("UncategorizedText", 0): 6,
("NarrativeText", 0): 7,
("PageBreak", None): 1,
("Footer", None): 1,
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ def expected_docx_elements():
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/partition/test_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down
16 changes: 8 additions & 8 deletions test_unstructured/partition/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
Expand Down Expand Up @@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str:
opts_args["file_path"] = example_doc_path("page-breaks.docx")
opts = DocxPartitionerOptions(**opts_args)
expected = [
# NOTE(scanny) - -- page 1 --
# -- page 1 --
NarrativeText(
"First page, tab here:\t"
"followed by line-break here:\n"
Expand All @@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str:
"and hard page-break here>>"
),
PageBreak(""),
# NOTE(scanny) - -- page 2 --
# -- page 2 --
NarrativeText(
"<<Text on second page. The font is big so it breaks onto third page--"
"------------------here-->> <<but break falls inside link so text stays"
" together."
),
PageBreak(""),
# NOTE(scanny) - -- page 3 --
# -- page 3 --
NarrativeText("Continuous section break here>>"),
NarrativeText("<<followed by text on same page"),
NarrativeText("Odd-page section break here>>"),
PageBreak(""),
# NOTE(scanny) - -- page 4 --
# -- page 4 --
PageBreak(""),
# NOTE(scanny) - -- page 5 --
# -- page 5 --
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
NarrativeText(
'Then text gets big again so a "natural" rendered page break happens again here>> '
),
PageBreak(""),
# NOTE(scanny) - -- page 6 --
Title("<<and then more text proceeds."),
# -- page 6 --
Text("<<and then more text proceeds."),
]

elements = _DocxPartitioner.iter_document_elements(opts)
Expand Down
5 changes: 2 additions & 3 deletions test_unstructured/partition/test_odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
Table,
TableChunk,
Text,
Title,
)
from unstructured.partition.docx import partition_docx
from unstructured.partition.odt import partition_odt
Expand All @@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
elements = partition_odt(example_doc_path("fake.odt"))

assert elements == [
Title("Lorem ipsum dolor sit amet."),
Text("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
Expand All @@ -63,7 +62,7 @@ def test_partition_odt_from_file():
elements = partition_odt(file=f)

assert elements == [
Title("Lorem ipsum dolor sit amet."),
Text("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1",
"metadata": {
Expand Down Expand Up @@ -51,7 +51,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION",
"metadata": {
Expand Down Expand Up @@ -79,7 +79,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": {
Expand All @@ -101,7 +101,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE",
"metadata": {
Expand Down Expand Up @@ -189,7 +189,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
Expand Down Expand Up @@ -255,7 +255,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1",
"metadata": {
Expand Down Expand Up @@ -51,7 +51,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION",
"metadata": {
Expand Down Expand Up @@ -79,7 +79,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": {
Expand All @@ -101,7 +101,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE",
"metadata": {
Expand Down Expand Up @@ -189,7 +189,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
Expand Down Expand Up @@ -255,7 +255,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.",
"metadata": {
Expand All @@ -17,6 +17,13 @@
"date_created": "1686809759.687",
"date_modified": "1686809743.0",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "[email protected]",
Expand All @@ -29,31 +36,24 @@
"pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"id": "09147371668407854156",
"displayName": "roman",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"deleted": false,
"pendingOwner": false
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.",
"metadata": {
Expand All @@ -17,6 +17,13 @@
"date_created": "1718722775.76",
"date_modified": "1718722788.018",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "[email protected]",
Expand All @@ -39,13 +46,6 @@
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
Expand Down
Loading

0 comments on commit 9ece0b5

Please sign in to comment.