Skip to content

Commit

Permalink
rfctr(csv): minify HTML and table text is cct
Browse files Browse the repository at this point in the history
Include TSV in the mix since they are essentially clones of each other.
  • Loading branch information
scanny committed Oct 18, 2024
1 parent c85f29e commit bfec1bf
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 145 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.1-dev2
## 0.16.1-dev3

### Enhancements

Expand All @@ -11,6 +11,7 @@
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Minify text_as_html from CSV.** Previously `.metadata.text_as_html` for CSV tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

## 0.16.0

Expand Down
126 changes: 29 additions & 97 deletions test_unstructured/partition/test_constants.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
EXPECTED_TABLE = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TABLE = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"</table>"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = (
"<table>"
"<tr><td>Year</td><td>Month</td><td>Revenue</td><td>Costs</td><td/></tr>"
"<tr><td>2022</td><td>1</td><td>123</td><td>-123</td><td/></tr>"
"<tr><td>2023</td><td>2</td><td>143,1</td><td>-814,38</td><td/></tr>"
"<tr><td>2024</td><td>3</td><td>215,32</td><td>-11,08</td><td/></tr>"
"</table>"
)

EXPECTED_TABLE_WITH_EMOJI = (
"<table>"
"<tr><td>Stanley Cups</td><td/><td/></tr>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"<tr><td>👨\\U+1F3FB🔧</td><td>TOR</td><td>15</td></tr>"
"</table>"
)

EXPECTED_TABLE_XLSX = (
"<table>"
Expand Down Expand Up @@ -54,74 +54,6 @@
"Year Month Revenue Costs 2022 1 123 -123 2023 2 143,1 -814,38 2024 3 215,32 -11,08"
)

EXPECTED_TABLE_SEMICOLON_DELIMITER = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Year</td>
<td>Month</td>
<td>Revenue</td>
<td>Costs</td>
<td></td>
</tr>
<tr>
<td>2022</td>
<td>1</td>
<td>123</td>
<td>-123</td>
<td></td>
</tr>
<tr>
<td>2023</td>
<td>2</td>
<td>143,1</td>
<td>-814,38</td>
<td></td>
</tr>
<tr>
<td>2024</td>
<td>3</td>
<td>215,32</td>
<td>-11,08</td>
<td></td>
</tr>
</tbody>
</table>"""

EXPECTED_TABLE_WITH_EMOJI = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Stanley Cups</td>
<td></td>
<td></td>
</tr>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
<tr>
<td>👨\\U+1F3FB🔧</td>
<td>TOR</td>
<td>15</td>
</tr>
</tbody>
</table>"""

EXPECTED_XLS_TABLE = (
"<table><tr>"
"<td>MC</td>"
Expand Down
5 changes: 1 addition & 4 deletions test_unstructured/partition/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,11 +200,8 @@ def test_partition_csv_header():
)

table = elements[0]
assert clean_extra_whitespace(table.text) == (
"Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<thead>" in table.metadata.text_as_html


# ================================================================================================
Expand Down
43 changes: 19 additions & 24 deletions test_unstructured/partition/test_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
)
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.cleaners.core import clean_extra_whitespace
from unstructured.documents.elements import Table
from unstructured.partition.tsv import partition_tsv

Expand All @@ -31,21 +30,20 @@
def test_partition_tsv_from_filename(filename: str, expected_text: str, expected_table: str):
elements = partition_tsv(example_doc_path(filename), include_header=False)

assert clean_extra_whitespace(elements[0].text) == expected_text
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename == filename
table = elements[0]
assert table.text == expected_text
assert table.metadata.text_as_html == expected_table
assert table.metadata.filetype == EXPECTED_FILETYPE
assert all(e.metadata.filename == filename for e in elements)


def test_partition_tsv_from_filename_with_metadata_filename():
elements = partition_tsv(
example_doc_path("stanley-cups.tsv"), metadata_filename="test", include_header=False
)

assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
for element in elements:
assert element.metadata.filename == "test"
assert elements[0].text == EXPECTED_TEXT
assert all(e.metadata.filename == "test" for e in elements)


@pytest.mark.parametrize(
Expand All @@ -59,21 +57,20 @@ def test_partition_tsv_from_file(filename: str, expected_text: str, expected_tab
with open(example_doc_path(filename), "rb") as f:
elements = partition_tsv(file=f, include_header=False)

assert clean_extra_whitespace(elements[0].text) == expected_text
assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == expected_table
assert elements[0].metadata.filetype == EXPECTED_FILETYPE
for element in elements:
assert element.metadata.filename is None
table = elements[0]
assert isinstance(table, Table)
assert table.text == expected_text
assert table.metadata.text_as_html == expected_table
assert table.metadata.filetype == EXPECTED_FILETYPE
assert all(e.metadata.filename is None for e in elements)


def test_partition_tsv_from_file_with_metadata_filename():
with open(example_doc_path("stanley-cups.tsv"), "rb") as f:
elements = partition_tsv(file=f, metadata_filename="test", include_header=False)

assert clean_extra_whitespace(elements[0].text) == EXPECTED_TEXT
for element in elements:
assert element.metadata.filename == "test"
assert elements[0].text == EXPECTED_TEXT
assert all(element.metadata.filename == "test" for element in elements)


# -- .metadata.last_modified ---------------------------------------------------------------------
Expand Down Expand Up @@ -142,12 +139,10 @@ def test_partition_tsv_header():
example_doc_path("stanley-cups.tsv"), strategy="fast", include_header=True
)

e = elements[0]
assert (
clean_extra_whitespace(e.text) == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
)
assert e.metadata.text_as_html is not None
assert "<thead>" in e.metadata.text_as_html
table = elements[0]
assert table.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
assert table.metadata.text_as_html is not None
assert "<table>" in table.metadata.text_as_html


def test_partition_tsv_supports_chunking_strategy_while_partitioning():
Expand Down

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.16.1-dev2" # pragma: no cover
__version__ = "0.16.1-dev3" # pragma: no cover
12 changes: 6 additions & 6 deletions unstructured/partition/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from typing import IO, Any, Iterator

import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

from unstructured.chunking import add_chunking_strategy
from unstructured.common.html_table import HtmlTable
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
Expand Down Expand Up @@ -46,7 +46,6 @@ def partition_csv(
Whether True or False, the "text" field is always present in any Table element
and is the text content of the table (no structure).
"""

ctx = _CsvPartitioningContext.load(
file_path=filename,
file=file,
Expand All @@ -58,17 +57,18 @@ def partition_csv(
with ctx.open() as file:
dataframe = pd.read_csv(file, header=ctx.header, sep=ctx.delimiter, encoding=encoding)

html_text = dataframe.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
html_table = HtmlTable.from_html_text(
dataframe.to_html(index=False, header=include_header, na_rep="")
)

metadata = ElementMetadata(
filename=filename,
last_modified=ctx.last_modified,
text_as_html=html_text if infer_table_structure else None,
text_as_html=html_table.html if infer_table_structure else None,
)

# -- a CSV file becomes a single `Table` element --
return [Table(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]
return [Table(text=html_table.text, metadata=metadata, detection_origin=DETECTION_ORIGIN)]


class _CsvPartitioningContext:
Expand Down
15 changes: 8 additions & 7 deletions unstructured/partition/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from typing import IO, Any, Optional

import pandas as pd
from lxml.html.soupparser import fromstring as soupparser_fromstring

from unstructured.chunking import add_chunking_strategy
from unstructured.common.html_table import HtmlTable
from unstructured.documents.elements import Element, ElementMetadata, Table
from unstructured.file_utils.model import FileType
from unstructured.partition.common.common import (
Expand Down Expand Up @@ -42,22 +42,23 @@ def partition_tsv(
header = 0 if include_header else None

if filename:
table = pd.read_csv(filename, sep="\t", header=header)
dataframe = pd.read_csv(filename, sep="\t", header=header)
else:
assert file is not None
# -- Note(scanny): `SpooledTemporaryFile` on Python<3.11 does not implement `.readable()`
# -- which triggers an exception on `pd.DataFrame.read_csv()` call.
f = spooled_to_bytes_io_if_needed(file)
table = pd.read_csv(f, sep="\t", header=header)
dataframe = pd.read_csv(f, sep="\t", header=header)

html_text = table.to_html(index=False, header=include_header, na_rep="")
text = soupparser_fromstring(html_text).text_content()
html_table = HtmlTable.from_html_text(
dataframe.to_html(index=False, header=include_header, na_rep="")
)

metadata = ElementMetadata(
text_as_html=html_text,
filename=filename,
last_modified=get_last_modified_date(filename) if filename else None,
text_as_html=html_table.html,
)
metadata.detection_origin = DETECTION_ORIGIN

return [Table(text=text, metadata=metadata)]
return [Table(text=html_table.text, metadata=metadata)]

0 comments on commit bfec1bf

Please sign in to comment.