Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into chore/bump-inference
Browse files Browse the repository at this point in the history
  • Loading branch information
badGarnet committed Oct 18, 2024
2 parents 6e28bc0 + c85f29e commit 8c54153
Show file tree
Hide file tree
Showing 11 changed files with 182 additions and 199 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.16.1-dev1
## 0.16.1-dev2

### Enhancements

Expand All @@ -10,6 +10,9 @@

* **Remove unsupported chipper model**
* **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.

## 0.16.0

Expand Down
Binary file added example-docs/empty.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion test_unstructured/chunking/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,7 +1060,7 @@ def it_knows_the_concatenated_text_of_the_pre_chunk_to_help(
class Describe_TableSplitter:
"""Unit-test suite for `unstructured.chunking.base._TableSplitter`."""

def it_splits_an_HTML_table_on_even_rows_when_possible(self):
def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self):
opts = ChunkingOptions(max_characters=(150))
html_table = HtmlTable.from_html_text(
"""
Expand Down
13 changes: 2 additions & 11 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,19 +794,10 @@ def test_auto_partition_xls_from_filename():
example_doc_path("tests-example.xls"), include_header=False, skip_infer_table_types=[]
)

assert sum(isinstance(element, Table) for element in elements) == 2
assert len(elements) == 14

assert clean_extra_whitespace(elements[0].text)[:45] == (
"MC What is 2+2? 4 correct 3 incorrect MA What"
)
# NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
# whitespace is removed, so the expected text length is less than is the case when
# beautifulsoup4 is *not* installed. E.g.
# "\n\n\nMA\nWhat C datatypes are 8 bits"
# vs. '\n \n \n MA\n What C datatypes are 8 bits?... "
assert len(elements[0].text) == 550
assert sum(isinstance(e, Table) for e in elements) == 2
assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
assert len(elements[0].text) == 507


# ================================================================================================
Expand Down
186 changes: 80 additions & 106 deletions test_unstructured/partition/test_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,14 @@
</tbody>
</table>"""

EXPECTED_TABLE_XLSX = """<table border="1" class="dataframe">
<tbody>
<tr>
<td>Team</td>
<td>Location</td>
<td>Stanley Cups</td>
</tr>
<tr>
<td>Blues</td>
<td>STL</td>
<td>1</td>
</tr>
<tr>
<td>Flyers</td>
<td>PHI</td>
<td>2</td>
</tr>
<tr>
<td>Maple Leafs</td>
<td>TOR</td>
<td>13</td>
</tr>
</tbody>
</table>"""
EXPECTED_TABLE_XLSX = (
"<table>"
"<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
"<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
"<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
"<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
"</table>"
)

EXPECTED_TITLE = "Stanley Cups"

Expand Down Expand Up @@ -139,86 +123,76 @@
</table>"""

EXPECTED_XLS_TABLE = (
"""<table border="1" class="dataframe">
<tbody>
<tr>
<td>MC</td>
<td>What is 2+2?</td>
<td>4</td>
<td>correct</td>
<td>3</td>
<td>incorrect</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MA</td>
<td>What C datatypes are 8 bits? (assume i386)</td>
<td>int</td>
<td></td>
<td>float</td>
<td></td>
<td>double</td>
<td></td>
<td>char</td>
</tr>
<tr>
<td>TF</td>
<td>Bagpipes are awesome.</td>
<td>true</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>ESS</td>
<td>How have the original Henry Hornbostel buildings """
"""influenced campus architecture and design in the last 30 years?</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>ORD</td>
<td>Rank the following in their order of operation.</td>
<td>Parentheses</td>
<td>Exponents</td>
<td>Division</td>
<td>Addition</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>FIB</td>
<td>The student activities fee is</td>
<td>95</td>
<td>dollars for students enrolled in</td>
<td>19</td>
<td>units or more,</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MAT</td>
<td>Match the lower-case greek letter with its capital form.</td>
<td>λ</td>
<td>Λ</td>
<td>α</td>
<td>γ</td>
<td>Γ</td>
<td>φ</td>
<td>Φ</td>
</tr>
</tbody>
</table>"""
"<table><tr>"
"<td>MC</td>"
"<td>What is 2+2?</td>"
"<td>4</td>"
"<td>correct</td>"
"<td>3</td>"
"<td>incorrect</td>"
"<td/>"
"<td/>"
"<td/>"
"</tr><tr>" # -----
"<td>MA</td>"
"<td>What C datatypes are 8 bits? (assume i386)</td>"
"<td>int</td>"
"<td/>"
"<td>float</td>"
"<td/>"
"<td>double</td>"
"<td/>"
"<td>char</td>"
"</tr><tr>" # -----
"<td>TF</td>"
"<td>Bagpipes are awesome.</td>"
"<td>true</td>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"</tr><tr>" # -----
"<td>ESS</td>"
"<td>How have the original Henry Hornbostel buildings influenced campus architecture and"
" design in the last 30 years?</td>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"<td/>"
"</tr><tr>" # -----
"<td>ORD</td>"
"<td>Rank the following in their order of operation.</td>"
"<td>Parentheses</td>"
"<td>Exponents</td>"
"<td>Division</td>"
"<td>Addition</td>"
"<td/>"
"<td/>"
"<td/>"
"</tr><tr>" # -----
"<td>FIB</td>"
"<td>The student activities fee is</td>"
"<td>95</td>"
"<td>dollars for students enrolled in</td>"
"<td>19</td>"
"<td>units or more,</td>"
"<td/>"
"<td/>"
"<td/>"
"</tr><tr>" # -----
"<td>MAT</td>"
"<td>Match the lower-case greek letter with its capital form.</td>"
"<td>λ</td>"
"<td>Λ</td>"
"<td>α</td>"
"<td>γ</td>"
"<td>Γ</td>"
"<td>φ</td>"
"<td>Φ</td>"
"</tr></table>"
)
Loading

0 comments on commit 8c54153

Please sign in to comment.