From c85f29e6cafdae605be37106a3144d023bcd44c3 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Thu, 17 Oct 2024 15:05:11 -0700 Subject: [PATCH] fix(xlsx): XLSX emits std minified .text_as_html (#3558) **Summary** Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html` HTML introduced by `partition_xlsx()`. Produce minified `.text_as_html` consistent with that formed by chunking. **Additional Context** - XLSX `.text_as_html` is minified (no extra whitespace or thead, tbody, tfoot elements). - `table.text` is clean-concatenated-text (CCT) of table. --------- Co-authored-by: scanny --- CHANGELOG.md | 5 +- example-docs/empty.xlsx | Bin 0 -> 8195 bytes test_unstructured/chunking/test_base.py | 2 +- test_unstructured/partition/test_auto.py | 13 +- test_unstructured/partition/test_constants.py | 186 ++++++++---------- test_unstructured/partition/test_xlsx.py | 100 ++++++---- .../gcs/nested-2/stanley-cups.xlsx.json | 12 +- .../tests-example.xls.json | 12 +- unstructured/__version__.py | 2 +- unstructured/partition/html/parser.py | 22 +-- unstructured/partition/xlsx.py | 27 ++- 11 files changed, 182 insertions(+), 199 deletions(-) create mode 100644 example-docs/empty.xlsx diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d180ec7a7..020619e584 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.1-dev1 +## 0.16.1-dev2 ### Enhancements @@ -8,6 +8,9 @@ * **Remove unsupported chipper model** * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process. +* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. 
Reduce HTML to minimum character count while preserving all text. +* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file. +* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text. ## 0.16.0 diff --git a/example-docs/empty.xlsx b/example-docs/empty.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..a605e9ad8df34e54a47a47774e770f9c58397e6e GIT binary patch literal 8195 zcmeHM1y@|z(rqBP1cwCI;2zv1xJ!`WH16&i971TE0D%C3;1VE6)3^qA_u%dj;Oor1 z`6iQ@_Y2;fyVgCc*S)*e>3jCBI#s125BCTcfB--O000z#JoU~YB^Uqz=@9^c13-e+ zlW?$i1=_nBYIr#UUG!Nz?QF?EJc6am2Ean!|9AWke}R&Oex-IcOsPwm?-J|GQ43X~ zNYD3!yRjSS$c$EHO(s~q=11>^a4suFbMUNrDouvwU4%HZzH(3E+C&tBi>FiLv|-9F z!xk- z)Kd0%)9Ybg=8l9Eg5{Uh(!9~RGn+)IUee-~6U{F2MH;=u8)|IH(qw-UJW`{uu2GVC zBSnAtNGXjEL4f4wjV&9y%$D*!!rrNHA(tv6-8YVp9#*|!(}c(SYbX7;G$PC#%cDH8 zWn$Ofz)!Hm5|2Jrz{yM=H90fZ2wVxb4z<#qa5PuN-}~!a#l%V8A;d#c^m;ff0#j=d zj%I;F9}0TL%e~HEtemO7zxO8`5N<@He#CCYi?OMa5GR>_6J3sa*Ql0_FiiUW^?}R$>mZ zynKp`lTRFmFO(Lf+oh?@8T>28|r^8nq? 
zT*v*HUL8Eh?ZrJuy}{9Bg^SRL^mmcOhR-@KL%#?!R3uLUNHCtZtiS8T!@=3c#KFPl zN3{AwGceGQ1-0_OJxb#Ht$%abq4y!FenRdj`H4LKBh0i!=*a?7y@GO4rmABWkAwUaxY+*jw z(ixs(*MwLWt5PMp&Lm2jqM#{$fO2+izi-jQncNq}Eq4M-@&2118X`Lpgb51xRCzp;g*RYKX*zylp0j&q^=9bxzu;go^#$o-`qX!i%nSD@S$)EczT`Sa(h}ue0?HNCkGUz`k&eTY^f9mK%Uddy5 zMRW8FhHK?BPO5?31j@%GRXrbs;VYS&qzCGj*Wa;&SKQDR!*GoU$X@raL?$WuCulmi z$z`?59Cceh%6o226TW;$tLwXp|Div+)bc7N*HviN?O}8hSR`g35zEYI?4_BF8)iKe zkT&dBwFWhLjlE!K&i6At6z#zgWKsa>5H==}$9S36Cy$@xgYaFi)~DWE z+jkgDqTn9*tuFcG@ZsWbq56g;vjZe zDjsCFwN$(#2hInb|nxL#)6-9Ti3f4 zum@2sp0hL@mATY~Hy8D!g10$e5Ft#gG&mW*2zLCYb<}D=Byd5l0COgT`*q9+|60t8 z^j4J{^^S@E@g49pN60&RVL=`KBL4@&I(WrI{iwcLMfSyB4&Am$y`O-Pmi!Z>h6oD$-?RuN( z@D#Q49J2dK_Nl?FHQXTn>2&nUSmV!C_iu5jG1}?l6Bz}`nar+5$U*6wwa9YJL_1gM zs0SB9_%Etn%{wG=yD|q_6DV3nHdtDljHj7`7H=uz*7HZSrVdaXBgoC@2^t)_$bTfe zzXb?aOQ0Q)^|w3w4_5E#3`7(0;6ey)M9^G3zCT}!qgh-YvQ1c^Hp@&TuBqKq)#Bs; zH4)*!((s+D(dFf*h&u=+j*7yegU{lSsXuR%jXWEerYhZ_erGGE>y92ReGe`uxXkkR zK6O1F1*dvTr=qrjk`0?xPJ}XIrc9~BAWfEhsfXMdOs7&AG=id7}W7+j0 zbx*@xOWNinlgaYUG2nM1g~fkqG3O0hqK)a6X`_0N-%U&svY3j`n6Fk!NCmnw2sTmi z@#CPN+Cxv03-I~Wq`bl&FOD8VxAd~8k|w8et-VS-B6LgbfP`Z9#5jvxfOOSo^pXx6 z+u!n<=5CXqjun4Ii&k<2K}W(q`+=ig!N^y8#N}kW)q0d1DMZ&N-w>|wRrmSsoi3zy zv;fAED>&(6?KJ;xlKop_bd{TloW_@jn*I2T!-o3R^&~}cp;Or!0Tp3nXbsC!sO4hx z!mrxHGtCPucBis&4fWcmdtIbBxRkygt3IySbNmZ;+eH=y}?;T3lRf@=!aC_ zq8IO}sVitx@@(kF9B>E&0%6stWmkJU(Ugags=h?i@hx}E(TpKsJA`4=8hz2AN0-yQ zDi;pA|40>8%f^wHq;C|dO{8|=Y0}ZFQ2U)RdxD3YHA~3n{%HF&93m8OwsqZZSXE!c zAa9RIqpp515oh>tcSo>j2)RGnIp*0Oe|Fr~0r`I2p8|O}=0C4koyTKs_B%P*P9RwF zJKt1}Bbl8eCP_V_h^#y+TQNPPjW!O#xIKmu$?RYg?*NHnZMhd7^!LCT=#lTVxmE19 z%@J#N!8Zv)Y-SqBj_ax4q?Ny@gr7H!aVPZPUoJ}TKRR;EQdmeht&ePs9FDg+V38{U zTyAn^zg$(DiLC&BLyy9a{K(rnst&1?h#@I%l!(}v65?1@sE9l}z$_1m7^9JJHlk5u z`7B`)(ccCne?5CLuxMA_UT8BRUVI1!S%dE=9FLH*R9hNpGDjN2T3ZMV$#551g^mq z(ZE^3+Uc5sNKOI%(@#mWjfCp1FVnGmKz> zpdy<|dgl}*M&RXfqFsfBgHy04K3yc%Qr`7b$ptxm-i-_m@60zCip$SoH^|nb$LOcv zx7a|95ZT>WBpr(sZ~OyaeMd%rhw5%4!$qO}=0{I_t)Gxz)Ljfn$*ZIY{m~~o{Z%`lY;mKB( 
za#Q67Cq?2xKYVASxQE}PrjZy#w!%5phG+HGFoTp=>CaP{<`b!y*PLiYo^i~^4WsEw z02f*^q7=?GJUMBzP4mLt81dc@m)KJ}cq3eFRxHe%IC0Yyui$f9Q%o7kVA>1s1kO)sFe)ajoaAw?>N5>aUe<0_MPR8s zW~vz1>R7f>wqSx4CV_$ETb-_kwD55b<$9BPB4^H>*vD8lQk2iW1kuu}4|fRIU)NY8 z)iF?U*<9mn5w=fxBGglj>k)iV|Kk2cHp~9X!#Mc~K~=b9BLAxdr9K4q_*9?74rW8< zfFyLT5#kSaj4uN*g>~1`MH$K7IbpgA^muZ#Qdxy95UOS=kp-=&mpR@SbuN*VP|pyz zfyrXyz;OJVJXm?!+N_L-Zz2PuL!5U8*n{3x;KgrTd#oDuSxz-Yjq)YFq2iBLasQBN zmeuw=vh3@KG9wwG@=SVb&Mu2!Y7yK8Sb2emY@*Kv!RnjE$4-Wwao2?HaDjF7;uDGS zDu#UfJ0)i4G8>k94D1Onxu%tIzi7BtHs<$VITUr8l-i@Jt5xSdb21^JDUWks*}1*o z6*?|^CeoAnRo6|FX;&`u>ZWhLX~G=QBS_HOF@U!~ZA(SmA%%+WW|XKWk=rPGK0zlI zX{m|t8S{fby)EK`+%$9IQ)39GMu=JtR&8;pJE2zpUB#;%H-^W{M)9hzFvfOoU|tyy zOSxN>cTce#tZgt4gknWUD;Rzys9CvLZGJW5>vV`ko_Sl-m#t&xZXIe~BSiNItdu*v zH_0DrP^x!`?!8bkLM(U$rb5SHgJ%a-$q5wJ5#43qku#ba zZP5FfH3KfK8*3l_F@Hb&esD4XeNqF>(FlJJzd!Ob7fT?})rIx9>9-i1k)Rk2Jz0hw zhTbFQG&0Zi@<&EiuZIi^;r3nZGWDD4b%0{>Y_a7Ye7Z4?@tu)T@h!X3EzI*gl7}oN zKs*&nPcz4OW9VR*73keE$KH!vv}Ry$*0XroaIK%gz7Z}L4$#et5&DAAfIZwGBQ`U-D%+iw)vK9!;$$n;$I! z)zq}t6gDKNt8E4@_gSZdo!CJKw*g2v%4wlIGa}0Z4AW$qe=R05qa;k`T z#+3#D)0%1u9PoYI_Czy- zXb^V2kd!rU;@L5O&Py7|5h31G(MK4-d2`Yoq<071N_QK(h)gXbH#6 z!BoZB!O?}))WI3}pJv0ql^duwVnC`kZEU!qONh6RX%6j1cRE6A+lYk~3Ad)F^<>I) zC-W$JOYSb^_4XMKjN|Gz*ZcWW@}d)%LZ}G!G#DhDBvk-WFF&Zl}skeLF5&FvRcLg;tSyNDS7f> z;?sgm!O7Ih-b&og%r*{(l^GxWSmw^I4}+XH44azH!il)^YHGJd=5{kw=?Ncj<>|ga z=GvQ?;LFuz;TaRQNjH*-1eUNNvbDJL{*T2^@rxGl5EBc6)$If8`__pNrIC^qKGAmO zdNsC#bm}g47xBom(rtE2_de-IlA=={RazRZerQro9-71J!vQHsa6C4xQZ$hx-8wt2 z_Pg8hC#ojvaS40CvEZVmT7`dGaP`e*{hxE$HUk}`w@~I0L3xS!GxLlc9seaBlyUzV z86ZW6xgTX_xEnIG1tLB>RkWZI6PcH16(%qscMGMDra4vA5hd|^Q_8g?<1_BFv1F_U z!XF9`Fftjv70g1)JDgRcV8O^W=eKf#homeb(ZPhi=q#Qk9aysS+tJF_3E~|c_@8?4 zvlvD=x|0&kboT=B>-Sv>kUzl?EcG@xx8%(W-YaivVO0J8s12qBTZkGpi*uR$9f;jM zo_Cv!0<0F@lV^sn(8-=*vsnoq=~sOl;<94I7PQ{Qw>nrQakFSkek>A^;<6fVyqf7) z3%A7OGRPy$r+m}yL(+hBpGs7)_3pgUx9xuzyAy@fv)wx7vq1m^J|&*PfK5+J?USHw!eaZElK?e zCPDiR{AYpcSLm|M6-n^6=2T0RWJpmoPNumXrMW^?$y#Jgfi! 
literal 0 HcmV?d00001 diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index 25951ff4e8..eeb5f3740f 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -1060,7 +1060,7 @@ def it_knows_the_concatenated_text_of_the_pre_chunk_to_help( class Describe_TableSplitter: """Unit-test suite for `unstructured.chunking.base._TableSplitter`.""" - def it_splits_an_HTML_table_on_even_rows_when_possible(self): + def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self): opts = ChunkingOptions(max_characters=(150)) html_table = HtmlTable.from_html_text( """ diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 8d6670fd34..3639f26803 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -794,19 +794,10 @@ def test_auto_partition_xls_from_filename(): example_doc_path("tests-example.xls"), include_header=False, skip_infer_table_types=[] ) - assert sum(isinstance(element, Table) for element in elements) == 2 assert len(elements) == 14 - - assert clean_extra_whitespace(elements[0].text)[:45] == ( - "MC What is 2+2? 4 correct 3 incorrect MA What" - ) - # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional - # whitespace is removed, so the expected text length is less than is the case when - # beautifulsoup4 is *not* installed. E.g. - # "\n\n\nMA\nWhat C datatypes are 8 bits" - # vs. '\n \n \n MA\n What C datatypes are 8 bits?... 
" - assert len(elements[0].text) == 550 + assert sum(isinstance(e, Table) for e in elements) == 2 assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE + assert len(elements[0].text) == 507 # ================================================================================================ diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py index aaf4cbe1b8..8b003cd9c5 100644 --- a/test_unstructured/partition/test_constants.py +++ b/test_unstructured/partition/test_constants.py @@ -28,30 +28,14 @@ """ -EXPECTED_TABLE_XLSX = """ - - - - - - - - - - - - - - - - - - - - - - -
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
""" +EXPECTED_TABLE_XLSX = ( + "" + "" + "" + "" + "" + "
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
" +) EXPECTED_TITLE = "Stanley Cups" @@ -139,86 +123,76 @@ """ EXPECTED_XLS_TABLE = ( - """ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MCWhat is 2+2?4correct3incorrect
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings """ - """influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
""" + "" + "" + "" + "" + "" + "" + "" + "" # ----- + "" + "" + "" + "" + "" + "" + "" # ----- + "" + "" + "" + "" # ----- + "" + "" + "" # ----- + "" + "" + "" + "" + "" + "" + "" # ----- + "" + "" + "" + "" + "" + "" + "" # ----- + "" + "" + "" + "" + "" + "" + "" + "" + "" + "
MCWhat is 2+2?4correct3incorrect" + "" + "" + "
MAWhat C datatypes are 8 bits? (assume i386)int" + "float" + "double" + "char
TFBagpipes are awesome.true" + "" + "" + "" + "" + "" + "
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and" + " design in the last 30 years?" + "" + "" + "" + "" + "" + "" + "
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition" + "" + "" + "
FIBThe student activities fee is95dollars for students enrolled in19units or more," + "" + "" + "
MATMatch the lower-case greek letter with its capital form.λΛαγΓφΦ
" ) diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py index 7a9d25baf4..2e951d321f 100644 --- a/test_unstructured/partition/test_xlsx.py +++ b/test_unstructured/partition/test_xlsx.py @@ -63,20 +63,6 @@ def test_partition_xlsx_from_filename(): assert elements[1].metadata.filename == "stanley-cups.xlsx" -def test_partition_xlsx_from_filename_no_subtables(): - """Partition to a single `Table` element per worksheet.""" - assert partition_xlsx("example-docs/stanley-cups.xlsx", find_subtable=False) == [ - Table( - "\n\n\nStanley Cups\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\n" - "Flyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n" - ), - Table( - "\n\n\nStanley Cups Since 67\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n" - "1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n" - ), - ] - - def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji(): f = tempfile.SpooledTemporaryFile() with open("example-docs/emoji.xlsx", "rb") as g: @@ -120,15 +106,12 @@ def test_partition_xlsx_infer_table_structure(infer_table_structure: bool): def test_partition_xlsx_from_filename_with_header(): elements = partition_xlsx("example-docs/stanley-cups.xlsx", include_header=True) - assert sum(isinstance(element, Table) for element in elements) == 2 + assert len(elements) == 2 - assert ( - clean_extra_whitespace(elements[0].text) - == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX - ) - text_as_html = elements[0].metadata.text_as_html - assert text_as_html is not None - assert "" in text_as_html + assert all(isinstance(e, Table) for e in elements) + e = elements[0] + assert e.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX + assert e.metadata.text_as_html is not None def test_partition_xlsx_from_file(): @@ -176,15 +159,11 @@ def test_partition_xlsx_from_file_with_header(): with open("example-docs/stanley-cups.xlsx", "rb") as f: elements = partition_xlsx(file=f, include_header=True) - 
assert sum(isinstance(element, Table) for element in elements) == 2 assert len(elements) == 2 - assert ( - clean_extra_whitespace(elements[0].text) - == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX - ) - text_as_html = elements[0].metadata.text_as_html - assert text_as_html is not None - assert "" in text_as_html + assert all(isinstance(e, Table) for e in elements) + e = elements[0] + assert e.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX + assert e.metadata.text_as_html is not None # -- .metadata.last_modified --------------------------------------------------------------------- @@ -249,38 +228,38 @@ def test_partition_xlsx_metadata_language_from_filename(): def test_partition_xlsx_subtables(): assert partition_xlsx("example-docs/xlsx-subtable-cases.xlsx") == [ - Table("\n\n\na\nb\n\n\n\n\nc\nd\n\ne\n\n\n"), + Table("a b c d e"), ListItem("f"), Title("a"), - Table("\n\n\nb\nc\n\n\nd\ne\n\n\n"), + Table("b c d e"), Title("a"), Title("b"), - Table("\n\n\nc\nd\n\n\ne\nf\n\n\n"), - Table("\n\n\na\nb\n\n\nc\nd\n\n\n"), + Table("c d e f"), + Table("a b c d"), ListItem("2. 
e"), - Table("\n\n\na\nb\n\n\nc\nd\n\n\n"), + Table("a b c d"), Title("e"), Title("f"), Title("a"), - Table("\n\n\nb\nc\n\n\nd\ne\n\n\n"), + Table("b c d e"), Title("f"), Title("a"), Title("b"), - Table("\n\n\nc\nd\n\n\ne\nf\n\n\n"), + Table("c d e f"), Title("g"), Title("a"), - Table("\n\n\nb\nc\n\n\nd\ne\n\n\n"), + Table("b c d e"), Title("f"), Title("g"), Title("a"), Title("b"), - Table("\n\n\nc\nd\n\n\ne\nf\n\n\n"), + Table("c d e f"), Title("g"), Title("h"), - Table("\n\n\na\nb\nc\n\n\n"), + Table("a b c"), Title("a"), - Table("\n\n\nb\nc\nd\n\n\n"), - Table("\n\n\na\nb\nc\n\n\n"), + Table("b c d"), + Table("a b c"), Title("d"), Title("e"), ] @@ -310,6 +289,43 @@ def test_partition_xlsx_with_more_than_1k_cells(): sys.setrecursionlimit(old_recursion_limit) +# ================================================================================================ +# OTHER ARGS +# ================================================================================================ + + +# -- `find_subtable` ----------------------------------------------------------------------------- + + +def test_partition_xlsx_with_find_subtables_False_emits_one_Table_element_per_worksheet(): + elements = partition_xlsx("example-docs/stanley-cups.xlsx", find_subtable=False) + assert elements == [ + Table( + "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" + ), + Table( + "Stanley Cups Since 67 Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple" + " Leafs TOR 0" + ), + ] + + +def test_partition_xlsx_with_find_subtables_False_and_infer_table_structure_False_works(): + elements = partition_xlsx( + "example-docs/stanley-cups.xlsx", find_subtable=False, infer_table_structure=False + ) + assert elements == [ + Table( + "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13" + ), + Table( + "Stanley Cups Since 67 Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple" + " Leafs TOR 0" + ), + ] + assert 
all(e.metadata.text_as_html is None for e in elements) + + # ------------------------------------------------------------------------------------------------ # UNIT TESTS # ------------------------------------------------------------------------------------------------ diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json index 4931718ff1..65338dc607 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json @@ -25,12 +25,12 @@ }, { "type": "Table", - "element_id": "8d70ea477d9db14ed01ff1d39a118a42", - "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n", + "element_id": "259d8a8f4c2a333beff68f08c5fbf43f", + "text": "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13", "metadata": { "page_name": "Stanley Cups", "page_number": 1, - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
", + "text_as_html": "
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
", "languages": [ "eng" ], @@ -74,12 +74,12 @@ }, { "type": "Table", - "element_id": "310cd42767ffd563f6639210df793c5b", - "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n", + "element_id": "00c1e0e7211ccb6dffedf7c9091d8798", + "text": "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 0", "metadata": { "page_name": "Stanley Cups Since 67", "page_number": 2, - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
", + "text_as_html": "
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR0
", "languages": [ "eng" ], diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json index cbcfdbb34b..4a0764d9a4 100644 --- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json +++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json @@ -1,13 +1,13 @@ [ { "type": "Table", - "element_id": "fef5ed2f1b95429f98f47f83b92ce387", - "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\n\u03bb\n\u039b\n\u03b1\n\u03b3\n\u0393\n\u03c6\n\u03a6\n\n\n", + "element_id": "8884b4ed11191538c7215455e7b639b0", + "text": "MC What is 2+2? 4 correct 3 incorrect MA What C datatypes are 8 bits? (assume i386) int float double char TF Bagpipes are awesome. true ESS How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years? ORD Rank the following in their order of operation. Parentheses Exponents Division Addition FIB The student activities fee is 95 dollars for students enrolled in 19 units or more, MAT Match the lower-case greek letter with its capital form. 
\u03bb \u039b \u03b1 \u03b3 \u0393 \u03c6 \u03a6", "metadata": { "filename": "tests-example.xls", "page_name": "Example Test", "page_number": 1, - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
MCWhat is 2+2?4correct3incorrect
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.\u03bb\u039b\u03b1\u03b3\u0393\u03c6\u03a6
", + "text_as_html": "
MCWhat is 2+2?4correct3incorrect
MAWhat C datatypes are 8 bits? (assume i386)intfloatdoublechar
TFBagpipes are awesome.true
ESSHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?
ORDRank the following in their order of operation.ParenthesesExponentsDivisionAddition
FIBThe student activities fee is95dollars for students enrolled in19units or more,
MATMatch the lower-case greek letter with its capital form.\u03bb\u039b\u03b1\u03b3\u0393\u03c6\u03a6
", "languages": [ "eng" ], @@ -74,13 +74,13 @@ }, { "type": "Table", - "element_id": "dd5cca33b529edcca38130a0a86c3d52", - "text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n", + "element_id": "88ba959ad0c3d1c0983292d5e5142f05", + "text": "Abbreviation Question Type MC Multiple Choice MA Multiple Answer TF True/False ESS Essay ORD Ordering MAT Matching FIB Fill in the Blank FIL File response NUM Numeric Response SR Short response OP Opinion FIB_PLUS Multiple Fill in the Blank JUMBLED_SENTENCE Jumbled Sentence QUIZ_BOWL Quiz Bowl", "metadata": { "filename": "tests-example.xls", "page_name": "Format Abbr.", "page_number": 2, - "text_as_html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
", + "text_as_html": "
AbbreviationQuestion Type
MCMultiple Choice
MAMultiple Answer
TFTrue/False
ESSEssay
ORDOrdering
MATMatching
FIBFill in the Blank
FILFile response
NUMNumeric Response
SRShort response
OPOpinion
FIB_PLUSMultiple Fill in the Blank
JUMBLED_SENTENCEJumbled Sentence
QUIZ_BOWLQuiz Bowl
", "languages": [ "eng" ], diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 16e6dacb99..a2a7ffee60 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.1-dev1" # pragma: no cover +__version__ = "0.16.1-dev2" # pragma: no cover diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index c54fbb8954..dca984b013 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -7,8 +7,8 @@ PRINCIPLES - _Elements are paragraphs._ Each paragraph in the HTML document should become a distinct element. - In particular, a paragraph should not be split into two elements and an element should not contain - more than one paragraph. + In particular, a paragraph should not be split into two elements and an element should not + contain more than one paragraph. - _An empty paragraph is not an Element._ A paragraph which contains no text or contains only whitespace does not give rise to an Element (is skipped). @@ -29,12 +29,12 @@ _phrasing content_ (aka. _inline content_). - As an example, a `

` element is a block item and a `` element is phrasing. - A block item starts a new paragraph and so represents an Element boundary. - - A phrasing item affects the appearance of a run of text within a paragraph, like - making it bold or making it into a link. - - Some elements can take either role, depending upon there ancestors and descendants. - - The final authority for whether a particular element is displayed as a block or as - inline "formatting" is the CSS. We do not attempt to interpret the CSS and assume - the default role for each element. + - A phrasing item affects the appearance of a run of text within a paragraph, like making it + bold or making it into a link. + - Some elements can take either role, depending upon their ancestors and descendants. + - The final authority for whether a particular element is displayed as a block or as inline + "formatting" is the CSS. We do not attempt to interpret the CSS and assume the default role + for each element. Other background @@ -44,9 +44,9 @@ there may be ambiguity. - The parser is primarily composed of `lxml` Custom Element Classes. The gist is you write a class - like `Anchor` and then tell the `lxml` parser that all `` elements should be instantiated using - the `Anchor` class. We also provide a default class for any elements that we haven't called out - explicitly. + like `Anchor` and then tell the `lxml` parser that all `` elements should be instantiated + using the `Anchor` class. We also provide a default class for any elements that we haven't + called out explicitly. - _Anatomy of an HTML element._ Some basic terms are important to know to understand the domain language of the parser code. 
Consider this example: diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index caeea90535..6edf9abc0a 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -9,11 +9,11 @@ import networkx as nx import numpy as np import pandas as pd -from lxml.html.soupparser import fromstring as soupparser_fromstring from typing_extensions import Self, TypeAlias from unstructured.chunking import add_chunking_strategy from unstructured.cleaners.core import clean_bullets +from unstructured.common.html_table import HtmlTable from unstructured.documents.elements import ( Element, ElementMetadata, @@ -58,6 +58,10 @@ def partition_xlsx( A string defining the target filename path. file A file-like object using "rb" mode --> open(filename, "rb"). + find_subtable + Detect "subtables" on each worksheet and partition each of those as a separate `Table` + element. When `False`, each worksheet is partitioned as a single `Table` element. A + subtable is a contiguous block of cells with more than two cells in each row. infer_table_structure If True, any Table elements that are extracted will also have a metadata field named "text_as_html" where the table's text content is rendered into an html string. @@ -80,16 +84,12 @@ def partition_xlsx( opts.sheets.items(), start=starting_page_number ): if not opts.find_subtable: - html_text = ( + html_table = HtmlTable.from_html_text( sheet.to_html(index=False, header=opts.include_header, na_rep="") - if opts.infer_table_structure - else None ) - # XXX: `html_text` can be `None`. What happens on this call in that case? 
- text = soupparser_fromstring(html_text).text_content() # type: ignore metadata = ElementMetadata( - text_as_html=html_text, + text_as_html=html_table.html if infer_table_structure else None, page_name=sheet_name, page_number=page_number, filename=opts.metadata_file_path, @@ -97,8 +97,8 @@ def partition_xlsx( ) metadata.detection_origin = DETECTION_ORIGIN - table = Table(text=text, metadata=metadata) - elements.append(table) + elements.append(Table(text=html_table.text, metadata=metadata)) + else: for component in _ConnectedComponents.from_worksheet_df(sheet): subtable_parser = _SubtableParser(component.subtable) @@ -112,14 +112,13 @@ def partition_xlsx( # -- emit core-table (if it exists) as a `Table` element -- core_table = subtable_parser.core_table if core_table is not None: - html_text = core_table.to_html( - index=False, header=opts.include_header, na_rep="" + html_table = HtmlTable.from_html_text( + core_table.to_html(index=False, header=opts.include_header, na_rep="") ) - text = soupparser_fromstring(html_text).text_content() - element = Table(text=text) + element = Table(text=html_table.text) element.metadata = _get_metadata(sheet_name, page_number, opts) element.metadata.text_as_html = ( - html_text if opts.infer_table_structure else None + html_table.html if opts.infer_table_structure else None ) elements.append(element)