From c85f29e6cafdae605be37106a3144d023bcd44c3 Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Thu, 17 Oct 2024 15:05:11 -0700
Subject: [PATCH] fix(xlsx): XLSX emits std minified .text_as_html (#3558)

**Summary**
Eliminate historical "idiosyncrasies" of `table.metadata.text_as_html`
HTML introduced by `partition_xlsx()`. Produce minified `.text_as_html`
consistent with that formed by chunking.

**Additional Context**
- XLSX `.text_as_html` is minified (no extra whitespace or thead, tbody,
tfoot elements).
- `table.text` is clean-concatenated-text (CCT) of table.

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
---
 CHANGELOG.md                                  |   5 +-
 example-docs/empty.xlsx                       | Bin 0 -> 8195 bytes
 test_unstructured/chunking/test_base.py       |   2 +-
 test_unstructured/partition/test_auto.py      |  13 +-
 test_unstructured/partition/test_constants.py | 186 ++++++++----------
 test_unstructured/partition/test_xlsx.py      | 100 ++++++----
 .../gcs/nested-2/stanley-cups.xlsx.json       |  12 +-
 .../tests-example.xls.json                    |  12 +-
 unstructured/__version__.py                   |   2 +-
 unstructured/partition/html/parser.py         |  22 +--
 unstructured/partition/xlsx.py                |  27 ++-
 11 files changed, 182 insertions(+), 199 deletions(-)
 create mode 100644 example-docs/empty.xlsx

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d180ec7a7..020619e584 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.1-dev1
+## 0.16.1-dev2
 
 ### Enhancements
 
@@ -8,6 +8,9 @@
 
 * **Remove unsupported chipper model**
 * **Rewrite of `partition.email` module and tests.** Use modern Python stdlib `email` module interface to parse email messages and attachments. This change shortens and simplifies the code, and makes it more robust and maintainable. Several historical problems were remedied in the process.
+* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
+* **Fall back to filename extension-based file-type detection for unidentified OLE files.** Resolves a problem where a DOC file that could not be detected as such by `filetype` was incorrectly identified as a MSG file.
+* **Minify text_as_html from XLSX.** Previously `.metadata.text_as_html` for XLSX tables was "bloated" with whitespace and noise elements introduced by `pandas` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count while preserving all text.
 
 ## 0.16.0
 
diff --git a/example-docs/empty.xlsx b/example-docs/empty.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..a605e9ad8df34e54a47a47774e770f9c58397e6e
GIT binary patch
literal 8195
zcmeHM1y@|z(rqBP1cwCI;2zv1xJ!`WH16&i971TE0D%C3;1VE6)3^qA_u%dj;Oor1
z`6iQ@_Y2;fyVgCc*S)*e>3jCBI#s125BCTcfB--O000z#JoU~YB^Uqz=@9^c13-e+
zlW?$i1=_nBYIr#UUG!Nz?QF?EJc6am2Ean!|9AWke}R&Oex-IcOsPwm?-J|GQ43X~
zNYD3!yRjSS$c$EHO(s~q=11>^a4suFbMUNrDouvwU4%HZzH(3E+C&tBi>FiLv|-9F
z<Qi*D-bdG+#N(yrsgHUh;?2QU&757BS4|_d1Xq!0mB`ZzA_uu4YEfRo5NC1;>!xk-
z)Kd0%)9Ybg=8l9Eg5{Uh(!9~RGn+)IUee-~6U{F2MH;=u8)|IH(qw-UJW`{uu2GVC
zBSnAtNGXjEL4f4wjV&9y%$D*!!rrNHA(tv6-8YVp9#*|!(}c(SYbX7;G$PC#%cDH8
zWn$Ofz)!Hm5|2Jrz{yM=H90fZ2wVxb4z<#qa5PuN-}~!a#l%V8A;d#c^m;ff0#j=d
zj%I;F9}0TL%e~HEtemO7zxO8`5N<@He#CCYi?OMa5GR>_6J3sa*Ql0_Fii<r+kOmX
z(8B{9K;<totkGnnI)gq@fT|1?DnmnOpsfol%a8lN^7|jw+)sbK1f;0k&W09tDDyqM
z=W=4<EvC4<hnP$wg@%8i?EK@J*j#F&1;{gEOpT`@aMA%_|L=Ws3xcsbJrozKTqV)i
zxB^tQ9;IO^H%@Md47AS4FPutNI<VX)&L=L?q~*Nm+`(}S#a{|@<a?K>UW^?}R$>mZ
zynKp`lTRFmFO(Lf+oh<pXmV8sGa;_NUm8|X$CJI2F!U~9D*59!x<~|{!ro{aUXP23
z#Y~xRk1fUd^;1oCOFpX#lPqU`N-sk*+m=(wcMzOAUuLC@9yLlX>?@8T>28|r^8nq?
zT*v*HUL8Eh?ZrJuy}{9Bg^SRL^mmcOhR-@KL%#?!R3uLUNHCtZtiS8T!@=3c#KFPl
zN3{AwGceGQ1-0_OJxb#Ht$%abq4y!FenRdj`H4L<iQ=`LcL0hwvX11}=Ag*{!<FC#
zQ5o(Tq%wpB7iv2Xk+p0k-1<u;iTI7|9v#EWvWX*2KS6M?340lLh)13=Wtjf;RmELi
zs|<gKKACnte)m)U_@l6%+T0PxZ*cWoR=&isd4zgsU8D-~UwTgoVjcvt1$Q-WbtUGf
zhO&$3sZaLpm$<<B9u9F-eD#5%$yPE1DoK&dkztSw8tIn)s{x^7P*$kc4Y89!v<*h5
z5J|J2prbT(ad%ZGuQL7<!Q1r{f`8)2lRT_rYB&G@4G{qN;bx)y_?a0cn)(j&448i9
z;}1s12S&sJrFaFikISFZUou#aUR0W}H@>KBh0i!=*a?7y@GO4rmABWkAwUaxY+*jw
z(ixs(*MwLWt5PMp&Lm2jqM#{$fO2+izi-jQncNq}Eq4M-@<oh{WeL(a&FBYF-C2YK
zSfOa>&2118X`Lpgb51xRCzp;g*RYKX*zylp0j&q^=9bxzu;<k%MzAQR99mW8N{xK;
zwU?Z&;>go^#$o-`qX!i%nSD@S$)EczT`Sa(h}ue0?HNCkGUz`k&eTY^f9mK%Uddy5
zMRW8FhHK?BPO5?31j@%GRXrbs;VYS&qzCGj*Wa;&SKQDR!*GoU$X@raL?$WuCulmi
z$z`?59Cceh%6o226TW;$tLwXp|Div+)bc7N*HviN?O}8hSR`g35zEYI?4_BF8)iKe
zkT&dBw<QIqB@T;C>FWhLjlE!K&i6At6z#zgWKsa>5H==}$9S36Cy$@xgYaFi)~DWE
z+jkgDqTn9*tuFcG@ZsWbq56g<S7q3LMX#xu+WjbcRSmP5vs##l=*!HVz5{2#T~uz*
zjO5XMELUuwi(t#u4(8Ij{Cbi8QrjVI!e6=uugsN=%yRDed^s-X)idDO6!1*svtk-5
zVORtSn7<hkL&{rlDhuJ!@;UOaX$T`;97XJ*+d{C6g%5zI?{5|=v)Hyydz}wq=no}h
z%553?X7w#)*O7!p&9iM8(K9C@gMN1IMwSMnT69xsidFGp*tqH~`JvQ#(}U^J*;QG<
zh(aydpt=XCKbEgU*7<ta;*n;L+`e%L8;k?S7#p?y7qVDGDc~+_s19c4H(}?>;vjZe
zDjsCFwN$(<cD?uA+rCLxI%xLNEPzwqch{${Ut6W;k2q(>#2hInb|nxL#)6-9Ti3f4
zum@2sp0hL@mATY~Hy8D!g10$e5Ft#gG&mW*2zLCYb<}D=Byd5l0COgT`*q9+|60t8
z^j4J{^^S@E@g49pN<T02%MD8RYbPG*d!E6eO5u}jIcFxDw-=co_efc)lzkhs5fWWp
zk*<O<pG|t+?3{=|<??rjdxWHYmJfBdY6t)T`tO2rarLqVy8Ljod75@Mb8MLR!qWkm
zHwS(0YfQ+s{iEY5TV2^Cd&MUGk%a~^)E0wX_qR&!2E_ceBmR*}n`16R3odG-@03tJ
zn|3*uauDl>60&RVL=`KBL4@&I(WrI{iwcLMfSyB4&Am$y`O-Pmi!Z>h6oD$-?RuN(
z@D#Q49J2dK_Nl?FHQXTn>2&nUSmV!C_iu5jG1}?l6Bz}`nar+5$U*6wwa9YJL_1gM
zs0SB9_%Etn%{wG=yD|q_6DV3nHdtDljHj7`7H=uz*7HZSrVdaXBg<?Is#07x&6F&?
zUcjzTQAi;puSL%aoXX`Pv)guqeR-8_N;9PL?UUfxg9;s5S9P`zi;mtmxkU7--@@$|
z0+eH9vyyc!Lx^P%GozcNReQrC^cb#Y9S=)Bt(%B^xpe{0hV%C3Tna~dl7@G-QLuDG
zYVIKiQYniGcgHXog<xEaH3yxc(AouVuiayT*<Hyxd$$1FIN!E)-=Wan&d234D#I6(
zU_SS)J}(|a|6W$(lI@?Rm0|z+Lt6?*U<Mjpftk4A&Gv(qp^?XPwNj?M#&Qm{%O0K;
zOK}QBmlh&!(tsFlBghkcw{sd{@;8=-Z-w)v4Gz%)E~YeF)``i78#daU&Jo)Jb=)@b
zUmL(|mK!W(sN-)*c3RJ(T)N(*u2kPeYBnVu)9;`zSxrfpUD=9w=70D!)XINpWZYyV
zRl4%z-mC6Ao8bbU6B}wuwUP4x$S8~Hr5H!OewkLMU`+d*VJ_k_`V=If`~;U}vJJ(P
z`fJX#L}Z+WVVLy0Ab*iuQJ=4Vq{{7avbYGHZiMg&^qKJ0+iuvnvbg0R6E^hKsHqNm
z8Q({~l^i)*_B`orbLxHcQo8gFN-UTh+x9{25g&utU;x6k!$0!5l>oC@2^t)_$bTfe
zzXb?aOQ0Q)^|w3w4_5E#3`7(0;6ey)M9^G3zCT}!qgh-YvQ1c^Hp@&TuBqKq)#Bs;
zH4)*!((s+D(dFf*h&u=+j*7yegU{lSsXuR%jXWEerYhZ_erGGE>y92ReGe`uxXkkR
zK6O1F1*dvTr=qrjk`0?xPJ}XIrc<fMk^^jb>9~BAWfEhsfXMdOs7&AG=id7}W7+j0
zbx*@xOWNinlgaYUG2nM1g~fkqG3O0hqK)a6X`_0N-%U&svY3j`n6Fk!NCmnw2sTmi
z@#CPN+Cxv03-I~Wq`bl&FOD8VxAd~8k|w8et-VS-B6LgbfP`Z9#5jvxfOOSo^pXx6
z+u!n<=5CXqjun4Ii&k<2K}W(q`+=ig!N^y8#N}kW)q0d1DMZ&N-w>|wRrmSsoi3zy
zv;fAED>&(6?KJ;xlKop_bd{TloW_@jn*I2T!-o3R^&~}cp;Or!0Tp3nXbsC!sO4hx
z!mrxHGtCPucBis&4fWcm<o69!>dtIbBxRkygt3IySbNmZ;+eH=y}?;T3lRf@=!aC_
zq8IO}sVitx@@(kF9B>E&0%6stWmkJU(Ugags=h?i@hx}E(TpKsJA`4=8hz2AN0-yQ
zDi;pA|40>8%f^wHq;C|dO{8|=Y0}ZFQ2U)RdxD3YHA~3n{%HF&93m8OwsqZZSXE!c
zAa9RIqpp515oh>tcSo>j2)RGnIp*0Oe|Fr~0r`I2p8|O}=0C4koyTKs_B%P*P9RwF
zJKt1}Bbl8eCP_V_h^#y+TQNPPjW!O#xIKmu$?RYg?*NHnZMhd7^!LCT=#lTVxmE19
z%@J#N!8Zv)Y-SqBj_ax4q?Ny@gr7H!aVPZPUoJ}TKRR;EQdmeht&ePs9FDg+V38{U
zTyAn^zg$(DiLC&BLyy9a{K(rnst&1?h#@I%l!(}v65?1@sE9l}z$_1m7^9JJHlk5u
z`7B`)(ccCne?5CLuxMA_UT8BR<ZDJ{3|$ACf5fMau6|1v)jc&nN%rR?&CzD-9L@o)
z(zIo))RNv&^rttbOt0m?@rR#36@#Fltzq1f>UVI1<ej%h>!S%d<AvO(V^suQW)(m^
z9saZ!6jJdXA+iL%aWevvLK4I(RqXS1$hB>E=9FLH*R9hNpGDjN2T3ZMV$#551g^mq
z(ZE^3+Uc5sNKOI%(@#mWjfCp1F<pc{b;2&%!^WKboAo0vDUFgy_T)+!dj>VnGmKz>
zpdy<|dgl}*M&RXfqFsfBgHy04K3yc%Qr`7b$ptxm-i-_m@60zCip$SoH^|nb$LOcv
zx7a|95ZT>WBpr(sZ~OyaeMd%rhw5%4!$qO}=0{I_t)Gxz)Ljf<s@en#8iygFpb;zt
zN8sJRlFTgDwLM6DH~o(LF&yp-Eig{PdTi%VWYx3p9(|-9FfIu2aeBwMiTv;AfSZy*
z)yS7*Og7$@S@SK9pc0o!FHIMNiK~a(BJ(i=;d0=hP1>n$*ZIY{m~~o{Z%`lY;mKB(
za#Q67Cq?2xKYVASxQE}PrjZy#w!%5phG+HGFoTp=>CaP{<`b!y*PLiYo^i~^4WsEw
z02f*^q7=?GJUMBzP4mLt81dc@m)<I!dd9_$DOuE)G^Bq*g$G@G(!Yw8NRj)vr7Lu}
zSUSGzzop@!lY&4uGe&5vU{0j!MyP6tA&zTc@!DV}O?$*udqia?B}ey#(*_Eg9Uh+N
z#>KJ}cq3eFRxHe%IC0Yyui$f9Q%o7kVA>1s1kO)sFe)ajoaAw?>N5>aUe<0_MPR8s
zW~vz1>R7f>wqSx4CV_$ETb-_kwD55b<$9BPB4^H>*vD8lQk2iW1kuu}4|fRIU)NY8
z)iF?U*<9mn5w=fxBGglj>k)iV|Kk2cHp~9X!#Mc~K~=b9BLAxdr9K4q_*9?74rW8<
zfFyLT5#kSaj4uN*g>~1`MH$K7IbpgA^muZ#Qdxy95UOS=kp-=&mpR@SbuN*VP|pyz
zfyrXyz;OJVJXm?!+N_L-Zz2PuL!5U8*n{3x;KgrTd#oDuSxz-Yjq)YFq2iBLasQBN
zmeuw=vh3@KG9wwG@=SVb&Mu2!Y7yK8Sb2emY@*Kv!RnjE$4-Wwao2?HaDjF7;uDGS
zDu#UfJ0)i4G8>k94D1Onxu%tIzi7BtHs<$VITUr8l-i@Jt5xSdb21^JDUWks*}1*o
z6*?|^CeoAnRo6|FX;&`u>ZWhLX~G=QBS_HOF@U!~ZA(SmA%%+WW|XKWk=rPGK0zlI
zX{m|t8S{fby)EK`+%$9IQ)39GMu=JtR&8;pJE2zpUB#;%H-^W{M)9hzFvfOoU|tyy
zOSxN>cTce#tZgt4gknWUD;Rzys9CvLZGJW5>vV`ko_Sl-m#t&xZXIe~BSiNItdu*v
zH_0DrP^x!`?!8bkLM(U$rb5S<ov>Hg<w2=&uG2%ZrUClr>J%a-$q5wJ5#43qku#ba
zZP5FfH3KfK8*3l_F@Hb&esD4XeNqF>(FlJJzd!Ob7fT?})rIx9>9-i1k)Rk2Jz0hw
zhTbFQG&0Zi@<&EiuZIi^;r3nZGWDD4b%0{>Y_a7Ye7Z4?@tu)T@h!X3EzI*gl7}oN
zKs*&nPcz4OW9VR*73keE$KH!vv}Ry$*0XroaIK%gz7Z}L4$#et5&DAAf<tYYGS+p=
zV)``_0oNmETXgMxpLVBEdaZd`bxS{mObt`yheUdo=U{A(Tcmc4B-{$l)k4u*#t!j*
zp~=IiGxbD@k<yM)bM2L=uT@DXQD)=<o@6J|#e>IZwGBQ`U-D%+iw)vK9!;$$n;$I!
z)zq}t6gDKNt8E4@_gSZdo!CJKw*g2v%4wlIGa}0Z4A<JS$74Nmi^d6$+eWb_4O0f*
zRDIokKVaUd-(79R_C7z_2$H1~+$k6paxJVlrbwKB?0=F{mYm-~aL`DJUzAa1+Naeo
zCJ3R<m=Ew-^*Sce#Uy1DQ;;nmXiJp{ID*$#CF?sdxeVck=%_g&>W$qe=R05qa;k`T
z#+3#<FJL9#?aJlpB=yEJ+W;aeW#3>D)0%1u9PoYI_Cz<VFU@z)3$^Cu%Iz?qSB>y-
zXb^V2kd!rU;@L5O&Py7|5h31G(MK4-d2`Yoq<071<iBSW;&<4n5>N_QK(h)gXbH#6
z!BoZB!O?}))WI3}pJv0ql^duwVnC`kZEU!qONh6RX%6j1cRE6A+lYk~3Ad)F^<>I)
zC-W$JOYSb^_4XMKjN|Gz*ZcWW@}d)%LZ}G!G#DhDBvk-WFF&Z<qJiyRt`NgdveRhm
z(D$go;WMAyHYVEiF~XI995s3c!+k^zq@Q}i&s#v>l}skeLF5&FvRcLg;tSyNDS7f>
z;?sgm!O7Ih-b&og%r*{(l^GxWSmw^I4}+XH44azH!il)^YHGJd=5{kw=?Ncj<>|ga
z=GvQ?;LFuz;TaRQNjH*-1eUNNvbDJL{*T2^@rxGl5EBc6)$If8`__pNrIC^qKGAmO
zdNsC#bm}g47xBom(rtE2_de-IlA=={RazRZerQro9-71J!vQHsa6C4xQZ$hx-8wt2
z_Pg8hC#ojvaS40CvEZVmT7`dGaP`e*{hxE$HUk}`w@~I0L3xS!GxLlc9seaBlyUzV
z86ZW6xgTX_xEnIG1tLB>RkWZI6PcH16(%qscMGMDra4vA5hd|^Q_8g?<1_BFv1F_U
z!XF9`Fftjv70g1)JDgRcV8O^W=eKf#homeb(ZPhi=q#Qk9aysS+tJF_3E~|c_@8?4
zvlvD=x|0&kboT=B>-Sv>kUzl<TFv$kb4@P$u{n@<C%*)GF&?*U*_5!Z^{ObmDb9MB
z`ix<|oSU-1Zf;cC+_6;xGgv;ZJeen&v$BdDb52~y+)RqbPqWF9#8+pz2-?VHNy;d&
zwrN@6j~hnk6{itF*}*c_UP7>?EcG@xx8%(W-YaivVO0J8s12qBTZkGpi*uR$9f;jM
zo_Cv!0<0F@lV^sn(8-=*vsnoq=~sOl;<94I7PQ{Qw>nrQakFSkek>A^;<6fVyqf7)
z3%A7OGRPy$r+m}yL(+hBpGs7)_3pg<jM!s^lyGUv%RpleAA9l|QE68gZ`^dz%sHgD
z0UpwQqkrmQ${UihJ$ZiOu4<5SZD{eFMx;#l0a|nU-G9QsG6Viw!}gC|`(yrx_N|Ki
zUk&`V)$|AO#~coo#!ro>Ux9xuzyAy@fv)wx7vq1m^J|&*PfK5+J?USHw!eaZElK?e
zCPDiR{AYpcSLm<V|DVu(jDM^J{A%FW^UI$GZ1Mkp;eR{L{A%T|A@ENd0DuGZQ1rKe
l_$&Oce(Y!X_S2u>|M6-n^6=2T0RWJpmoPNumXrMW^?$y#Jgfi!

literal 0
HcmV?d00001

diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
index 25951ff4e8..eeb5f3740f 100644
--- a/test_unstructured/chunking/test_base.py
+++ b/test_unstructured/chunking/test_base.py
@@ -1060,7 +1060,7 @@ def it_knows_the_concatenated_text_of_the_pre_chunk_to_help(
 class Describe_TableSplitter:
     """Unit-test suite for `unstructured.chunking.base._TableSplitter`."""
 
-    def it_splits_an_HTML_table_on_even_rows_when_possible(self):
+    def it_splits_an_HTML_table_on_whole_row_boundaries_when_possible(self):
         opts = ChunkingOptions(max_characters=(150))
         html_table = HtmlTable.from_html_text(
             """
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index 8d6670fd34..3639f26803 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -794,19 +794,10 @@ def test_auto_partition_xls_from_filename():
         example_doc_path("tests-example.xls"), include_header=False, skip_infer_table_types=[]
     )
 
-    assert sum(isinstance(element, Table) for element in elements) == 2
     assert len(elements) == 14
-
-    assert clean_extra_whitespace(elements[0].text)[:45] == (
-        "MC What is 2+2? 4 correct 3 incorrect MA What"
-    )
-    # NOTE(crag): if the beautifulsoup4 package is installed, some (but not all) additional
-    # whitespace is removed, so the expected text length is less than is the case when
-    # beautifulsoup4 is *not* installed. E.g.
-    #      "\n\n\nMA\nWhat C datatypes are 8 bits"
-    #  vs. '\n  \n    \n      MA\n      What C datatypes are 8 bits?... "
-    assert len(elements[0].text) == 550
+    assert sum(isinstance(e, Table) for e in elements) == 2
     assert elements[0].metadata.text_as_html == EXPECTED_XLS_TABLE
+    assert len(elements[0].text) == 507
 
 
 # ================================================================================================
diff --git a/test_unstructured/partition/test_constants.py b/test_unstructured/partition/test_constants.py
index aaf4cbe1b8..8b003cd9c5 100644
--- a/test_unstructured/partition/test_constants.py
+++ b/test_unstructured/partition/test_constants.py
@@ -28,30 +28,14 @@
   </tbody>
 </table>"""
 
-EXPECTED_TABLE_XLSX = """<table border="1" class="dataframe">
-  <tbody>
-    <tr>
-      <td>Team</td>
-      <td>Location</td>
-      <td>Stanley Cups</td>
-    </tr>
-    <tr>
-      <td>Blues</td>
-      <td>STL</td>
-      <td>1</td>
-    </tr>
-    <tr>
-      <td>Flyers</td>
-      <td>PHI</td>
-      <td>2</td>
-    </tr>
-    <tr>
-      <td>Maple Leafs</td>
-      <td>TOR</td>
-      <td>13</td>
-    </tr>
-  </tbody>
-</table>"""
+EXPECTED_TABLE_XLSX = (
+    "<table>"
+    "<tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr>"
+    "<tr><td>Blues</td><td>STL</td><td>1</td></tr>"
+    "<tr><td>Flyers</td><td>PHI</td><td>2</td></tr>"
+    "<tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr>"
+    "</table>"
+)
 
 EXPECTED_TITLE = "Stanley Cups"
 
@@ -139,86 +123,76 @@
 </table>"""
 
 EXPECTED_XLS_TABLE = (
-    """<table border="1" class="dataframe">
-  <tbody>
-    <tr>
-      <td>MC</td>
-      <td>What is 2+2?</td>
-      <td>4</td>
-      <td>correct</td>
-      <td>3</td>
-      <td>incorrect</td>
-      <td></td>
-      <td></td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>MA</td>
-      <td>What C datatypes are 8 bits? (assume i386)</td>
-      <td>int</td>
-      <td></td>
-      <td>float</td>
-      <td></td>
-      <td>double</td>
-      <td></td>
-      <td>char</td>
-    </tr>
-    <tr>
-      <td>TF</td>
-      <td>Bagpipes are awesome.</td>
-      <td>true</td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>ESS</td>
-      <td>How have the original Henry Hornbostel buildings """
-    """influenced campus architecture and design in the last 30 years?</td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>ORD</td>
-      <td>Rank the following in their order of operation.</td>
-      <td>Parentheses</td>
-      <td>Exponents</td>
-      <td>Division</td>
-      <td>Addition</td>
-      <td></td>
-      <td></td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>FIB</td>
-      <td>The student activities fee is</td>
-      <td>95</td>
-      <td>dollars for students enrolled in</td>
-      <td>19</td>
-      <td>units or more,</td>
-      <td></td>
-      <td></td>
-      <td></td>
-    </tr>
-    <tr>
-      <td>MAT</td>
-      <td>Match the lower-case greek letter with its capital form.</td>
-      <td>λ</td>
-      <td>Λ</td>
-      <td>α</td>
-      <td>γ</td>
-      <td>Γ</td>
-      <td>φ</td>
-      <td>Φ</td>
-    </tr>
-  </tbody>
-</table>"""
+    "<table><tr>"
+    "<td>MC</td>"
+    "<td>What is 2+2?</td>"
+    "<td>4</td>"
+    "<td>correct</td>"
+    "<td>3</td>"
+    "<td>incorrect</td>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "</tr><tr>"  # -----
+    "<td>MA</td>"
+    "<td>What C datatypes are 8 bits? (assume i386)</td>"
+    "<td>int</td>"
+    "<td/>"
+    "<td>float</td>"
+    "<td/>"
+    "<td>double</td>"
+    "<td/>"
+    "<td>char</td>"
+    "</tr><tr>"  # -----
+    "<td>TF</td>"
+    "<td>Bagpipes are awesome.</td>"
+    "<td>true</td>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "</tr><tr>"  # -----
+    "<td>ESS</td>"
+    "<td>How have the original Henry Hornbostel buildings influenced campus architecture and"
+    " design in the last 30 years?</td>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "</tr><tr>"  # -----
+    "<td>ORD</td>"
+    "<td>Rank the following in their order of operation.</td>"
+    "<td>Parentheses</td>"
+    "<td>Exponents</td>"
+    "<td>Division</td>"
+    "<td>Addition</td>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "</tr><tr>"  # -----
+    "<td>FIB</td>"
+    "<td>The student activities fee is</td>"
+    "<td>95</td>"
+    "<td>dollars for students enrolled in</td>"
+    "<td>19</td>"
+    "<td>units or more,</td>"
+    "<td/>"
+    "<td/>"
+    "<td/>"
+    "</tr><tr>"  # -----
+    "<td>MAT</td>"
+    "<td>Match the lower-case greek letter with its capital form.</td>"
+    "<td>λ</td>"
+    "<td>Λ</td>"
+    "<td>α</td>"
+    "<td>γ</td>"
+    "<td>Γ</td>"
+    "<td>φ</td>"
+    "<td>Φ</td>"
+    "</tr></table>"
 )
diff --git a/test_unstructured/partition/test_xlsx.py b/test_unstructured/partition/test_xlsx.py
index 7a9d25baf4..2e951d321f 100644
--- a/test_unstructured/partition/test_xlsx.py
+++ b/test_unstructured/partition/test_xlsx.py
@@ -63,20 +63,6 @@ def test_partition_xlsx_from_filename():
     assert elements[1].metadata.filename == "stanley-cups.xlsx"
 
 
-def test_partition_xlsx_from_filename_no_subtables():
-    """Partition to a single `Table` element per worksheet."""
-    assert partition_xlsx("example-docs/stanley-cups.xlsx", find_subtable=False) == [
-        Table(
-            "\n\n\nStanley Cups\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\n"
-            "Flyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n"
-        ),
-        Table(
-            "\n\n\nStanley Cups Since 67\n\n\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n"
-            "1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n"
-        ),
-    ]
-
-
 def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
     f = tempfile.SpooledTemporaryFile()
     with open("example-docs/emoji.xlsx", "rb") as g:
@@ -120,15 +106,12 @@ def test_partition_xlsx_infer_table_structure(infer_table_structure: bool):
 
 def test_partition_xlsx_from_filename_with_header():
     elements = partition_xlsx("example-docs/stanley-cups.xlsx", include_header=True)
-    assert sum(isinstance(element, Table) for element in elements) == 2
+
     assert len(elements) == 2
-    assert (
-        clean_extra_whitespace(elements[0].text)
-        == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
-    )
-    text_as_html = elements[0].metadata.text_as_html
-    assert text_as_html is not None
-    assert "<thead>" in text_as_html
+    assert all(isinstance(e, Table) for e in elements)
+    e = elements[0]
+    assert e.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    assert e.metadata.text_as_html is not None
 
 
 def test_partition_xlsx_from_file():
@@ -176,15 +159,11 @@ def test_partition_xlsx_from_file_with_header():
     with open("example-docs/stanley-cups.xlsx", "rb") as f:
         elements = partition_xlsx(file=f, include_header=True)
 
-    assert sum(isinstance(element, Table) for element in elements) == 2
     assert len(elements) == 2
-    assert (
-        clean_extra_whitespace(elements[0].text)
-        == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
-    )
-    text_as_html = elements[0].metadata.text_as_html
-    assert text_as_html is not None
-    assert "<thead>" in text_as_html
+    assert all(isinstance(e, Table) for e in elements)
+    e = elements[0]
+    assert e.text == "Stanley Cups Unnamed: 1 Unnamed: 2 " + EXPECTED_TEXT_XLSX
+    assert e.metadata.text_as_html is not None
 
 
 # -- .metadata.last_modified ---------------------------------------------------------------------
@@ -249,38 +228,38 @@ def test_partition_xlsx_metadata_language_from_filename():
 
 def test_partition_xlsx_subtables():
     assert partition_xlsx("example-docs/xlsx-subtable-cases.xlsx") == [
-        Table("\n\n\na\nb\n\n\n\n\nc\nd\n\ne\n\n\n"),
+        Table("a b c d e"),
         ListItem("f"),
         Title("a"),
-        Table("\n\n\nb\nc\n\n\nd\ne\n\n\n"),
+        Table("b c d e"),
         Title("a"),
         Title("b"),
-        Table("\n\n\nc\nd\n\n\ne\nf\n\n\n"),
-        Table("\n\n\na\nb\n\n\nc\nd\n\n\n"),
+        Table("c d e f"),
+        Table("a b c d"),
         ListItem("2. e"),
-        Table("\n\n\na\nb\n\n\nc\nd\n\n\n"),
+        Table("a b c d"),
         Title("e"),
         Title("f"),
         Title("a"),
-        Table("\n\n\nb\nc\n\n\nd\ne\n\n\n"),
+        Table("b c d e"),
         Title("f"),
         Title("a"),
         Title("b"),
-        Table("\n\n\nc\nd\n\n\ne\nf\n\n\n"),
+        Table("c d e f"),
         Title("g"),
         Title("a"),
-        Table("\n\n\nb\nc\n\n\nd\ne\n\n\n"),
+        Table("b c d e"),
         Title("f"),
         Title("g"),
         Title("a"),
         Title("b"),
-        Table("\n\n\nc\nd\n\n\ne\nf\n\n\n"),
+        Table("c d e f"),
         Title("g"),
         Title("h"),
-        Table("\n\n\na\nb\nc\n\n\n"),
+        Table("a b c"),
         Title("a"),
-        Table("\n\n\nb\nc\nd\n\n\n"),
-        Table("\n\n\na\nb\nc\n\n\n"),
+        Table("b c d"),
+        Table("a b c"),
         Title("d"),
         Title("e"),
     ]
@@ -310,6 +289,43 @@ def test_partition_xlsx_with_more_than_1k_cells():
         sys.setrecursionlimit(old_recursion_limit)
 
 
+# ================================================================================================
+# OTHER ARGS
+# ================================================================================================
+
+
+# -- `find_subtable` -----------------------------------------------------------------------------
+
+
+def test_partition_xlsx_with_find_subtables_False_emits_one_Table_element_per_worksheet():
+    elements = partition_xlsx("example-docs/stanley-cups.xlsx", find_subtable=False)
+    assert elements == [
+        Table(
+            "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+        ),
+        Table(
+            "Stanley Cups Since 67 Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple"
+            " Leafs TOR 0"
+        ),
+    ]
+
+
+def test_partition_xlsx_with_find_subtables_False_and_infer_table_structure_False_works():
+    elements = partition_xlsx(
+        "example-docs/stanley-cups.xlsx", find_subtable=False, infer_table_structure=False
+    )
+    assert elements == [
+        Table(
+            "Stanley Cups Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13"
+        ),
+        Table(
+            "Stanley Cups Since 67 Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple"
+            " Leafs TOR 0"
+        ),
+    ]
+    assert all(e.metadata.text_as_html is None for e in elements)
+
+
 # ------------------------------------------------------------------------------------------------
 # UNIT TESTS
 # ------------------------------------------------------------------------------------------------
diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
index 4931718ff1..65338dc607 100644
--- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
+++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json
@@ -25,12 +25,12 @@
   },
   {
     "type": "Table",
-    "element_id": "8d70ea477d9db14ed01ff1d39a118a42",
-    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n13\n\n\n",
+    "element_id": "259d8a8f4c2a333beff68f08c5fbf43f",
+    "text": "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 13",
     "metadata": {
       "page_name": "Stanley Cups",
       "page_number": 1,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>13</td>\n    </tr>\n  </tbody>\n</table>",
+      "text_as_html": "<table><tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr><tr><td>Blues</td><td>STL</td><td>1</td></tr><tr><td>Flyers</td><td>PHI</td><td>2</td></tr><tr><td>Maple Leafs</td><td>TOR</td><td>13</td></tr></table>",
       "languages": [
         "eng"
       ],
@@ -74,12 +74,12 @@
   },
   {
     "type": "Table",
-    "element_id": "310cd42767ffd563f6639210df793c5b",
-    "text": "\n\n\nTeam\nLocation\nStanley Cups\n\n\nBlues\nSTL\n1\n\n\nFlyers\nPHI\n2\n\n\nMaple Leafs\nTOR\n0\n\n\n",
+    "element_id": "00c1e0e7211ccb6dffedf7c9091d8798",
+    "text": "Team Location Stanley Cups Blues STL 1 Flyers PHI 2 Maple Leafs TOR 0",
     "metadata": {
       "page_name": "Stanley Cups Since 67",
       "page_number": 2,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Team</td>\n      <td>Location</td>\n      <td>Stanley Cups</td>\n    </tr>\n    <tr>\n      <td>Blues</td>\n      <td>STL</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <td>Flyers</td>\n      <td>PHI</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <td>Maple Leafs</td>\n      <td>TOR</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>",
+      "text_as_html": "<table><tr><td>Team</td><td>Location</td><td>Stanley Cups</td></tr><tr><td>Blues</td><td>STL</td><td>1</td></tr><tr><td>Flyers</td><td>PHI</td><td>2</td></tr><tr><td>Maple Leafs</td><td>TOR</td><td>0</td></tr></table>",
       "languages": [
         "eng"
       ],
diff --git a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
index cbcfdbb34b..4a0764d9a4 100644
--- a/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
+++ b/test_unstructured_ingest/expected-structured-output/onedrive/utic-test-ingest-fixtures/tests-example.xls.json
@@ -1,13 +1,13 @@
 [
   {
     "type": "Table",
-    "element_id": "fef5ed2f1b95429f98f47f83b92ce387",
-    "text": "\n\n\nMC\nWhat is 2+2?\n4\ncorrect\n3\nincorrect\n\n\n\n\n\nMA\nWhat C datatypes are 8 bits? (assume i386)\nint\n\nfloat\n\ndouble\n\nchar\n\n\nTF\nBagpipes are awesome.\ntrue\n\n\n\n\n\n\n\n\nESS\nHow have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?\n\n\n\n\n\n\n\n\n\nORD\nRank the following in their order of operation.\nParentheses\nExponents\nDivision\nAddition\n\n\n\n\n\nFIB\nThe student activities fee is\n95\ndollars for students enrolled in\n19\nunits or more,\n\n\n\n\n\nMAT\nMatch the lower-case greek letter with its capital form.\n\u03bb\n\u039b\n\u03b1\n\u03b3\n\u0393\n\u03c6\n\u03a6\n\n\n",
+    "element_id": "8884b4ed11191538c7215455e7b639b0",
+    "text": "MC What is 2+2? 4 correct 3 incorrect MA What C datatypes are 8 bits? (assume i386) int float double char TF Bagpipes are awesome. true ESS How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years? ORD Rank the following in their order of operation. Parentheses Exponents Division Addition FIB The student activities fee is 95 dollars for students enrolled in 19 units or more, MAT Match the lower-case greek letter with its capital form. \u03bb \u039b \u03b1 \u03b3 \u0393 \u03c6 \u03a6",
     "metadata": {
       "filename": "tests-example.xls",
       "page_name": "Example Test",
       "page_number": 1,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>MC</td>\n      <td>What is 2+2?</td>\n      <td>4</td>\n      <td>correct</td>\n      <td>3</td>\n      <td>incorrect</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>What C datatypes are 8 bits? (assume i386)</td>\n      <td>int</td>\n      <td></td>\n      <td>float</td>\n      <td></td>\n      <td>double</td>\n      <td></td>\n      <td>char</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>Bagpipes are awesome.</td>\n      <td>true</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Rank the following in their order of operation.</td>\n      <td>Parentheses</td>\n      <td>Exponents</td>\n      <td>Division</td>\n      <td>Addition</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>The student activities fee is</td>\n      <td>95</td>\n      <td>dollars for students enrolled in</td>\n      <td>19</td>\n      <td>units or more,</td>\n      <td></td>\n      <td></td>\n      <td></td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Match the lower-case greek letter with its capital form.</td>\n      <td>\u03bb</td>\n      <td>\u039b</td>\n      <td>\u03b1</td>\n      <td>\u03b3</td>\n      <td>\u0393</td>\n      <td>\u03c6</td>\n      <td>\u03a6</td>\n    </tr>\n  </tbody>\n</table>",
+      "text_as_html": "<table><tr><td>MC</td><td>What is 2+2?</td><td>4</td><td>correct</td><td>3</td><td>incorrect</td><td/><td/><td/></tr><tr><td>MA</td><td>What C datatypes are 8 bits? (assume i386)</td><td>int</td><td/><td>float</td><td/><td>double</td><td/><td>char</td></tr><tr><td>TF</td><td>Bagpipes are awesome.</td><td>true</td><td/><td/><td/><td/><td/><td/></tr><tr><td>ESS</td><td>How have the original Henry Hornbostel buildings influenced campus architecture and design in the last 30 years?</td><td/><td/><td/><td/><td/><td/><td/></tr><tr><td>ORD</td><td>Rank the following in their order of operation.</td><td>Parentheses</td><td>Exponents</td><td>Division</td><td>Addition</td><td/><td/><td/></tr><tr><td>FIB</td><td>The student activities fee is</td><td>95</td><td>dollars for students enrolled in</td><td>19</td><td>units or more,</td><td/><td/><td/></tr><tr><td>MAT</td><td>Match the lower-case greek letter with its capital form.</td><td>\u03bb</td><td>\u039b</td><td>\u03b1</td><td>\u03b3</td><td>\u0393</td><td>\u03c6</td><td>\u03a6</td></tr></table>",
       "languages": [
         "eng"
       ],
@@ -74,13 +74,13 @@
   },
   {
     "type": "Table",
-    "element_id": "dd5cca33b529edcca38130a0a86c3d52",
-    "text": "\n\n\nAbbreviation\nQuestion Type\n\n\nMC\nMultiple Choice\n\n\nMA\nMultiple Answer\n\n\nTF\nTrue/False\n\n\nESS\nEssay\n\n\nORD\nOrdering\n\n\nMAT\nMatching\n\n\nFIB\nFill in the Blank\n\n\nFIL\nFile response\n\n\nNUM\nNumeric Response\n\n\nSR\nShort response\n\n\nOP\nOpinion\n\n\nFIB_PLUS\nMultiple Fill in the Blank\n\n\nJUMBLED_SENTENCE\nJumbled Sentence\n\n\nQUIZ_BOWL\nQuiz Bowl\n\n\n",
+    "element_id": "88ba959ad0c3d1c0983292d5e5142f05",
+    "text": "Abbreviation Question Type MC Multiple Choice MA Multiple Answer TF True/False ESS Essay ORD Ordering MAT Matching FIB Fill in the Blank FIL File response NUM Numeric Response SR Short response OP Opinion FIB_PLUS Multiple Fill in the Blank JUMBLED_SENTENCE Jumbled Sentence QUIZ_BOWL Quiz Bowl",
     "metadata": {
       "filename": "tests-example.xls",
       "page_name": "Format Abbr.",
       "page_number": 2,
-      "text_as_html": "<table border=\"1\" class=\"dataframe\">\n  <tbody>\n    <tr>\n      <td>Abbreviation</td>\n      <td>Question Type</td>\n    </tr>\n    <tr>\n      <td>MC</td>\n      <td>Multiple Choice</td>\n    </tr>\n    <tr>\n      <td>MA</td>\n      <td>Multiple Answer</td>\n    </tr>\n    <tr>\n      <td>TF</td>\n      <td>True/False</td>\n    </tr>\n    <tr>\n      <td>ESS</td>\n      <td>Essay</td>\n    </tr>\n    <tr>\n      <td>ORD</td>\n      <td>Ordering</td>\n    </tr>\n    <tr>\n      <td>MAT</td>\n      <td>Matching</td>\n    </tr>\n    <tr>\n      <td>FIB</td>\n      <td>Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>FIL</td>\n      <td>File response</td>\n    </tr>\n    <tr>\n      <td>NUM</td>\n      <td>Numeric Response</td>\n    </tr>\n    <tr>\n      <td>SR</td>\n      <td>Short response</td>\n    </tr>\n    <tr>\n      <td>OP</td>\n      <td>Opinion</td>\n    </tr>\n    <tr>\n      <td>FIB_PLUS</td>\n      <td>Multiple Fill in the Blank</td>\n    </tr>\n    <tr>\n      <td>JUMBLED_SENTENCE</td>\n      <td>Jumbled Sentence</td>\n    </tr>\n    <tr>\n      <td>QUIZ_BOWL</td>\n      <td>Quiz Bowl</td>\n    </tr>\n  </tbody>\n</table>",
+      "text_as_html": "<table><tr><td>Abbreviation</td><td>Question Type</td></tr><tr><td>MC</td><td>Multiple Choice</td></tr><tr><td>MA</td><td>Multiple Answer</td></tr><tr><td>TF</td><td>True/False</td></tr><tr><td>ESS</td><td>Essay</td></tr><tr><td>ORD</td><td>Ordering</td></tr><tr><td>MAT</td><td>Matching</td></tr><tr><td>FIB</td><td>Fill in the Blank</td></tr><tr><td>FIL</td><td>File response</td></tr><tr><td>NUM</td><td>Numeric Response</td></tr><tr><td>SR</td><td>Short response</td></tr><tr><td>OP</td><td>Opinion</td></tr><tr><td>FIB_PLUS</td><td>Multiple Fill in the Blank</td></tr><tr><td>JUMBLED_SENTENCE</td><td>Jumbled Sentence</td></tr><tr><td>QUIZ_BOWL</td><td>Quiz Bowl</td></tr></table>",
       "languages": [
         "eng"
       ],
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 16e6dacb99..a2a7ffee60 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.1-dev1"  # pragma: no cover
+__version__ = "0.16.1-dev2"  # pragma: no cover
diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py
index c54fbb8954..dca984b013 100644
--- a/unstructured/partition/html/parser.py
+++ b/unstructured/partition/html/parser.py
@@ -7,8 +7,8 @@
 PRINCIPLES
 
 - _Elements are paragraphs._ Each paragraph in the HTML document should become a distinct element.
-  In particular, a paragraph should not be split into two elements and an element should not contain
-  more than one paragraph.
+  In particular, a paragraph should not be split into two elements and an element should not
+  contain more than one paragraph.
 
 - _An empty paragraph is not an Element._ A paragraph which contains no text or contains only
   whitespace does not give rise to an Element (is skipped).
@@ -29,12 +29,12 @@
   _phrasing content_ (aka. _inline content_).
   - As an example, a `<p>` element is a block item and a `<b>` element is phrasing.
   - A block item starts a new paragraph and so represents an Element boundary.
-  - A phrasing item affects the appearance of a run of text within a paragraph, like
-    making it bold or making it into a link.
-  - Some elements can take either role, depending upon there ancestors and descendants.
-  - The final authority for whether a particular element is displayed as a block or as
-    inline "formatting" is the CSS. We do not attempt to interpret the CSS and assume
-    the default role for each element.
+  - A phrasing item affects the appearance of a run of text within a paragraph, like making it
+    bold or making it into a link.
+  - Some elements can take either role, depending upon their ancestors and descendants.
+  - The final authority for whether a particular element is displayed as a block or as inline
+    "formatting" is the CSS. We do not attempt to interpret the CSS; instead we assume the
+    default role for each element.
 
 Other background
 
@@ -44,9 +44,9 @@
   there may be ambiguity.
 
 - The parser is primarily composed of `lxml` Custom Element Classes. The gist is you write a class
-  like `Anchor` and then tell the `lxml` parser that all `<a>` elements should be instantiated using
-  the `Anchor` class. We also provide a default class for any elements that we haven't called out
-  explicitly.
+  like `Anchor` and then tell the `lxml` parser that all `<a>` elements should be instantiated
+  using the `Anchor` class. We also provide a default class for any elements that we haven't
+  called out explicitly.
 
 - _Anatomy of an HTML element._ Some basic terms are important to know to understand the domain
   language of the parser code. Consider this example:
diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py
index caeea90535..6edf9abc0a 100644
--- a/unstructured/partition/xlsx.py
+++ b/unstructured/partition/xlsx.py
@@ -9,11 +9,11 @@
 import networkx as nx
 import numpy as np
 import pandas as pd
-from lxml.html.soupparser import fromstring as soupparser_fromstring
 from typing_extensions import Self, TypeAlias
 
 from unstructured.chunking import add_chunking_strategy
 from unstructured.cleaners.core import clean_bullets
+from unstructured.common.html_table import HtmlTable
 from unstructured.documents.elements import (
     Element,
     ElementMetadata,
@@ -58,6 +58,10 @@ def partition_xlsx(
         A string defining the target filename path.
     file
         A file-like object using "rb" mode --> open(filename, "rb").
+    find_subtable
+        Detect "subtables" on each worksheet and partition each of those as a separate `Table`
+        element. When `False`, each worksheet is partitioned as a single `Table` element. A
+        subtable is a contiguous block of cells with more than two cells in each row.
     infer_table_structure
         If True, any Table elements that are extracted will also have a metadata field
         named "text_as_html" where the table's text content is rendered into an html string.
@@ -80,16 +84,12 @@ def partition_xlsx(
         opts.sheets.items(), start=starting_page_number
     ):
         if not opts.find_subtable:
-            html_text = (
+            html_table = HtmlTable.from_html_text(
                 sheet.to_html(index=False, header=opts.include_header, na_rep="")
-                if opts.infer_table_structure
-                else None
             )
-            # XXX: `html_text` can be `None`. What happens on this call in that case?
-            text = soupparser_fromstring(html_text).text_content()  # type: ignore
 
             metadata = ElementMetadata(
-                text_as_html=html_text,
+                text_as_html=html_table.html if infer_table_structure else None,
                 page_name=sheet_name,
                 page_number=page_number,
                 filename=opts.metadata_file_path,
@@ -97,8 +97,8 @@ def partition_xlsx(
             )
             metadata.detection_origin = DETECTION_ORIGIN
 
-            table = Table(text=text, metadata=metadata)
-            elements.append(table)
+            elements.append(Table(text=html_table.text, metadata=metadata))
+
         else:
             for component in _ConnectedComponents.from_worksheet_df(sheet):
                 subtable_parser = _SubtableParser(component.subtable)
@@ -112,14 +112,13 @@ def partition_xlsx(
                 # -- emit core-table (if it exists) as a `Table` element --
                 core_table = subtable_parser.core_table
                 if core_table is not None:
-                    html_text = core_table.to_html(
-                        index=False, header=opts.include_header, na_rep=""
+                    html_table = HtmlTable.from_html_text(
+                        core_table.to_html(index=False, header=opts.include_header, na_rep="")
                     )
-                    text = soupparser_fromstring(html_text).text_content()
-                    element = Table(text=text)
+                    element = Table(text=html_table.text)
                     element.metadata = _get_metadata(sheet_name, page_number, opts)
                     element.metadata.text_as_html = (
-                        html_text if opts.infer_table_structure else None
+                        html_table.html if opts.infer_table_structure else None
                     )
                     elements.append(element)