From 2d1e59dffb78e013418d3b306126f71b9b456972 Mon Sep 17 00:00:00 2001 From: cmjcharlton <90400333+cmjcharlton@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:44:42 +0100 Subject: [PATCH] =?UTF-8?q?BUG:=20Unable=20to=20open=20Stata=20118=20or=20?= =?UTF-8?q?119=20format=20files=20saved=20in=20big-endian=E2=80=A6=20(#586?= =?UTF-8?q?40)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * BUG: Unable to open Stata 118 or 119 format files saved in big-endian format that contain strL data * Rename test functions to make their purpose clearer --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/stata.py | 7 ++- pandas/tests/io/data/stata/stata12_118.dta | Bin 0 -> 2622 bytes pandas/tests/io/data/stata/stata12_119.dta | Bin 0 -> 2632 bytes pandas/tests/io/data/stata/stata12_be_117.dta | Bin 0 -> 1285 bytes pandas/tests/io/data/stata/stata12_be_118.dta | Bin 0 -> 2622 bytes pandas/tests/io/data/stata/stata12_be_119.dta | Bin 0 -> 2632 bytes pandas/tests/io/data/stata/stata14_119.dta | Bin 0 -> 5574 bytes pandas/tests/io/data/stata/stata14_be_118.dta | Bin 0 -> 5556 bytes pandas/tests/io/data/stata/stata14_be_119.dta | Bin 0 -> 5574 bytes pandas/tests/io/data/stata/stata16_119.dta | Bin 0 -> 4628 bytes pandas/tests/io/data/stata/stata16_be_118.dta | Bin 0 -> 4614 bytes pandas/tests/io/data/stata/stata16_be_119.dta | Bin 0 -> 4628 bytes pandas/tests/io/test_stata.py | 47 +++++++++++++++--- 14 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata12_118.dta create mode 100644 pandas/tests/io/data/stata/stata12_119.dta create mode 100644 pandas/tests/io/data/stata/stata12_be_117.dta create mode 100644 pandas/tests/io/data/stata/stata12_be_118.dta create mode 100644 pandas/tests/io/data/stata/stata12_be_119.dta create mode 100644 pandas/tests/io/data/stata/stata14_119.dta create mode 100644 pandas/tests/io/data/stata/stata14_be_118.dta create mode 100644 pandas/tests/io/data/stata/stata14_be_119.dta create mode 100644 pandas/tests/io/data/stata/stata16_119.dta create mode 100644 pandas/tests/io/data/stata/stata16_be_118.dta create mode 100644 pandas/tests/io/data/stata/stata16_be_119.dta diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 27b16cb706e8d..e621ab2a5b9c5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -546,6 +546,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) +- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) Period ^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4e7bd160a5a52..9c6cd2faeaa2f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1600,14 +1600,13 @@ def _read_strls(self) -> None: v_o = self._read_uint64() else: buf = self._path_or_buf.read(12) - # Only tested on little endian file on little endian machine. + # Only tested on little endian machine. v_size = 2 if self._format_version == 118 else 3 if self._byteorder == "<": buf = buf[0:v_size] + buf[4 : (12 - v_size)] else: - # This path may not be correct, impossible to test - buf = buf[0:v_size] + buf[(4 + v_size) :] - v_o = struct.unpack("Q", buf)[0] + buf = buf[4 - v_size : 4] + buf[(4 + v_size) :] + v_o = struct.unpack(f"{self._byteorder}Q", buf)[0] typ = self._read_uint8() length = self._read_uint32() va = self._path_or_buf.read(length) diff --git a/pandas/tests/io/data/stata/stata12_118.dta b/pandas/tests/io/data/stata/stata12_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..87c6d1f063150d6a279d0b5cee1fbb5237123bcc GIT binary patch literal 2622 zcmeHJJ5R$f5H?gG!3GioTbHglsf0w($y6Yfp{NX_e__NRV@|_$T(%nUOFXvuGj3cpxJiZh$x@|Xq*^JO+= zf_(pMBlH4ziLBXfsoEBKGTPSHXX`Uc}Pc%+K$XZ+2VFms)Ct5zuFEIW6TIk|#^d&*BR0)~f0V z)Dif<2rN_p_EJMQrksY7arNEAp&)?&P$E@isoVc@MuSlK+Ca8;L7h$*(41nc7xZy% zHq%gCx&h9GK}enmayhuxk;NZIZ)f3U)6=qK`~{Ddy~qFn literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_119.dta b/pandas/tests/io/data/stata/stata12_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..fa63f0135738e6fb9be5b09d0da4ffde5bbb2ec9 GIT binary patch literal 2632 zcmeHJy-ve05H=u?V1t!`E?sd`i6Q7@styIIP=-E0cXvMDo&S;#l;)ZTJ=Kr;oxsTN~t9PYa< zpt%Zdc0dRL+Gfy6MbuF3Fi3bP5@r-DfsRK4Gma+g)blGX5mxFo@2XPsuj@6S`6$q= zm=T{~0NXwtp6^+q>F{^o>I;WIrXFmALyHf`7AFqB9A2DQJ2Qbj@tso;WUxz1wKT_0!@~+sq$D7m%KgJq=J}sgqdg<@L#m1G*f7{#cH< zR;=n@)&`lwHdyu5HtxCa&J%%VMdQ^=UPO?-t-!82k6%g!?SQ$1>&KFROVYzq8+Wpx zTFbQM4wO5v+YYQS0r#jO8gj`aEhPSS;z%)|8ENY1&ya^Gc(m Z!`$#C6CDYgO|trjG0R!`vKciekN+1`zAFF# literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_be_117.dta b/pandas/tests/io/data/stata/stata12_be_117.dta new file mode 100644 index 0000000000000000000000000000000000000000..7f84d15fb76d0543eaecaa2af670198da64e495d GIT binary patch literal 1285 zcmdT^y-ve05O$lEfenP1yL81#C4>SeQ!$jGq6~e2$VuEbQbLPu3PB7NZ^3i0@f17& z?}3=Wj?*S8sv^{hNV)I+eCIo#4MI{$y--pD6GlVE32+uODj0E`6TnLW{8X|j*A7?1 z9^gWRK0y}XK7j#2NV5Y1G4)wY5MaZAj1s0&B{Q<;xUC-ZTOHduZrSc>#|B(j0WWuv z(3uWb{$7kwJIL`Y$JZR+4c6xdEd!`=XqbkZ<5P|wIer;zHUaCbFsD4Ce#|_X&X^$I zpJ->HN?B8Dw;QU;A`fQ$W}0p`8OvjpLzdlS12N??j)Yvb2k=S-qbX0Q6r}AQ+2g9H zWqaEdWwDz7^8&!j&8gPKcg)LX!!uP?=iAyt_i#OJan{*p#Hy!TXZt?^N|T*80fGtT zG?0v|F^d900RPg2>fIpc3xud2m|FFHt982w1wJh_^@}$nWK{d87b05>Lh@LU%i*QqsF~{ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_be_118.dta b/pandas/tests/io/data/stata/stata12_be_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..9ed6f39b0f9b53c49b9af3735ec2cd59b9a6d1c2 GIT binary patch literal 2622 zcmeHJJ5R$f5Oy0CiEbb+nXAz@<5!*fjoD<+BlFhiDa53rw zE=U*)MJri0X8F$Va$}jcusa~r`2a(tJ|>;TOH@P+W}li0q3E_ zbe`MHOrsYwsO@LiGg$w~@YA66X0UN(aQiI7afYu3n|m2{GbDhuSGc1*q+Y~anar6W z-(P6!Nu{#F*3+)3N{c)g{muQ8&AB$7A2%I9x;kbxK+L#|LLnF12k?Rg;~9^s6r}AO zbS7n@Zabok8bYNN0p~%hEf?J~H$79X$+lc9mDU<7p!eSDvJ`hnx>#bd7uV>wGF4Te zs=)tMV5tMh4mJ2w%Be3ISIQP0=0T4}(wUJoJPqXnma`kqP( zj%uY+We}1_f}D>omCUFkRC`uLQ`rpIcrp#qZ)sgicW>?)mr43F!1ou!*ilSRw;19RuS$p8QV literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata12_be_119.dta b/pandas/tests/io/data/stata/stata12_be_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..3c9736d0f3af3c0dd504db2cafb456fa0f73290c GIT binary patch literal 2632 zcmeHJJ#NA<6gD6#m91Snl!dhhqztA?UyXA}28rDU{T9f6@u{B)vnoO1(%g z(vkWnX=s>W$wCHye*b=c&#^&BDybVtN?^`tz&HWUB1Q!xosI)IBfv`}TXL;1nhXHv zBn$~cs0MgQV63P5Q{>EH=(lgcNT9>!|QVc}Ts8xiX2F zAnPA=v{I=Q*jnwLs$_s5Hzm~lKQyp6k5KAtjP{?BYs1p8(V7lZBDg|jd z7wxo9cFUpVh}{e_?Y#{FXNy*?7dzDrn7m7{!6P`diwCpxJ=AorvfjllkNE7x?PFEC2ui literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata14_119.dta b/pandas/tests/io/data/stata/stata14_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..e64353213b1c966ec75e8355e03b67395723f9f9 GIT binary patch literal 5574 zcmeHLJ8u**5FQ{rMF9dTB(zOhoC}JQLzWW~2tkQLlt+E?-8s2c_SUDh14^1iq2dov zAt5Lz=x7iiL~{}U08!A;&{KgxNXPiGxjd>Y5z9$u?RxC>jA#6PWJd{l>Zf%eoJ?0o-FaHEj~{e6XjpfGrSV2F}-D3i3AHaKGC?0Wznx{S9q@t&vPq_AnXB<@*9DU^J>_u1quKvMON{m)|k0&87 zN1~)x8$z)YfZcBVZ2f5cYJG1}vZvXw#~g>p5Q}(8xh`|b$3h*7JPlZIGoa)QDb495 z`nN#biFKai2N;he&((xcT+F#bFeUR{1h_wKuSlw7qN7l0yOq3o0Fdq}6FmSs>y&J> z(nXon@h7t0Vn!hI5WL#Alx3;)7PAT#2D3?K)<{YEV{prjRz%63EhV@2W*K`|ri;L# zBEap8Jk%~btTVIs?p>N~f6Bt6*n~(#!0}bGE&`pm*VJJUqq(~QtmN(Wbvl3w-BcCa z4|S~_!&GWK$$0vA9vla^IE=Lo^A0ZAm9PjbnmC~gVfNWzZYjF>WycR9jMKUR{sn+L zZi2jHL(QqdD_SHNa)uSfDB1VLT#m6Zu2J!*T#}j}1F+XwCfgYHH(TF^Z%Px-jY_3u zj4Vfup`v7)`h>9{d5`y?FPe(U`2S?JNt+iGet7vXhy<>Ad&)Uca`}_XRAEsogC<(%F&x@jb0cU^={RqH#C?V_HrXPhtZD~7yPE9RaR?S zj17|=_mUZj>2mB)Hm27c?fdL#|GJ|WSxlcgdTAo2pB%mXI;QWyH&)>hlL0G+yr4H~ zTv5_T`nE~qSL0{ncjE{6ZEPbRG3*{iETR?V8cU^|3Ut8JbYRKNfP!;`?Mt&2KUsJbo#B#u3?C(`+|y}CNgW0;x_c17Qrb~orvs=kELGlmpewDMrBq=u z;qKpga2(y^AksFiJGy39!y>S3@`5ge$!CK(lj!1?96u6bUe*Qh7y#}&`H2G?FuMjT zX)ZCv3`&ZEf9Q)j45L#_qv1)hA~YjC)`twqjBgVxzB`w03zN@H3Z+F9i)6RHZQ(b* z_0@?ige>CZAqG)ccdfY6u}!&v2DHeF)eX@U>23U(*v7Tyk5iW zzZ2G9ZtlL_eU4eYv%Wv~Lx95~o;nh8<39<#v*^nee9U#$5t0!#9f-(dBz`IGg&Ei@ O(}4p~vzaxmgWA71;YXPO literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata14_be_119.dta b/pandas/tests/io/data/stata/stata14_be_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..09d08f7e992ea3ae410bac190f93f5cdc4fc9e5e GIT binary patch literal 5574 zcmeHLy>AmS6n|;c0u=)Yn2^vdUAd;HYJ)7dv`~s#O4W}!auU;=dO1gD7t}7TVB#NO zLP9Vwu(2ROh$R*O0b*cbVP^t?kizpvnl`ZXsCq<-b?0aMJ^MZX&OX0qA{A4tP*#kR z3TI^=Qxfx#Gs$T#mnVL+LW&!Ti&zWXT$(0+t4d}m030L!EG6^0s8b*D=P3zUk%ts6 zELQ^Q;ln(y{wNAcqRhQA>h+8O@vR(DL6xID)fzoLJmy`g#oq8xZp6!tULF}Eev3=| zrm0m{YihK50p8g{Chus)(!Q*tuD$=0tq(k~^o(@$p`~XR9sOwOxi^l!C%)E7?yxvu z#gG@&MvY4f{m{P+jUSEgjh~Hg#BXD3@rdE{C}I(23lrJJjw-B1NpiKq~hux&IoMqbp};09tk8 z?~0P!BCg^`q=Wf{K&H|0;?Qc6tu~lX+A%kptY>14#BY8W{N;x0ykO3jg8kA-=HBho zMc_~oU}i-g>X02anEv;+Z_SoJW#Ex?hR57v_)1abo=V$m>M)4W-2DKS()Ri~9YBe0 zs`Ty#s?v&KN);9-JpDTlj)PkqM9PGD2k+VKVGy{j4D3384N(WeG0DIR0U8I&X?{vR#+a*R$fg^DM|s!$A|R~wOm%=k8v@vpiDa8>Ad zZc<1kA}7+b^<@j&_||94F)f5F;;@KLl&+akOr~otEuaD=mc`nJz!t;JHhxWP17s&o zHHH30>GNlEF#ci1B;p&Zi8(0g^`%?Qv5;o&VRqb1rTzr=^LjMzUpVzEcP3!9RHxh41L;wH) literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata16_119.dta b/pandas/tests/io/data/stata/stata16_119.dta new file mode 100644 index 0000000000000000000000000000000000000000..d03c489d4342d3e908da62d6b4dbf22e34dbce00 GIT binary patch literal 4628 zcmeHLJ8u&~5I&xv2_h&R%{6IgE_Ogl(6AK>MZkdk1Dv(JQBFEPbh|-vXDp{hR1pOQ z1r=Q+OZ)&kl+ItkCE`yovyZbcc2I07vV?ABd3NT$dED&m-OYkjT=7y>aR!HiS4GSq z7Lnjmuv~5j0>?nPr9>kpz^mde1Y;KV*d5RFAlPG2U@JJK0YQO5#LFUL#F!bVuqM!B zt;yE5a{g=H^7EUyZ9l*D>?!&hUxmO0Q{zoEV8&NVAKfsMT}yx6GW_aoL!VPTIDXq; z-rYC6vh@1{!~a<`^xi{5*DZyByuy1PhrAq#l4>=DWRJad4;{9>HUt+g5wyeXNw)DMFCA_!x{=>pP;qB5J#X<)7xf5%OA~JU=tz|Gv#n(EJy*6eaF&7V z$N=sCi-kFUo9S|uQ_q9z2qd$aWgyEymVqn-|7iv$IwAX7fyyC|c}0mBKRUyTWDrbA zs@QU}^AE$*;g4Z&czWLXcXrU|#d+N0jbQSTThDc@=J{tS0@G^cw~G^eSrM_O?Si>jkK#lk8>j4W+<#{$6Jl znN%qufNm-Q&~i3CtESf$1zIgtGFB}C(1bfRd%oax=)hpAG^}a@BGtN6)$2tb>0a#O zT7~2iDO8<;TagY)eUg$SCDn0}#HCwfrNg=-7;6C=Je|48ZiTkx#5W?b literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/stata/stata16_be_118.dta b/pandas/tests/io/data/stata/stata16_be_118.dta new file mode 100644 index 0000000000000000000000000000000000000000..bae769c038820091a3c52ec98f84f12bc0863f4b GIT binary patch literal 4614 zcmeHLy^hmB5Z*Y+r2-K|L+9oin!V&e%AwIVoS-;eK*|H;Hpxb3$@!6Y4=G(hxfW3& z3Pi;VoD_)%NRBf100^YK1`X_=6WfWw)5txv_So3`Piuc0Tg1kez8ib@ znYCSG?|m}1{KMK`#@^optg^yu8hEtjv!)mg87JrwdVq3cG?vpz+)~g!l~W@ZqFXaz zcJ%90;q@y!O0LpU-uAp5=11xgG`R@;T@6Aju*(tX`ZV+Oo2>+Oo6ltL{2|;TY>gD4QN}iK)z^tZB77hcslZsNp9XOt`=Vx z)5X=z+rOuOHJj9rXOl)0HADs7&7q>P-@eSp*OU4C`Q^1e%SGYQ_UuEnXWG<;b8I+g zIsLetj+fJS%juxXtGNbVfuw1^5( zP*70OMY6;P*r9ZO0hfrMU|!2Uyh5>sw5WmmQeICTs@X;ELez|lS6UYClD z-2txdyY>kg+O<9Rz^?5-dn$58cfkoK(j&>!x#MQ zK!@4T)hFERxf@C@(-PM9ybb0@@)0x{5ByC{LMgDx8L0Zy^$4sVaNlD*y@`Z4r>9aO zaQ;HztRHjB5ZOW~yAp>nv}%0=WXY0=WXY0_!ReI|-TB3bfBjK-!!J;-k}RGYrm>rz4J> z`1W#sJ^wzR%&%|X{X6~Z>A3!4I&Q>K1B6O)To5NuWq*8~jc&%X53{SA&vD>#fBJFR zM@%@>HctPm9TDF?qk3T!~I1calEMNY^vriY8pqRxL@)9%UgrCY)HYT5QSA=HAg8 zE>^@+;nd}h+zeRk!(Bz%%D}APvLmbtC;mxL^#bxp$MDO>D*;f?Uf6;xo$W*^X+XFp z1+WGvNwX9!Q8G!I0$3zPD@s~*sWg&Q1+sc6YDrQn=HVs@>Xy|>5z3Na*~S|#M|;D0 fh2AZNWhrdML=Xo&rCj{9BDZDi8&M9UKZ5=M?@bKz literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index a27448a342a19..bf17e62985fe9 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -314,8 +314,19 @@ def test_readold_dta4(self, version, datapath): tm.assert_frame_equal(parsed, expected) # File containing strls - def test_read_dta12(self, datapath): - parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta")) + @pytest.mark.parametrize( + "file", + [ + "stata12_117", + "stata12_be_117", + "stata12_118", + "stata12_be_118", + "stata12_119", + "stata12_be_119", + ], + ) + def test_read_dta_strl(self, file, datapath): + parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) expected = DataFrame.from_records( [ [1, "abc", "abcdefghi"], @@ -325,10 +336,20 @@ def test_read_dta12(self, datapath): columns=["x", "y", "z"], ) - tm.assert_frame_equal(parsed_117, expected, check_dtype=False) + tm.assert_frame_equal(parsed, expected, check_dtype=False) - def test_read_dta18(self, datapath): - parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata14_118", + "stata14_be_118", + "stata14_119", + "stata14_be_119", + ], + ) + def test_read_dta118_119(self, file, datapath): + parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) parsed_118["Bytes"] = parsed_118["Bytes"].astype("O") expected = DataFrame.from_records( [ @@ -352,7 +373,7 @@ def test_read_dta18(self, datapath): for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) - with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr: + with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr: vl = rdr.variable_labels() vl_expected = { "Unicode_Cities_Strl": "Here are some strls with Ünicode chars", @@ -1799,8 +1820,18 @@ def test_gzip_writing(self, temp_file): reread = read_stata(gz, index_col="index") tm.assert_frame_equal(df, reread) - def test_unicode_dta_118(self, datapath): - unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta")) + # 117 is not included in this list as it uses ASCII strings + @pytest.mark.parametrize( + "file", + [ + "stata16_118", + "stata16_be_118", + "stata16_119", + "stata16_be_119", + ], + ) + def test_unicode_dta_118_119(self, file, datapath): + unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta")) columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"] values = [