Skip to content

Commit

Permalink
BUG: Unable to open Stata 118 or 119 format files saved in big-endian… (#58640)
Browse files Browse the repository at this point in the history

* BUG: Unable to open Stata 118 or 119 format files saved in big-endian format that contain strL data

* Rename test functions to make their purpose clearer
  • Loading branch information
cmjcharlton authored Jun 10, 2024
1 parent a787f45 commit 2d1e59d
Show file tree
Hide file tree
Showing 14 changed files with 43 additions and 12 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,7 @@ I/O
- Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)

Period
^^^^^^
Expand Down
7 changes: 3 additions & 4 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,14 +1600,13 @@ def _read_strls(self) -> None:
v_o = self._read_uint64()
else:
buf = self._path_or_buf.read(12)
# Only tested on little endian file on little endian machine.
# Only tested on little endian machine.
v_size = 2 if self._format_version == 118 else 3
if self._byteorder == "<":
buf = buf[0:v_size] + buf[4 : (12 - v_size)]
else:
# This path may not be correct, impossible to test
buf = buf[0:v_size] + buf[(4 + v_size) :]
v_o = struct.unpack("Q", buf)[0]
buf = buf[4 - v_size : 4] + buf[(4 + v_size) :]
v_o = struct.unpack(f"{self._byteorder}Q", buf)[0]
typ = self._read_uint8()
length = self._read_uint32()
va = self._path_or_buf.read(length)
Expand Down
Binary file added pandas/tests/io/data/stata/stata12_118.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata12_119.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata12_be_117.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata12_be_118.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata12_be_119.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata14_119.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata14_be_118.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata14_be_119.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata16_119.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata16_be_118.dta
Binary file not shown.
Binary file added pandas/tests/io/data/stata/stata16_be_119.dta
Binary file not shown.
47 changes: 39 additions & 8 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,19 @@ def test_readold_dta4(self, version, datapath):
tm.assert_frame_equal(parsed, expected)

# File containing strls
def test_read_dta12(self, datapath):
parsed_117 = self.read_dta(datapath("io", "data", "stata", "stata12_117.dta"))
@pytest.mark.parametrize(
"file",
[
"stata12_117",
"stata12_be_117",
"stata12_118",
"stata12_be_118",
"stata12_119",
"stata12_be_119",
],
)
def test_read_dta_strl(self, file, datapath):
parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
expected = DataFrame.from_records(
[
[1, "abc", "abcdefghi"],
Expand All @@ -325,10 +336,20 @@ def test_read_dta12(self, datapath):
columns=["x", "y", "z"],
)

tm.assert_frame_equal(parsed_117, expected, check_dtype=False)
tm.assert_frame_equal(parsed, expected, check_dtype=False)

def test_read_dta18(self, datapath):
parsed_118 = self.read_dta(datapath("io", "data", "stata", "stata14_118.dta"))
# 117 is not included in this list as it uses ASCII strings
@pytest.mark.parametrize(
"file",
[
"stata14_118",
"stata14_be_118",
"stata14_119",
"stata14_be_119",
],
)
def test_read_dta118_119(self, file, datapath):
parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
parsed_118["Bytes"] = parsed_118["Bytes"].astype("O")
expected = DataFrame.from_records(
[
Expand All @@ -352,7 +373,7 @@ def test_read_dta18(self, datapath):
for col in parsed_118.columns:
tm.assert_almost_equal(parsed_118[col], expected[col])

with StataReader(datapath("io", "data", "stata", "stata14_118.dta")) as rdr:
with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr:
vl = rdr.variable_labels()
vl_expected = {
"Unicode_Cities_Strl": "Here are some strls with Ünicode chars",
Expand Down Expand Up @@ -1799,8 +1820,18 @@ def test_gzip_writing(self, temp_file):
reread = read_stata(gz, index_col="index")
tm.assert_frame_equal(df, reread)

def test_unicode_dta_118(self, datapath):
unicode_df = self.read_dta(datapath("io", "data", "stata", "stata16_118.dta"))
# 117 is not included in this list as it uses ASCII strings
@pytest.mark.parametrize(
"file",
[
"stata16_118",
"stata16_be_118",
"stata16_119",
"stata16_be_119",
],
)
def test_unicode_dta_118_119(self, file, datapath):
unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))

columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"]
values = [
Expand Down

0 comments on commit 2d1e59d

Please sign in to comment.