Skip to content

Commit

Permalink
ENH: read_spss stores the metadata in df.attrs (#55472)
Browse files Browse the repository at this point in the history
* ENH: read_spss stores the metadata in df.attrs

* filter warning

* Make separate variable
  • Loading branch information
yuanx749 authored Oct 10, 2023
1 parent 66a54a3 commit b284101
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ Other enhancements

- :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
- :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
- :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/spss.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ def read_spss(
raise TypeError("usecols must be list-like.")
usecols = list(usecols) # pyreadstat requires a list

df, _ = pyreadstat.read_sav(
df, metadata = pyreadstat.read_sav(
stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
)
df.attrs = metadata.__dict__
if dtype_backend is not lib.no_default:
df = df.convert_dtypes(dtype_backend=dtype_backend)
return df
32 changes: 32 additions & 0 deletions pandas/tests/io/test_spss.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,35 @@ def test_invalid_dtype_backend():
)
with pytest.raises(ValueError, match=msg):
pd.read_spss("test", dtype_backend="numpy")


@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
def test_spss_metadata(datapath):
    """``read_spss`` stores the file metadata in ``DataFrame.attrs``."""
    # GH 54264
    # The sample file has a single labelled numeric column; name it once so
    # the per-column entries below are easy to scan.
    column = "VAR00002"
    one_label = {1.0: "This is one"}
    expected_attrs = {
        "column_names": [column],
        "column_labels": [None],
        "column_names_to_labels": {column: None},
        "file_encoding": "UTF-8",
        "number_columns": 1,
        "number_rows": 1,
        "variable_value_labels": {column: one_label},
        "value_labels": {"labels0": one_label},
        "variable_to_label": {column: "labels0"},
        "notes": [],
        "original_variable_types": {column: "F8.0"},
        "readstat_variable_types": {column: "double"},
        "table_name": None,
        "missing_ranges": {},
        "missing_user_values": {},
        "variable_storage_width": {column: 8},
        "variable_display_width": {column: 8},
        "variable_alignment": {column: "unknown"},
        "variable_measure": {column: "unknown"},
        "file_label": None,
        "file_format": "sav/zsav",
    }

    sav_path = datapath("io", "data", "spss", "labelled-num.sav")
    result = pd.read_spss(sav_path)

    assert result.attrs == expected_attrs

0 comments on commit b284101

Please sign in to comment.