From 96508ca1fe4c2eb45f873c4d44c942f290e21e30 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 13 Mar 2019 08:23:21 -0500 Subject: [PATCH] API: concat on sparse values API breaking change to `concat(List[DataFrame[Sparse]])` to return a DataFrame with sparse values, rather than a SparseDataFrame. Doing an outright break, rather than deprecation, because I have a followup PR deprecating SparseDataFrame. We return this internally in a few places (e.g. get_dummies on all-sparse data). Closes https://github.com/pandas-dev/pandas/issues/25702 --- doc/source/whatsnew/v0.25.0.rst | 36 ++++++++++++++++++++++++++++ pandas/core/dtypes/concat.py | 3 +-- pandas/core/groupby/generic.py | 8 +++++++ pandas/tests/reshape/test_reshape.py | 10 ++++++++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1840c47b4054f..db0a4d00f95d9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -64,6 +64,42 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] +Concatenating Sparse Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When passed DataFrames whose values are sparse, :func:`concat` will now return a +Series or DataFrame with sparse values, rather than a ``SparseDataFrame`` (:issue:`25702`). + +.. ipython:: python + + df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) + +*Previous Behavior:* + +.. code-block:: ipython + + In [2]: type(pd.concat([df, df])) + pandas.core.sparse.frame.SparseDataFrame + +*New Behavior:* + +.. ipython:: python + + type(pd.concat([df, df])) + + +This now matches the existing behavior of :func:`concat` on ``Series`` with sparse values. +:func:`concat` will continue to return a ``SparseDataFrame`` when all the inputs +are instances of ``SparseDataFrame``. 
+ +This change also affects routines using :func:`concat` internally, like :func:`get_dummies`, +which now returns a :class:`DataFrame` in all cases (previously a ``SparseDataFrame`` was +returned if all the columns were dummy encoded, and a :class:`DataFrame` otherwise). + +Providing any ``SparseSeries`` or ``SparseDataFrame`` to :func:`concat` will +cause a ``SparseSeries`` or ``SparseDataFrame`` to be returned, as before. + + .. _whatsnew_0250.api_breaking.deps: Increased minimum versions for dependencies diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 10e903acbe538..7e765a38cedcd 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -89,8 +89,7 @@ def _get_frame_result_type(result, objs): """ if (result.blocks and ( - all(is_sparse(b) for b in result.blocks) or - all(isinstance(obj, ABCSparseDataFrame) for obj in objs))): + any(isinstance(obj, ABCSparseDataFrame) for obj in objs))): from pandas.core.sparse.api import SparseDataFrame return SparseDataFrame else: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c5f9e52e07ecf..e665b4b646ed3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -40,6 +40,7 @@ import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series +from pandas.core.sparse.frame import SparseDataFrame from pandas.plotting._core import boxplot_frame_groupby @@ -198,9 +199,16 @@ def aggregate(self, arg, *args, **kwargs): assert not args and not kwargs result = self._aggregate_multiple_funcs( [arg], _level=_level, _axis=self.axis) + result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name) + + if isinstance(self.obj, SparseDataFrame): + # Backwards compat for groupby.agg() with sparse + # values. concat no longer converts DataFrame[Sparse] + # to SparseDataFrame, so we do it here. 
+ result = SparseDataFrame(result._data) except Exception: result = self._aggregate_generic(arg, *args, **kwargs) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index a5b6cffd1d86c..9d75f5f4f2ca4 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -577,6 +577,16 @@ def test_get_dummies_duplicate_columns(self, df): tm.assert_frame_equal(result, expected) + def test_get_dummies_all_sparse(self): + df = pd.DataFrame({"A": [1, 2]}) + result = pd.get_dummies(df, columns=['A'], sparse=True) + dtype = SparseDtype('uint8', 0) + expected = pd.DataFrame({ + 'A_1': SparseArray([1, 0], dtype=dtype), + 'A_2': SparseArray([0, 1], dtype=dtype), + }) + tm.assert_frame_equal(result, expected) + class TestCategoricalReshape(object):