From b020891dbd3c53028cdfef4ce990dced7283dc3b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 1 May 2018 11:09:09 -0400 Subject: [PATCH] API: categorical grouping will no longer return the cartesian product (#20583) * BUG: groupby with categorical and other columns closes #14942 --- doc/source/groupby.rst | 74 +- doc/source/whatsnew/v0.23.0.txt | 52 ++ pandas/conftest.py | 11 + pandas/core/arrays/categorical.py | 31 +- pandas/core/generic.py | 11 +- pandas/core/groupby/groupby.py | 99 ++- pandas/core/indexes/category.py | 4 +- pandas/core/reshape/pivot.py | 34 +- pandas/tests/frame/test_sorting.py | 2 +- pandas/tests/groupby/aggregate/test_cython.py | 23 +- pandas/tests/groupby/aggregate/test_other.py | 6 +- pandas/tests/groupby/test_categorical.py | 705 ++++++++++-------- pandas/tests/groupby/test_function.py | 6 +- pandas/tests/groupby/test_grouping.py | 25 +- pandas/tests/reshape/test_pivot.py | 84 +-- 15 files changed, 748 insertions(+), 419 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 407fad39ba232..3616a7e1b41d2 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -91,10 +91,10 @@ The mapping can be specified many different ways: - A Python function, to be called on each of the axis labels. - A list or NumPy array of the same length as the selected axis. - A dict or ``Series``, providing a ``label -> group name`` mapping. - - For ``DataFrame`` objects, a string indicating a column to be used to group. + - For ``DataFrame`` objects, a string indicating a column to be used to group. Of course ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``, but it makes life simpler. - - For ``DataFrame`` objects, a string indicating an index level to be used to + - For ``DataFrame`` objects, a string indicating an index level to be used to group. - A list of any of the above things. 
@@ -120,7 +120,7 @@ consider the following ``DataFrame``: 'D' : np.random.randn(8)}) df -On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. +On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. We could naturally group by either the ``A`` or ``B`` columns, or both: .. ipython:: python @@ -360,8 +360,8 @@ Index level names may be specified as keys directly to ``groupby``. DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once you have created the GroupBy object from a DataFrame, you might want to do -something different for each of the columns. Thus, using ``[]`` similar to +Once you have created the GroupBy object from a DataFrame, you might want to do +something different for each of the columns. Thus, using ``[]`` similar to getting a column from a DataFrame, you can do: .. ipython:: python @@ -421,7 +421,7 @@ statement if you wish: ``for (k1, k2), group in grouped:``. Selecting a group ----------------- -A single group can be selected using +A single group can be selected using :meth:`~pandas.core.groupby.DataFrameGroupBy.get_group`: .. ipython:: python @@ -444,8 +444,8 @@ perform a computation on the grouped data. These operations are similar to the :ref:`aggregating API `, :ref:`window functions API `, and :ref:`resample API `. -An obvious one is aggregation via the -:meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` or equivalently +An obvious one is aggregation via the +:meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` or equivalently :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` method: .. 
ipython:: python @@ -517,12 +517,12 @@ Some common aggregating functions are tabulated below: :meth:`~pd.core.groupby.DataFrameGroupBy.nth`;Take nth value, or a subset if n is a list :meth:`~pd.core.groupby.DataFrameGroupBy.min`;Compute min of group values :meth:`~pd.core.groupby.DataFrameGroupBy.max`;Compute max of group values - -The aggregating functions above will exclude NA values. Any function which + +The aggregating functions above will exclude NA values. Any function which reduces a :class:`Series` to a scalar value is an aggregation function and will work, a trivial example is ``df.groupby('A').agg(lambda ser: 1)``. Note that -:meth:`~pd.core.groupby.DataFrameGroupBy.nth` can act as a reducer *or* a +:meth:`~pd.core.groupby.DataFrameGroupBy.nth` can act as a reducer *or* a filter, see :ref:`here `. .. _groupby.aggregate.multifunc: @@ -732,7 +732,7 @@ and that the transformed data contains no NAs. .. note:: Some functions will automatically transform the input when applied to a - GroupBy object, but returning an object of the same shape as the original. + GroupBy object, but returning an object of the same shape as the original. Passing ``as_index=False`` will not affect these transformation methods. For example: ``fillna, ffill, bfill, shift.``. @@ -926,7 +926,7 @@ The dimension of the returned result can also change: In [11]: grouped.apply(f) -``apply`` on a Series can operate on a returned value from the applied function, +``apply`` on a Series can operate on a returned value from the applied function, that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python @@ -984,20 +984,48 @@ will be (silently) dropped. 
Thus, this does not pose any problems: df.groupby('A').std() -Note that ``df.groupby('A').colname.std().`` is more efficient than +Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function -is only interesting over one column (here ``colname``), it may be filtered +is only interesting over one column (here ``colname``), it may be filtered *before* applying the aggregation function. +.. _groupby.observed: + +Handling of (un)observed Categorical values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When using a ``Categorical`` grouper (as a single grouper or as part of multiple groupers), the ``observed`` keyword +controls whether to return a cartesian product of all possible grouper values (``observed=False``) or only those +that are actually observed (``observed=True``). + +Show all values: + +.. ipython:: python + + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + +Show only the observed values: + +.. ipython:: python + + pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count() + +The returned dtype of the grouped result will *always* include *all* of the categories that were grouped. + +.. ipython:: python + + s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count() + s.index.dtype + .. _groupby.missing: NA and NaT group handling ~~~~~~~~~~~~~~~~~~~~~~~~~ -If there are any NaN or NaT values in the grouping key, these will be -automatically excluded. In other words, there will never be an "NA group" or -"NaT group". This was not the case in older versions of pandas, but users were -generally discarding the NA group anyway (and supporting it was an +If there are any NaN or NaT values in the grouping key, these will be +automatically excluded. In other words, there will never be an "NA group" or +"NaT group". 
This was not the case in older versions of pandas, but users were +generally discarding the NA group anyway (and supporting it was an implementation headache). Grouping with ordered factors @@ -1084,8 +1112,8 @@ This shows the first or last n rows from each group. Taking the nth row of each group ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To select from a DataFrame or Series the nth item, use -:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and +To select from a DataFrame or Series the nth item, use +:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and will return a single row (or no row) per group if you pass an int for n: .. ipython:: python @@ -1153,7 +1181,7 @@ Enumerate groups .. versionadded:: 0.20.2 To see the ordering of the groups (as opposed to the order of rows -within a group given by ``cumcount``) you can use +within a group given by ``cumcount``) you can use :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`. @@ -1273,7 +1301,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on Multi-column factorization ~~~~~~~~~~~~~~~~~~~~~~~~~~ -By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract +By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract information about the groups in a way similar to :func:`factorize` (as described further in the :ref:`reshaping API `) but which applies naturally to multiple columns of mixed type and different diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7ea10deb65cef..d3bb28c2aee65 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -396,6 +396,58 @@ documentation. If you build an extension array, publicize it on our .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest/ +.. 
_whatsnew_0230.enhancements.categorical_grouping: + +Categorical Groupers has gained an observed keyword +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for +each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward +compatible (generate a cartesian product). (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`) + + +.. ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 + df + +To show all values, the previous behavior: + +.. ipython:: python + + df.groupby(['A', 'B', 'C'], observed=False).count() + + +To show only observed values: + +.. ipython:: python + + df.groupby(['A', 'B', 'C'], observed=True).count() + +For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword: + +.. ipython:: python + + cat1 = pd.Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = pd.Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df + +.. ipython:: python + + pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=True) + pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=False) + + .. 
_whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/conftest.py b/pandas/conftest.py index 559b5e44631b6..c4aab1b632b00 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,6 +66,17 @@ def ip(): return InteractiveShell() +@pytest.fixture(params=[True, False, None]) +def observed(request): + """ pass in the observed keyword to groupby for [True, False, None] + This indicates whether categoricals should return values for + values which are not in the grouper [False / None], or only values which + appear in the grouper [True]. [None] is supported for future compatibility + if we decide to change the default (and would need to warn if this + parameter is not passed)""" + return request.param + + @pytest.fixture(params=[None, 'gzip', 'bz2', 'zip', pytest.param('xz', marks=td.skip_if_no_lzma)]) def compression(request): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 517c21cc1bc3a..f91782459df67 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -647,8 +647,13 @@ def _set_categories(self, categories, fastpath=False): self._dtype = new_dtype - def _codes_for_groupby(self, sort): + def _codes_for_groupby(self, sort, observed): """ + Code the categories to ensure we can groupby for categoricals. + + If observed=True, we return a new Categorical with the observed + categories only. + If sort=False, return a copy of self, coded with categories as returned by .unique(), followed by any categories not appearing in the data. If sort=True, return self. @@ -661,6 +666,8 @@ def _codes_for_groupby(self, sort): ---------- sort : boolean The value of the sort parameter groupby was called with. + observed : boolean + Account only for the observed values Returns ------- @@ -671,6 +678,26 @@ def _codes_for_groupby(self, sort): categories in the original order. 
""" + # we only care about observed values + if observed: + unique_codes = unique1d(self.codes) + cat = self.copy() + + take_codes = unique_codes[unique_codes != -1] + if self.ordered: + take_codes = np.sort(take_codes) + + # we recode according to the uniques + categories = self.categories.take(take_codes) + codes = _recode_for_categories(self.codes, + self.categories, + categories) + + # return a new categorical that maps our new codes + # and categories + dtype = CategoricalDtype(categories, ordered=self.ordered) + return type(self)(codes, dtype=dtype, fastpath=True) + # Already sorted according to self.categories; all is fine if sort: return self @@ -2161,7 +2188,7 @@ def unique(self): # exclude nan from indexer for categories take_codes = unique_codes[unique_codes != -1] if self.ordered: - take_codes = sorted(take_codes) + take_codes = np.sort(take_codes) return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index af19acbb416ee..e68662037b43d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6599,7 +6599,7 @@ def clip_lower(self, threshold, axis=None, inplace=False): axis=axis, inplace=inplace) def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): + group_keys=True, squeeze=False, observed=None, **kwargs): """ Group series using mapper (dict or key function, apply given function to group, return result as series) or by a series of columns. @@ -6632,6 +6632,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, squeeze : boolean, default False reduce the dimensionality of the return type if possible, otherwise return a consistent type + observed : boolean, default None + if True: only show observed values for categorical groupers. + if False: show all values for categorical groupers. 
+ if None: if any categorical groupers, show a FutureWarning, + default to False. + + .. versionadded:: 0.23.0 Returns ------- @@ -6665,7 +6672,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, axis = self._get_axis_number(axis) return groupby(self, by=by, axis=axis, level=level, as_index=as_index, sort=sort, group_keys=group_keys, squeeze=squeeze, - **kwargs) + observed=observed, **kwargs) def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8c20d62117e25..8613ab4d8c59d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, - sort=True, group_keys=True, squeeze=False, **kwargs): + sort=True, group_keys=True, squeeze=False, + observed=None, **kwargs): self._selection = selection @@ -576,6 +577,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, self.sort = sort self.group_keys = group_keys self.squeeze = squeeze + self.observed = observed self.mutated = kwargs.pop('mutated', False) if grouper is None: @@ -583,6 +585,7 @@ def __init__(self, obj, keys=None, axis=0, level=None, axis=axis, level=level, sort=sort, + observed=observed, mutated=self.mutated) self.obj = obj @@ -1661,10 +1664,11 @@ def nth(self, n, dropna=None): if dropna not in ['any', 'all']: if isinstance(self._selected_obj, Series) and dropna is True: - warnings.warn("the dropna='%s' keyword is deprecated," + warnings.warn("the dropna={dropna} keyword is deprecated," "use dropna='all' instead. " "For a Series groupby, dropna must be " - "either None, 'any' or 'all'." 
% (dropna), + "either None, 'any' or 'all'.".format( + dropna=dropna), FutureWarning, stacklevel=2) dropna = 'all' @@ -2331,27 +2335,30 @@ def ngroups(self): def recons_labels(self): comp_ids, obs_ids, _ = self.group_info labels = (ping.labels for ping in self.groupings) - return decons_obs_group_ids(comp_ids, - obs_ids, self.shape, labels, xnull=True) + return decons_obs_group_ids( + comp_ids, obs_ids, self.shape, labels, xnull=True) @cache_readonly def result_index(self): if not self.compressed and len(self.groupings) == 1: - return self.groupings[0].group_index.rename(self.names[0]) - - return MultiIndex(levels=[ping.group_index for ping in self.groupings], - labels=self.recons_labels, - verify_integrity=False, - names=self.names) + return self.groupings[0].result_index.rename(self.names[0]) + + labels = self.recons_labels + levels = [ping.result_index for ping in self.groupings] + result = MultiIndex(levels=levels, + labels=labels, + verify_integrity=False, + names=self.names) + return result def get_group_levels(self): if not self.compressed and len(self.groupings) == 1: - return [self.groupings[0].group_index] + return [self.groupings[0].result_index] name_list = [] for ping, labels in zip(self.groupings, self.recons_labels): labels = _ensure_platform_int(labels) - levels = ping.group_index.take(labels) + levels = ping.result_index.take(labels) name_list.append(levels) @@ -2883,6 +2890,8 @@ class Grouping(object): obj : name : level : + observed : boolean, default False + If we are a Categorical, use the observed values in_axis : if the Grouping is a column in self.obj and hence among Groupby.exclusions list @@ -2898,14 +2907,16 @@ class Grouping(object): """ def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, in_axis=False): + sort=True, observed=None, in_axis=False): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) + self.all_grouper = None self.index = index self.sort = sort self.obj = 
obj + self.observed = observed self.in_axis = in_axis # right place for this? @@ -2953,17 +2964,30 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - self.grouper = self.grouper._codes_for_groupby(self.sort) + # observed can be True/False/None + # we treat None as False. If in the future + # we need to warn if observed is not passed + # then we have this option + # gh-20583 + + self.all_grouper = self.grouper + self.grouper = self.grouper._codes_for_groupby( + self.sort, observed) + categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._labels = self.grouper.codes + if observed: + codes = algorithms.unique1d(self.grouper.codes) + else: + codes = np.arange(len(categories)) - c = self.grouper.categories self._group_index = CategoricalIndex( - Categorical.from_codes(np.arange(len(c)), - categories=c, - ordered=self.grouper.ordered)) + Categorical.from_codes( + codes=codes, + categories=categories, + ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): @@ -3022,6 +3046,22 @@ def labels(self): self._make_labels() return self._labels + @cache_readonly + def result_index(self): + if self.all_grouper is not None: + all_categories = self.all_grouper.categories + + # we re-order to the original category orderings + if self.sort: + return self.group_index.set_categories(all_categories) + + # we are not sorting, so add unobserved to the end + categories = self.group_index.categories + return self.group_index.add_categories( + all_categories[~all_categories.isin(categories)]) + + return self.group_index + @property def group_index(self): if self._group_index is None: @@ -3048,7 +3088,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - mutated=False, validate=True): + observed=None, mutated=False, validate=True): """ create and return a 
BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -3065,6 +3105,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, are and then creates a Grouping for each one, combined into a BaseGrouper. + If observed & we have a categorical grouper, only show the observed + values + If validate, then check for key/level overlaps """ @@ -3243,6 +3286,7 @@ def is_in_obj(gpr): name=name, level=level, sort=sort, + observed=observed, in_axis=in_axis) \ if not isinstance(gpr, Grouping) else gpr @@ -4154,7 +4198,7 @@ def first_not_none(values): not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: if len(self.grouper.groupings) > 1: - key_index = MultiIndex.from_tuples(keys, names=key_names) + key_index = self.grouper.result_index else: ping = self.grouper.groupings[0] @@ -4244,8 +4288,9 @@ def first_not_none(values): # normally use vstack as its faster than concat # and if we have mi-columns - if isinstance(v.index, - MultiIndex) or key_index is None: + if (isinstance(v.index, MultiIndex) or + key_index is None or + isinstance(key_index, MultiIndex)): stacked_values = np.vstack(map(np.asarray, values)) result = DataFrame(stacked_values, index=key_index, columns=index) @@ -4696,6 +4741,14 @@ def _reindex_output(self, result): This can re-expand the output space """ + + # TODO(jreback): remove completely + # when observed parameter is defaulted to True + # gh-20583 + + if self.observed: + return result + groupings = self.grouper.groupings if groupings is None: return result diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 71caa098c7a28..3ffef5804acf7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -782,9 +782,9 @@ def _concat_same_dtype(self, to_concat, name): result.name = name return result - def _codes_for_groupby(self, sort): + def _codes_for_groupby(self, sort, observed): """ Return a Categorical adjusted for groupby """ - return 
self.values._codes_for_groupby(sort) + return self.values._codes_for_groupby(sort, observed) @classmethod def _add_comparison_methods(cls): diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 74a9b59d3194a..39fb57e68c9c0 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -79,7 +79,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys) + grouped = data.groupby(keys, observed=dropna) agged = grouped.agg(aggfunc) table = agged @@ -120,6 +120,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, + observed=dropna, margins_name=margins_name, fill_value=fill_value) # discard the top level @@ -138,7 +139,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', def _add_margins(table, data, values, rows, cols, aggfunc, - margins_name='All', fill_value=None): + observed=None, margins_name='All', fill_value=None): if not isinstance(margins_name, compat.string_types): raise ValueError('margins_name argument must be a string') @@ -168,6 +169,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, if values: marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, + observed, grand_margin, margins_name) if not isinstance(marginal_result_set, tuple): @@ -175,7 +177,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc, result, margin_keys, row_margin = marginal_result_set else: marginal_result_set = _generate_marginal_results_without_values( - table, data, rows, cols, aggfunc, margins_name) + table, data, rows, cols, aggfunc, observed, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set @@ -230,6 +232,7 @@ def 
_compute_grand_margin(data, values, aggfunc, def _generate_marginal_results(table, data, values, rows, cols, aggfunc, + observed, grand_margin, margins_name='All'): if len(cols) > 0: @@ -241,10 +244,13 @@ def _all_key(key): return (key, margins_name) + ('',) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows + values].groupby(rows).agg(aggfunc) + margin = data[rows + values].groupby( + rows, observed=observed).agg(aggfunc) cat_axis = 1 - for key, piece in table.groupby(level=0, axis=cat_axis): + for key, piece in table.groupby(level=0, + axis=cat_axis, + observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! @@ -264,7 +270,9 @@ def _all_key(key): else: margin = grand_margin cat_axis = 0 - for key, piece in table.groupby(level=0, axis=cat_axis): + for key, piece in table.groupby(level=0, + axis=cat_axis, + observed=observed): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) @@ -279,7 +287,8 @@ def _all_key(key): margin_keys = table.columns if len(cols) > 0: - row_margin = data[cols + values].groupby(cols).agg(aggfunc) + row_margin = data[cols + values].groupby( + cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack @@ -293,7 +302,7 @@ def _all_key(key): def _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, - margins_name='All'): + observed, margins_name='All'): if len(cols) > 0: # need to "interleave" the margins margin_keys = [] @@ -304,14 +313,17 @@ def _all_key(): return (margins_name, ) + ('', ) * (len(cols) - 1) if len(rows) > 0: - margin = data[rows].groupby(rows).apply(aggfunc) + margin = data[rows].groupby(rows, + observed=observed).apply(aggfunc) all_key = _all_key() table[all_key] = margin result = table margin_keys.append(all_key) else: - margin = data.groupby(level=0, axis=0).apply(aggfunc) + margin = data.groupby(level=0, + axis=0, + observed=observed).apply(aggfunc) all_key = _all_key() 
table[all_key] = margin result = table @@ -322,7 +334,7 @@ def _all_key(): margin_keys = table.columns if len(cols): - row_margin = data[cols].groupby(cols).apply(aggfunc) + row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc) else: row_margin = Series(np.nan, index=result.columns) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 5bd239f8a3034..b60eb89e87da5 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -573,7 +573,7 @@ def test_sort_index_intervalindex(self): bins=[-3, -0.5, 0, 0.5, 3]) model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2']) - result = model.groupby(['X1', 'X2']).mean().unstack() + result = model.groupby(['X1', 'X2'], observed=True).mean().unstack() expected = IntervalIndex.from_tuples( [(-3.0, -0.5), (-0.5, 0.0), (0.0, 0.5), (0.5, 3.0)], diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 80383c895a5e5..48a45e93e1e8e 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -158,35 +158,46 @@ def test__cython_agg_general(op, targop): ('min', np.min), ('max', np.max), ] ) -def test_cython_agg_empty_buckets(op, targop): +def test_cython_agg_empty_buckets(op, targop, observed): df = pd.DataFrame([11, 12, 13]) grps = range(0, 55, 5) # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. 
- result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + g = df.groupby(pd.cut(df[0], grps), observed=observed) + result = g._cython_agg_general(op) + + g = df.groupby(pd.cut(df[0], grps), observed=observed) + expected = g.agg(lambda x: targop(x)) tm.assert_frame_equal(result, expected) -def test_cython_agg_empty_buckets_nanops(): +def test_cython_agg_empty_buckets_nanops(observed): # GH-18869 can't call nanops on empty groups, so hardcode expected # for these df = pd.DataFrame([11, 12, 13], columns=['a']) grps = range(0, 25, 5) # add / sum - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + result = df.groupby(pd.cut(df['a'], grps), + observed=observed)._cython_agg_general('add') intervals = pd.interval_range(0, 20, freq=5) expected = pd.DataFrame( {"a": [0, 0, 36, 0]}, index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + if observed: + expected = expected[expected.a != 0] + tm.assert_frame_equal(result, expected) # prod - result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + result = df.groupby(pd.cut(df['a'], grps), + observed=observed)._cython_agg_general('prod') expected = pd.DataFrame( {"a": [1, 1, 1716, 1]}, index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + if observed: + expected = expected[expected.a != 1] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index a10f7f6e46210..34489051efc18 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -488,15 +488,17 @@ def test_agg_structs_series(structure, expected): @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") -def test_agg_category_nansum(): +def test_agg_category_nansum(observed): categories = ['a', 'b', 'c'] df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], 
categories=categories), 'B': [1, 2, 3]}) - result = df.groupby("A").B.agg(np.nansum) + result = df.groupby("A", observed=observed).B.agg(np.nansum) expected = pd.Series([3, 3, 0], index=pd.CategoricalIndex(['a', 'b', 'c'], categories=categories, name='A'), name='B') + if observed: + expected = expected[expected != 0] tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 160b60e69f39d..e0793b8e1bd64 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -5,16 +5,43 @@ import pytest import numpy as np -from numpy import nan - import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series, Interval, qcut) + DataFrame, Categorical, Series, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm -def test_groupby(): +def cartesian_product_for_groupers(result, args, names): + """ Reindex to a cartesian product for the groupers, + preserving the nature (Categorical) of each grouper """ + + def f(a): + if isinstance(a, (CategoricalIndex, Categorical)): + categories = a.categories + a = Categorical.from_codes(np.arange(len(categories)), + categories=categories, + ordered=a.ordered) + return a + + index = pd.MultiIndex.from_product(map(f, args), names=names) + return result.reindex(index).sort_index() + + +def test_apply_use_categorical_name(df): + cats = qcut(df.C, 4) + + def get_stats(group): + return {'min': group.min(), + 'max': group.max(), + 'count': group.count(), + 'mean': group.mean()} + + result = df.groupby(cats, observed=False).D.apply(get_stats) + assert result.index.names[0] == 'C' + + +def test_basic(): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) @@ -22,56 +49,29 @@ def test_groupby(): exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) 
expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) - result = data.groupby("b").mean() + result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper - gb = df.groupby("A") + gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) - # multiple groupers - gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, - np.nan, np.nan, np.nan]}, - index=exp_index) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers with a non-cat - df = df.copy() - df['C'] = ['foo', 'bar'] * 2 - gb = df.groupby(['A', 'B', 'C']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True), - ['foo', 'bar']], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sort_index() - expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] - result = gb.sum() - tm.assert_frame_equal(result, expected) - # GH 8623 x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. 
Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical(x.person_name) - g = x.groupby(['person_id']) + g = x.groupby(['person_id'], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[['person_name']]) @@ -93,36 +93,48 @@ def f(x): df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - result = df.a.groupby(c).transform(sum) + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), + df['a']) + tm.assert_frame_equal( + df.groupby(c, observed=False).transform(sum), + df[['a']]) tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) + df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), + df[['a']]) # Filter - tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) - tm.assert_frame_equal(df.groupby(c).filter(np.all), df) + tm.assert_series_equal( + df.a.groupby(c, observed=False).filter(np.all), + df['a']) + tm.assert_frame_equal( + df.groupby(c, observed=False).filter(np.all), + df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - result = df.a.groupby(c).transform(sum) + result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), + df['a']) tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + df.groupby(c, observed=False).transform(sum), + df[['a']]) + tm.assert_frame_equal( + df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), + df[['a']]) 
# GH 9603 df = DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) - result = df.groupby(c).apply(len) + result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex( c.values.categories, ordered=c.values.ordered) @@ -130,36 +142,56 @@ def f(x): expected.index.name = 'a' tm.assert_series_equal(result, expected) + # more basic + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=100) -def test_groupby_sort(): + cats = Categorical.from_codes(codes, levels, ordered=True) - # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby - # This should result in a properly sorted Series so that the plot - # has a sorted x axis - # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + data = DataFrame(np.random.randn(100, 4)) - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) + result = data.groupby(cats, observed=False).mean() - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) + expected = data.groupby(np.asarray(cats), observed=False).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) + expected = expected.reindex(exp_idx) - res = df.groupby(['value_group'])['value_group'].count() - exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] - exp.index = CategoricalIndex(exp.index, name=exp.index.name) - tm.assert_series_equal(res, exp) + assert_frame_equal(result, expected) + grouped = data.groupby(cats, observed=False) + desc_result = grouped.describe() -def test_level_groupby_get_group(): + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 
'bar', 'baz', 'qux']) + expected = ord_data.groupby( + exp_cats, sort=False, observed=False).describe() + assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(0)), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(1)), exp) + + +def test_level_get_group(observed): # GH15155 df = DataFrame(data=np.arange(2, 22, 2), index=MultiIndex( levels=[pd.CategoricalIndex(["a", "b"]), range(10)], labels=[[0] * 5 + [1] * 5, range(10)], names=["Index1", "Index2"])) - g = df.groupby(level=["Index1"]) + g = df.groupby(level=["Index1"], observed=observed) # expected should equal test.loc[["a"]] # GH15166 @@ -173,94 +205,217 @@ def test_level_groupby_get_group(): assert_frame_equal(result, expected) -def test_apply_use_categorical_name(df): - cats = qcut(df.C, 4) +@pytest.mark.parametrize('ordered', [True, False]) +def test_apply(ordered): + # GH 10138 - def get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} + dense = Categorical(list('abc'), ordered=ordered) + + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense'], observed=True) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_arrays( + [missing, dense], names=['missing', 'dense']) + expected = DataFrame([0, 1, 2.], + index=idx, + columns=['values']) + + result = grouped.apply(lambda x: np.mean(x)) + assert_frame_equal(result, expected) - result = df.groupby(cats).D.apply(get_stats) - assert result.index.names[0] 
== 'C' + # we coerce back to ints + expected = expected.astype('int') + result = grouped.mean() + assert_frame_equal(result, expected) + result = grouped.agg(np.mean) + assert_frame_equal(result, expected) -def test_apply_categorical_data(): - # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) - - -def test_groupby_categorical(): - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=100) + # but for transform we should still get back the original index + idx = MultiIndex.from_arrays([missing, dense], + names=['missing', 'dense']) + expected = Series(1, index=idx) + result = grouped.apply(lambda x: 1) + assert_series_equal(result, expected) + + +def test_observed(observed): + # multiple groupers, don't re-expand the output space + # of the grouper + # gh-14942 (implement) + # gh-10132 (back-compat) + # gh-8138 (back-compat) + # gh-8869 + + cat1 = Categorical(["a", "a", "b", "b"], + 
categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + df['C'] = ['foo', 'bar'] * 2 - cats = Categorical.from_codes(codes, levels, ordered=True) + # multiple groupers with a non-cat + gb = df.groupby(['A', 'B', 'C'], observed=observed) + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2, ['foo', 'bar'] * 2], + names=['A', 'B', 'C']) + expected = DataFrame({'values': Series( + [1, 2, 3, 4], index=exp_index)}).sort_index() + result = gb.sum() + if not observed: + expected = cartesian_product_for_groupers( + expected, + [cat1, cat2, ['foo', 'bar']], + list('ABC')) - data = DataFrame(np.random.randn(100, 4)) + tm.assert_frame_equal(result, expected) - result = data.groupby(cats).mean() + gb = df.groupby(['A', 'B'], observed=observed) + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, 3, 4]}, + index=exp_index) + result = gb.sum() + if not observed: + expected = cartesian_product_for_groupers( + expected, + [cat1, cat2], + list('AB')) - expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) - expected = expected.reindex(exp_idx) + tm.assert_frame_equal(result, expected) - assert_frame_equal(result, expected) + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + df = pd.DataFrame(d) - grouped = data.groupby(cats) - desc_result = grouped.describe() + # Grouping on a single column + groups_single_key = df.groupby("cat", observed=observed) + result = groups_single_key.mean() - idx = cats.codes.argsort() - ord_labels = np.asarray(cats).take(idx) - ord_data = data.take(idx) + exp_index = pd.CategoricalIndex(list('ab'), name="cat", + 
categories=list('abc'), + ordered=True) + expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, + index=exp_index) + if not observed: + index = pd.CategoricalIndex(list('abc'), name="cat", + categories=list('abc'), + ordered=True) + expected = expected.reindex(index) - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby(exp_cats, sort=False).describe() - assert_frame_equal(desc_result, expected) + tm.assert_frame_equal(result, expected) - # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) - exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) + # Grouping on two columns + groups_double_key = df.groupby(["cat", "ints"], observed=observed) + result = groups_double_key.agg('mean') + expected = DataFrame( + {"val": [10, 30, 20, 40], + "cat": pd.Categorical(['a', 'a', 'b', 'b'], + categories=['a', 'b', 'c'], + ordered=True), + "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) + if not observed: + expected = cartesian_product_for_groupers( + expected, + [df.cat.values, [1, 2]], + ['cat', 'ints']) + + tm.assert_frame_equal(result, expected) + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = df[(df.cat == c) & (df.ints == i)] + assert_frame_equal(result, expected) + + # gh-8869 + # with as_index + d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], + 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} + df = pd.DataFrame(d) + cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) + df['range'] = cat + groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) + result = groups.agg('mean') + + groups2 = df.groupby(['range', 'baz'], 
as_index=True, observed=observed) + expected = groups2.agg('mean').reset_index() + tm.assert_frame_equal(result, expected) + + +def test_observed_codes_remap(observed): + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + df = pd.DataFrame(d) + values = pd.cut(df['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = df.groupby([values, 'C2'], observed=observed) + + idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], + names=["cat", "C2"]) + expected = DataFrame({"C1": [3, 3, 4, 5], + "C3": [10, 100, 200, 34]}, index=idx) + if not observed: + expected = cartesian_product_for_groupers( + expected, + [values.values, [1, 2, 3, 4]], + ['cat', 'C2']) + + result = groups_double_key.agg('mean') + tm.assert_frame_equal(result, expected) + + +def test_observed_perf(): + # we create a cartesian product, so this is + # non-performant if we don't use observed values + # gh-14942 + df = DataFrame({ + 'cat': np.random.randint(0, 255, size=30000), + 'int_id': np.random.randint(0, 255, size=30000), + 'other_id': np.random.randint(0, 10000, size=30000), + 'foo': 0}) + df['cat'] = df.cat.astype(str).astype('category') -def test_groupby_datetime_categorical(): + grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True) + result = grouped.count() + assert result.index.levels[0].nunique() == df.cat.nunique() + assert result.index.levels[1].nunique() == df.int_id.nunique() + assert result.index.levels[2].nunique() == df.other_id.nunique() + + +def test_observed_groups(observed): + # gh-20583 + # test that we have the appropriate groups + + cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) + df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]}) + g = df.groupby('cat', observed=observed) + + result = g.groups + if observed: + expected = {'a': Index([0, 2], dtype='int64'), + 'c': Index([1], dtype='int64')} + else: + expected = {'a': Index([0, 2], dtype='int64'), + 'b': Index([], dtype='int64'), + 'c': Index([1], dtype='int64')} + + 
tm.assert_dict_equal(result, expected) + + +def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4) codes = np.random.randint(0, 4, size=100) @@ -268,9 +423,9 @@ def test_groupby_datetime_categorical(): cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() + result = data.groupby(cats, observed=False).mean() - expected = data.groupby(np.asarray(cats)).mean() + expected = data.groupby(np.asarray(cats), observed=False).mean() expected = expected.reindex(levels) expected.index = CategoricalIndex(expected.index, categories=expected.index, @@ -278,13 +433,13 @@ def test_groupby_datetime_categorical(): assert_frame_equal(result, expected) - grouped = data.groupby(cats) + grouped = data.groupby(cats, observed=False) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = cats.take_nd(idx) ord_data = data.take(idx) - expected = ord_data.groupby(ord_labels).describe() + expected = ord_data.groupby(ord_labels, observed=False).describe() assert_frame_equal(desc_result, expected) tm.assert_index_equal(desc_result.index, expected.index) tm.assert_index_equal( @@ -303,7 +458,7 @@ def test_groupby_datetime_categorical(): .get_level_values(1)), exp) -def test_groupby_categorical_index(): +def test_categorical_index(): s = np.random.RandomState(12345) levels = ['foo', 'bar', 'baz', 'qux'] @@ -315,23 +470,23 @@ def test_groupby_categorical_index(): df['cats'] = cats # with a cat index - result = df.set_index('cats').groupby(level=0).sum() - expected = df[list('abcd')].groupby(cats.codes).sum() + result = df.set_index('cats').groupby(level=0, observed=False).sum() + expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( Categorical.from_codes( [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) # with a cat column, should produce a cat index - 
result = df.groupby('cats').sum() - expected = df[list('abcd')].groupby(cats.codes).sum() + result = df.groupby('cats', observed=False).sum() + expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( Categorical.from_codes( [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) -def test_groupby_describe_categorical_columns(): +def test_describe_categorical_columns(): # GH 11558 cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], categories=['foo', 'bar', 'baz', 'qux'], @@ -343,14 +498,15 @@ def test_groupby_describe_categorical_columns(): tm.assert_categorical_equal(result.stack().columns.values, cats.values) -def test_groupby_unstack_categorical(): +def test_unstack_categorical(): # GH11558 (example is taken from the original issue) df = pd.DataFrame({'a': range(10), 'medium': ['A', 'B'] * 5, 'artist': list('XYXXY') * 2}) df['medium'] = df['medium'].astype('category') - gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + gcat = df.groupby( + ['artist', 'medium'], observed=False)['a'].count().unstack() result = gcat.describe() exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, @@ -363,7 +519,7 @@ def test_groupby_unstack_categorical(): tm.assert_series_equal(result, expected) -def test_groupby_bins_unequal_len(): +def test_bins_unequal_len(): # GH3011 series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) bins = pd.cut(series.dropna().values, 4) @@ -374,47 +530,45 @@ def f(): pytest.raises(ValueError, f) -def test_groupby_multi_categorical_as_index(): +def test_as_index(): # GH13204 df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), 'A': [10, 11, 11], 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + result = df.groupby(['cat', 'A'], 
as_index=False, observed=True).sum() + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 11], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # function grouper f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + result = df.groupby(['cat', f], as_index=False, observed=True).sum() + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 22], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) # another not in-axis grouper s = Series(['a', 'b', 'b'], name='cat2') - result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) + result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) # GH18872: conflicting names in desired index - pytest.raises(ValueError, lambda: df.groupby(['cat', - s.rename('cat')]).sum()) + with pytest.raises(ValueError): + df.groupby(['cat', s.rename('cat')], observed=True).sum() # is original index dropped? 
- expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - group_columns = ['cat', 'A'] + expected = DataFrame( + {'cat': Categorical([1, 2], categories=df.cat.cat.categories), + 'A': [10, 11], + 'B': [101, 205]}, + columns=['cat', 'A', 'B']) for name in [None, 'X', 'B', 'cat']: df.index = Index(list("abc"), name=name) @@ -422,15 +576,17 @@ def test_groupby_multi_categorical_as_index(): if name in group_columns and name in df.index.names: with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.groupby(group_columns, as_index=False).sum() + result = df.groupby( + group_columns, as_index=False, observed=True).sum() else: - result = df.groupby(group_columns, as_index=False).sum() + result = df.groupby( + group_columns, as_index=False, observed=True).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) + tm.assert_frame_equal(result, expected) -def test_groupby_preserve_categories(): +def test_preserve_categories(): # GH-13179 categories = list('abc') @@ -439,8 +595,10 @@ def test_groupby_preserve_categories(): categories=categories, ordered=True)}) index = pd.CategoricalIndex(categories, categories, ordered=True) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) - tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + tm.assert_index_equal( + df.groupby('A', sort=True, observed=False).first().index, index) + tm.assert_index_equal( + df.groupby('A', sort=False, observed=False).first().index, index) # ordered=False df = DataFrame({'A': pd.Categorical(list('ba'), @@ -449,13 +607,15 @@ def test_groupby_preserve_categories(): sort_index = pd.CategoricalIndex(categories, categories, ordered=False) nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), ordered=False) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, - sort_index) - 
tm.assert_index_equal(df.groupby('A', sort=False).first().index, - nosort_index) + tm.assert_index_equal( + df.groupby('A', sort=True, observed=False).first().index, + sort_index) + tm.assert_index_equal( + df.groupby('A', sort=False, observed=False).first().index, + nosort_index) -def test_groupby_preserve_categorical_dtype(): +def test_preserve_categorical_dtype(): # GH13743, GH13854 df = DataFrame({'A': [1, 2, 1, 1, 2], 'B': [10, 16, 22, 28, 34], @@ -475,38 +635,22 @@ def test_groupby_preserve_categorical_dtype(): categories=list("bac"), ordered=True)}) for col in ['C1', 'C2']: - result1 = df.groupby(by=col, as_index=False).mean() - result2 = df.groupby(by=col, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + result1 = df.groupby(by=col, as_index=False, observed=False).mean() + result2 = df.groupby( + by=col, as_index=True, observed=False).mean().reset_index() expected = exp_full.reindex(columns=result1.columns) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) -def test_groupby_categorical_no_compress(): +def test_categorical_no_compress(): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean() + result = data.groupby(cats, observed=False).mean() + exp = data.groupby(codes, observed=False).mean() 
exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) @@ -515,8 +659,8 @@ def test_groupby_categorical_no_compress(): codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.categories) + result = data.groupby(cats, observed=False).mean() + exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) @@ -525,13 +669,34 @@ def test_groupby_categorical_no_compress(): categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - result = data.groupby("b").mean() + result = data.groupby("b", observed=False).mean() result = result["a"].values exp = np.array([1, 2, 4, np.nan]) tm.assert_numpy_array_equal(result, exp) -def test_groupby_sort_categorical(): +def test_sort(): + + # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby # noqa: flake8 + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=['value'], ascending=True) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), + right=False, labels=cat_labels) + + res = df.groupby(['value_group'], observed=False)['value_group'].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + exp.index = CategoricalIndex(exp.index, name=exp.index.name) + tm.assert_series_equal(res, exp) + + +def test_sort2(): # dataframe groupby sort was being ignored # GH 8868 df = DataFrame([['(7.5, 10]', 
10, 10], ['(7.5, 10]', 8, 20], @@ -543,35 +708,43 @@ def test_groupby_sort_categorical(): df['range'] = Categorical(df['range'], ordered=True) index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range', ordered=True) - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) + expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], + columns=['foo', 'bar'], index=index) col = 'range' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + result_sort = df.groupby(col, sort=True, observed=False).first() + assert_frame_equal(result_sort, expected_sort) + # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) + expected_sort = result_sort + result_sort = df.groupby(col, sort=False, observed=False).first() + assert_frame_equal(result_sort, expected_sort) df['range'] = Categorical(df['range'], ordered=False) index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range') - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) + expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], + columns=['foo', 'bar'], index=index) index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', '(0, 2.5]'], categories=['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', '(0, 2.5]'], name='range') - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - index=index, columns=['foo', 'bar']) + expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], + index=index, columns=['foo', 'bar']) col = 'range' + # this is an unordered categorical, but we allow this #### - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + result_sort = df.groupby(col, sort=True, observed=False).first() + assert_frame_equal(result_sort, expected_sort) + 
+ result_nosort = df.groupby(col, sort=False, observed=False).first() + assert_frame_equal(result_nosort, expected_nosort) -def test_groupby_sort_categorical_datetimelike(): +def test_sort_datetimelike(): # GH10505 # use same data as test_groupby_sort_categorical, which category is @@ -600,9 +773,12 @@ def test_groupby_sort_categorical_datetimelike(): name='dt', ordered=True) col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first()) + # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) + assert_frame_equal( + result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False df['dt'] = Categorical(df['dt'], ordered=False) @@ -620,65 +796,10 @@ def test_groupby_sort_categorical_datetimelike(): name='dt') col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - -def test_groupby_categorical_two_columns(): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - 
tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical([Interval(1, 2), Interval(2, 3), - Interval(3, 6)], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) + assert_frame_equal( + result_sort, df.groupby(col, sort=True, observed=False).first()) + assert_frame_equal( + result_nosort, df.groupby(col, sort=False, observed=False).first()) def test_empty_sum(): @@ -689,22 +810,22 @@ def test_empty_sum(): expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') # 0 by default - result = df.groupby("A").B.sum() + result = df.groupby("A", observed=False).B.sum() expected = pd.Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 - result = df.groupby("A").B.sum(min_count=0) + result = df.groupby("A", observed=False).B.sum(min_count=0) expected = pd.Series([3, 1, 0], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 - result = df.groupby("A").B.sum(min_count=1) + result = df.groupby("A", observed=False).B.sum(min_count=1) expected = pd.Series([3, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count>1 - result = df.groupby("A").B.sum(min_count=2) + result = df.groupby("A", observed=False).B.sum(min_count=2) expected = pd.Series([3, np.nan, np.nan], 
expected_idx, name='B') tm.assert_series_equal(result, expected) @@ -718,16 +839,16 @@ def test_empty_prod(): expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') # 1 by default - result = df.groupby("A").B.prod() + result = df.groupby("A", observed=False).B.prod() expected = pd.Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=0 - result = df.groupby("A").B.prod(min_count=0) + result = df.groupby("A", observed=False).B.prod(min_count=0) expected = pd.Series([2, 1, 1], expected_idx, name='B') tm.assert_series_equal(result, expected) # min_count=1 - result = df.groupby("A").B.prod(min_count=1) + result = df.groupby("A", observed=False).B.prod(min_count=1) expected = pd.Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index ba1371fe9f931..f1d678db4ff7f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -313,14 +313,14 @@ def test_cython_median(): tm.assert_frame_equal(rs, xp) -def test_median_empty_bins(): +def test_median_empty_bins(observed): df = pd.DataFrame(np.random.randint(0, 44, 500)) grps = range(0, 55, 5) bins = pd.cut(df[0], grps) - result = df.groupby(bins).median() - expected = df.groupby(bins).agg(lambda x: x.median()) + result = df.groupby(bins, observed=observed).median() + expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 743237f5b386c..c0f5c43b2fd35 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -251,7 +251,7 @@ def test_groupby_levels_and_columns(self): by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64) tm.assert_frame_equal(by_levels, by_columns) - def 
test_groupby_categorical_index_and_columns(self): + def test_groupby_categorical_index_and_columns(self, observed): # GH18432 columns = ['A', 'B', 'A', 'B'] categories = ['B', 'A'] @@ -260,17 +260,26 @@ def test_groupby_categorical_index_and_columns(self): categories=categories, ordered=True) df = DataFrame(data=data, columns=cat_columns) - result = df.groupby(axis=1, level=0).sum() + result = df.groupby(axis=1, level=0, observed=observed).sum() expected_data = 2 * np.ones((5, 2), int) - expected_columns = CategoricalIndex(categories, - categories=categories, - ordered=True) + + if observed: + # if we are not-observed we undergo a reindex + # so need to adjust the output as our expected sets us up + # to be non-observed + expected_columns = CategoricalIndex(['A', 'B'], + categories=categories, + ordered=True) + else: + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) expected = DataFrame(data=expected_data, columns=expected_columns) assert_frame_equal(result, expected) # test transposed version df = DataFrame(data.T, index=cat_columns) - result = df.groupby(axis=0, level=0).sum() + result = df.groupby(axis=0, level=0, observed=observed).sum() expected = DataFrame(data=expected_data.T, index=expected_columns) assert_frame_equal(result, expected) @@ -572,11 +581,11 @@ def test_get_group(self): pytest.raises(ValueError, lambda: g.get_group(('foo', 'bar', 'baz'))) - def test_get_group_empty_bins(self): + def test_get_group_empty_bins(self, observed): d = pd.DataFrame([3, 1, 7, 6]) bins = [0, 5, 10, 15] - g = d.groupby(pd.cut(d[0], bins)) + g = d.groupby(pd.cut(d[0], bins), observed=observed) # TODO: should prob allow a str of Interval work as well # IOW '(0, 5]' diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1004b40bfb4c1..76cdc1d2a195d 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -93,23 +93,24 @@ def test_pivot_table_dropna(self): def 
test_pivot_table_categorical(self): - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) - result = pd.pivot_table(df, values='values', index=['A', 'B']) - - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], + cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) + result = pd.pivot_table(df, values='values', index=['A', 'B'], + dropna=True) + + exp_index = pd.MultiIndex.from_arrays( + [cat1, cat2], names=['A', 'B']) expected = DataFrame( - {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]}, + {'values': [1, 2, 3, 4]}, index=exp_index) tm.assert_frame_equal(result, expected) - def test_pivot_table_dropna_categoricals(self): + @pytest.mark.parametrize('dropna', [True, False]) + def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -118,30 +119,23 @@ def test_pivot_table_dropna_categoricals(self): 'C': range(0, 9)}) df['A'] = df['A'].astype(CDT(categories, ordered=False)) - result_true = df.pivot_table(index='B', columns='A', values='C', - dropna=True) + result = df.pivot_table(index='B', columns='A', values='C', + dropna=dropna) expected_columns = Series(['a', 'b', 'c'], name='A') expected_columns = expected_columns.astype( CDT(categories, ordered=False)) expected_index = Series([1, 2, 3], name='B') - expected_true = DataFrame([[0.0, 3.0, 6.0], - [1.0, 4.0, 7.0], - [2.0, 5.0, 8.0]], - index=expected_index, - columns=expected_columns,) - tm.assert_frame_equal(expected_true, result_true) - - result_false = df.pivot_table(index='B', columns='A', 
values='C', - dropna=False) - expected_columns = ( - Series(['a', 'b', 'c', 'd'], name='A').astype('category') - ) - expected_false = DataFrame([[0.0, 3.0, 6.0, np.NaN], - [1.0, 4.0, 7.0, np.NaN], - [2.0, 5.0, 8.0, np.NaN]], - index=expected_index, - columns=expected_columns,) - tm.assert_frame_equal(expected_false, result_false) + expected = DataFrame([[0, 3, 6], + [1, 4, 7], + [2, 5, 8]], + index=expected_index, + columns=expected_columns,) + if not dropna: + # add back the non observed to compare + expected = expected.reindex( + columns=Categorical(categories)).astype('float') + + tm.assert_frame_equal(result, expected) def test_pass_array(self): result = self.data.pivot_table( @@ -1068,7 +1062,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self): @pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to ' 'ints)') - def test_categorical_margins(self): + def test_categorical_margins(self, observed): # GH 10989 df = pd.DataFrame({'x': np.arange(8), 'y': np.arange(8) // 4, @@ -1078,12 +1072,12 @@ def test_categorical_margins(self): expected.index = Index([0, 1, 'All'], name='y') expected.columns = Index([0, 1, 'All'], name='z') - table = df.pivot_table('x', 'y', 'z', margins=True) + table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) tm.assert_frame_equal(table, expected) @pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to ' 'ints)') - def test_categorical_margins_category(self): + def test_categorical_margins_category(self, observed): df = pd.DataFrame({'x': np.arange(8), 'y': np.arange(8) // 4, 'z': np.arange(8) % 2}) @@ -1094,16 +1088,17 @@ def test_categorical_margins_category(self): df.y = df.y.astype('category') df.z = df.z.astype('category') - table = df.pivot_table('x', 'y', 'z', margins=True) + table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True) tm.assert_frame_equal(table, expected) - def test_categorical_aggfunc(self): + def test_categorical_aggfunc(self, observed): # GH 9534 df 
= pd.DataFrame({"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]}) df["C1"] = df["C1"].astype("category") - result = df.pivot_table("V", index="C1", columns="C2", aggfunc="count") + result = df.pivot_table("V", index="C1", columns="C2", + dropna=observed, aggfunc="count") expected_index = pd.CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], @@ -1118,7 +1113,7 @@ def test_categorical_aggfunc(self): columns=expected_columns) tm.assert_frame_equal(result, expected) - def test_categorical_pivot_index_ordering(self): + def test_categorical_pivot_index_ordering(self, observed): # GH 8731 df = pd.DataFrame({'Sales': [100, 120, 220], 'Month': ['January', 'January', 'January'], @@ -1130,18 +1125,19 @@ def test_categorical_pivot_index_ordering(self): result = df.pivot_table(values='Sales', index='Month', columns='Year', + dropna=observed, aggfunc='sum') expected_columns = pd.Int64Index([2013, 2014], name='Year') - expected_index = pd.CategoricalIndex(months, + expected_index = pd.CategoricalIndex(['January'], categories=months, ordered=False, name='Month') - expected_data = np.empty((12, 2)) - expected_data.fill(np.nan) - expected_data[0, :] = [320., 120.] - expected = pd.DataFrame(expected_data, + expected = pd.DataFrame([[320, 120]], index=expected_index, columns=expected_columns) + if not observed: + result = result.dropna().astype(np.int64) + tm.assert_frame_equal(result, expected) def test_pivot_table_not_series(self):