Skip to content

Commit

Permalink
BUG: groupby with categorical and other columns
Browse files Browse the repository at this point in the history
closes #14942
  • Loading branch information
jreback committed Apr 20, 2018
1 parent bb095a6 commit df3533a
Show file tree
Hide file tree
Showing 6 changed files with 959 additions and 740 deletions.
35 changes: 35 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -518,6 +518,41 @@ If you wish to retain the old behavior while using Python >= 3.6, you can use
'Taxes': -200,
'Net result': 300}).sort_index()

.. _whatsnew_0230.api_breaking.categorical_grouping:

Categorical Groupers will now require passing the observed keyword
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In previous versions, grouping by one or more categorical columns would result in an index that was the Cartesian product of all of the categories for
each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward
compatible (generate a Cartesian product). Pandas will show a ``FutureWarning`` if the ``observed`` keyword is not passed; the default will
change to ``observed=True`` in the future. (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`)


.. ipython:: python

cat1 = pd.Categorical(["a", "a", "b", "b"],
categories=["a", "b", "z"], ordered=True)
cat2 = pd.Categorical(["c", "d", "c", "d"],
categories=["c", "d", "y"], ordered=True)
df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
df['C'] = ['foo', 'bar'] * 2
df

Previous Behavior (show all values):

.. ipython:: python

df.groupby(['A', 'B', 'C'], observed=False).count()


New Behavior (show only observed values):

.. ipython:: python

df.groupby(['A', 'B', 'C'], observed=True).count()

.. _whatsnew_0230.api_breaking.deprecate_panel:

Deprecate Panel
Expand Down
11 changes: 9 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6532,7 +6532,7 @@ def clip_lower(self, threshold, axis=None, inplace=False):
axis=axis, inplace=inplace)

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=False, **kwargs):
group_keys=True, squeeze=False, observed=None, **kwargs):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns.
Expand Down Expand Up @@ -6565,6 +6565,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
squeeze : boolean, default False
reduce the dimensionality of the return type if possible,
otherwise return a consistent type
observed : boolean, default None
if True: only show observed values for categorical groupers
if False: show all values for categorical groupers
if None: if any categorical groupers, show a FutureWarning,
default to False
.. versionadded:: 0.23.0
Returns
-------
Expand Down Expand Up @@ -6598,7 +6605,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
axis = self._get_axis_number(axis)
return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys, squeeze=squeeze,
**kwargs)
observed=observed, **kwargs)

def asfreq(self, freq, method=None, how=None, normalize=False,
fill_value=None):
Expand Down
73 changes: 56 additions & 17 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True, squeeze=False, **kwargs):
sort=True, group_keys=True, squeeze=False,
observed=None, **kwargs):

self._selection = selection

Expand All @@ -576,13 +577,15 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.sort = sort
self.group_keys = group_keys
self.squeeze = squeeze
self.observed = observed
self.mutated = kwargs.pop('mutated', False)

if grouper is None:
grouper, exclusions, obj = _get_grouper(obj, keys,
axis=axis,
level=level,
sort=sort,
observed=observed,
mutated=self.mutated)

self.obj = obj
Expand Down Expand Up @@ -2331,18 +2334,21 @@ def ngroups(self):
def recons_labels(self):
comp_ids, obs_ids, _ = self.group_info
labels = (ping.labels for ping in self.groupings)
return decons_obs_group_ids(comp_ids,
obs_ids, self.shape, labels, xnull=True)
return decons_obs_group_ids(
comp_ids, obs_ids, self.shape, labels, xnull=True)

@cache_readonly
def result_index(self):
if not self.compressed and len(self.groupings) == 1:
return self.groupings[0].group_index.rename(self.names[0])

return MultiIndex(levels=[ping.group_index for ping in self.groupings],
labels=self.recons_labels,
verify_integrity=False,
names=self.names)
labels = self.recons_labels
levels = [ping.group_index for ping in self.groupings]
result = MultiIndex(levels=levels,
labels=labels,
verify_integrity=False,
names=self.names)
return result

def get_group_levels(self):
if not self.compressed and len(self.groupings) == 1:
Expand Down Expand Up @@ -2883,6 +2889,7 @@ class Grouping(object):
obj :
name :
level :
observed : If we are a Categorical, use the observed values
in_axis : if the Grouping is a column in self.obj and hence among
Groupby.exclusions list
Expand All @@ -2898,14 +2905,15 @@ class Grouping(object):
"""

def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True, in_axis=False):
sort=True, observed=None, in_axis=False):

self.name = name
self.level = level
self.grouper = _convert_grouper(index, grouper)
self.index = index
self.sort = sort
self.obj = obj
self.observed = observed
self.in_axis = in_axis

# right place for this?
Expand Down Expand Up @@ -2954,16 +2962,34 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
elif is_categorical_dtype(self.grouper):

self.grouper = self.grouper._codes_for_groupby(self.sort)
codes = self.grouper.codes
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._labels = self.grouper.codes
self._labels = codes

# Use the observed values of the grouper if indicated
observed = self.observed
if observed is None:
msg = ("pass observed=True to ensure that a "
"categorical grouper only returns the "
"observed groupers, or\n"
"observed=False to return NA for non-observed"
"values\n")
warnings.warn(msg, FutureWarning, stacklevel=5)
observed = False

if observed:
codes = algorithms.unique1d(codes)
else:
codes = np.arange(len(categories))

c = self.grouper.categories
self._group_index = CategoricalIndex(
Categorical.from_codes(np.arange(len(c)),
categories=c,
ordered=self.grouper.ordered))
Categorical.from_codes(
codes=codes,
categories=categories,
ordered=self.grouper.ordered))

# we are done
if isinstance(self.grouper, Grouping):
Expand Down Expand Up @@ -3048,7 +3074,7 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
mutated=False, validate=True):
observed=None, mutated=False, validate=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
Expand All @@ -3065,6 +3091,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
are and then creates a Grouping for each one, combined into
a BaseGrouper.
If observed & we have a categorical grouper, only show the observed
values
If validate, then check for key/level overlaps
"""
Expand Down Expand Up @@ -3243,6 +3272,7 @@ def is_in_obj(gpr):
name=name,
level=level,
sort=sort,
observed=observed,
in_axis=in_axis) \
if not isinstance(gpr, Grouping) else gpr

Expand Down Expand Up @@ -4154,7 +4184,7 @@ def first_not_none(values):
not_indexed_same=not_indexed_same)
elif self.grouper.groupings is not None:
if len(self.grouper.groupings) > 1:
key_index = MultiIndex.from_tuples(keys, names=key_names)
key_index = self.grouper.result_index

else:
ping = self.grouper.groupings[0]
Expand Down Expand Up @@ -4244,8 +4274,9 @@ def first_not_none(values):

# normally use vstack as its faster than concat
# and if we have mi-columns
if isinstance(v.index,
MultiIndex) or key_index is None:
if (isinstance(v.index, MultiIndex) or
key_index is None or
isinstance(key_index, MultiIndex)):
stacked_values = np.vstack(map(np.asarray, values))
result = DataFrame(stacked_values, index=key_index,
columns=index)
Expand Down Expand Up @@ -4696,6 +4727,14 @@ def _reindex_output(self, result):
This can re-expand the output space
"""

# TODO(jreback): remove completely
# when observed parameter is defaulted to True
# gh-20583

if self.observed:
return result

groupings = self.grouper.groupings
if groupings is None:
return result
Expand Down
25 changes: 17 additions & 8 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
pass
values = list(values)

grouped = data.groupby(keys)
grouped = data.groupby(keys, observed=dropna)
agged = grouped.agg(aggfunc)

table = agged
Expand Down Expand Up @@ -241,10 +241,13 @@ def _all_key(key):
return (key, margins_name) + ('',) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows + values].groupby(rows).agg(aggfunc)
margin = data[rows + values].groupby(
rows, observed=True).agg(aggfunc)
cat_axis = 1

for key, piece in table.groupby(level=0, axis=cat_axis):
for key, piece in table.groupby(level=0,
axis=cat_axis,
observed=True):
all_key = _all_key(key)

# we are going to mutate this, so need to copy!
Expand All @@ -264,7 +267,9 @@ def _all_key(key):
else:
margin = grand_margin
cat_axis = 0
for key, piece in table.groupby(level=0, axis=cat_axis):
for key, piece in table.groupby(level=0,
axis=cat_axis,
observed=True):
all_key = _all_key(key)
table_pieces.append(piece)
table_pieces.append(Series(margin[key], index=[all_key]))
Expand All @@ -279,7 +284,8 @@ def _all_key(key):
margin_keys = table.columns

if len(cols) > 0:
row_margin = data[cols + values].groupby(cols).agg(aggfunc)
row_margin = data[cols + values].groupby(
cols, observed=True).agg(aggfunc)
row_margin = row_margin.stack()

# slight hack
Expand All @@ -304,14 +310,17 @@ def _all_key():
return (margins_name, ) + ('', ) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows].groupby(rows).apply(aggfunc)
margin = data[rows].groupby(rows,
observed=True).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
margin_keys.append(all_key)

else:
margin = data.groupby(level=0, axis=0).apply(aggfunc)
margin = data.groupby(level=0,
axis=0,
observed=True).apply(aggfunc)
all_key = _all_key()
table[all_key] = margin
result = table
Expand All @@ -322,7 +331,7 @@ def _all_key():
margin_keys = table.columns

if len(cols):
row_margin = data[cols].groupby(cols).apply(aggfunc)
row_margin = data[cols].groupby(cols, observed=True).apply(aggfunc)
else:
row_margin = Series(np.nan, index=result.columns)

Expand Down
Loading

0 comments on commit df3533a

Please sign in to comment.