diff --git a/pandas/conftest.py b/pandas/conftest.py index cf83904e4fa13..c4aab1b632b00 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,12 +66,14 @@ def ip(): return InteractiveShell() -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True, False, None]) def observed(request): """ pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for - values which are not in the grouper [False], or only values which - appear in the grouper [True] """ + values which are not in the grouper [False / None], or only values which + appear in the grouper [True]. [None] is supported for future compatibility + if we decide to change the default (and would need to warn if this + parameter is not passed)""" return request.param diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 63f0b742eb8b3..8613ab4d8c59d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -557,7 +557,7 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True, squeeze=False, - observed=False, **kwargs): + observed=None, **kwargs): self._selection = selection @@ -2907,7 +2907,7 @@ class Grouping(object): """ def __init__(self, index, grouper=None, obj=None, name=None, level=None, - sort=True, observed=False, in_axis=False): + sort=True, observed=None, in_axis=False): self.name = name self.level = level @@ -2964,6 +2964,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): + # observed can be True/False/None + # we treat None as False. 
If in the future + # we need to warn if observed is not passed + # then we have this option + # gh-20583 + self.all_grouper = self.grouper self.grouper = self.grouper._codes_for_groupby( self.sort, observed) @@ -3082,7 +3088,7 @@ def groups(self): def _get_grouper(obj, key=None, axis=0, level=None, sort=True, - observed=False, mutated=False, validate=True): + observed=None, mutated=False, validate=True): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 923a79b6a7720..e0793b8e1bd64 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -395,6 +395,26 @@ def test_observed_perf(): assert result.index.levels[2].nunique() == df.other_id.nunique() +def test_observed_groups(observed): + # gh-20583 + # test that we have the appropriate groups + + cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c']) + df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]}) + g = df.groupby('cat', observed=observed) + + result = g.groups + if observed: + expected = {'a': Index([0, 2], dtype='int64'), + 'c': Index([1], dtype='int64')} + else: + expected = {'a': Index([0, 2], dtype='int64'), + 'b': Index([], dtype='int64'), + 'c': Index([1], dtype='int64')} + + tm.assert_dict_equal(result, expected) + + def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4)