From b020891dbd3c53028cdfef4ce990dced7283dc3b Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Tue, 1 May 2018 11:09:09 -0400
Subject: [PATCH] API: categorical grouping will no longer return the cartesian
 product (#20583)

* BUG: groupby with categorical and other columns

closes #14942
---
 doc/source/groupby.rst                        |  74 +-
 doc/source/whatsnew/v0.23.0.txt               |  52 ++
 pandas/conftest.py                            |  11 +
 pandas/core/arrays/categorical.py             |  31 +-
 pandas/core/generic.py                        |  11 +-
 pandas/core/groupby/groupby.py                |  99 ++-
 pandas/core/indexes/category.py               |   4 +-
 pandas/core/reshape/pivot.py                  |  34 +-
 pandas/tests/frame/test_sorting.py            |   2 +-
 pandas/tests/groupby/aggregate/test_cython.py |  23 +-
 pandas/tests/groupby/aggregate/test_other.py  |   6 +-
 pandas/tests/groupby/test_categorical.py      | 705 ++++++++++--------
 pandas/tests/groupby/test_function.py         |   6 +-
 pandas/tests/groupby/test_grouping.py         |  25 +-
 pandas/tests/reshape/test_pivot.py            |  84 +--
 15 files changed, 748 insertions(+), 419 deletions(-)

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 407fad39ba232..3616a7e1b41d2 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -91,10 +91,10 @@ The mapping can be specified many different ways:
   - A Python function, to be called on each of the axis labels.
   - A list or NumPy array of the same length as the selected axis.
   - A dict or ``Series``, providing a ``label -> group name`` mapping.
-  - For ``DataFrame`` objects, a string indicating a column to be used to group. 
+  - For ``DataFrame`` objects, a string indicating a column to be used to group.
     Of course ``df.groupby('A')`` is just syntactic sugar for
     ``df.groupby(df['A'])``, but it makes life simpler.
-  - For ``DataFrame`` objects, a string indicating an index level to be used to 
+  - For ``DataFrame`` objects, a string indicating an index level to be used to
     group.
   - A list of any of the above things.
 
@@ -120,7 +120,7 @@ consider the following ``DataFrame``:
                       'D' : np.random.randn(8)})
    df
 
-On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. 
+On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`.
 We could naturally group by either the ``A`` or ``B`` columns, or both:
 
 .. ipython:: python
@@ -360,8 +360,8 @@ Index level names may be specified as keys directly to ``groupby``.
 DataFrame column selection in GroupBy
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Once you have created the GroupBy object from a DataFrame, you might want to do 
-something different for each of the columns. Thus, using ``[]`` similar to 
+Once you have created the GroupBy object from a DataFrame, you might want to do
+something different for each of the columns. Thus, using ``[]`` similar to
 getting a column from a DataFrame, you can do:
 
 .. ipython:: python
@@ -421,7 +421,7 @@ statement if you wish: ``for (k1, k2), group in grouped:``.
 Selecting a group
 -----------------
 
-A single group can be selected using 
+A single group can be selected using
 :meth:`~pandas.core.groupby.DataFrameGroupBy.get_group`:
 
 .. ipython:: python
@@ -444,8 +444,8 @@ perform a computation on the grouped data. These operations are similar to the
 :ref:`aggregating API <basics.aggregate>`, :ref:`window functions API <stats.aggregate>`,
 and :ref:`resample API <timeseries.aggregate>`.
 
-An obvious one is aggregation via the 
-:meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` or equivalently 
+An obvious one is aggregation via the
+:meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` or equivalently
 :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` method:
 
 .. ipython:: python
@@ -517,12 +517,12 @@ Some common aggregating functions are tabulated below:
 	:meth:`~pd.core.groupby.DataFrameGroupBy.nth`;Take nth value, or a subset if n is a list
 	:meth:`~pd.core.groupby.DataFrameGroupBy.min`;Compute min of group values
 	:meth:`~pd.core.groupby.DataFrameGroupBy.max`;Compute max of group values
-	
 
-The aggregating functions above will exclude NA values. Any function which 
+
+The aggregating functions above will exclude NA values. Any function which
 reduces a :class:`Series` to a scalar value is an aggregation function and will work,
 a trivial example is ``df.groupby('A').agg(lambda ser: 1)``. Note that
-:meth:`~pd.core.groupby.DataFrameGroupBy.nth` can act as a reducer *or* a 
+:meth:`~pd.core.groupby.DataFrameGroupBy.nth` can act as a reducer *or* a
 filter, see :ref:`here <groupby.nth>`.
 
 .. _groupby.aggregate.multifunc:
@@ -732,7 +732,7 @@ and that the transformed data contains no NAs.
 .. note::
 
    Some functions will automatically transform the input when applied to a
-   GroupBy object, but returning an object of the same shape as the original. 
+   GroupBy object, but returning an object of the same shape as the original.
    Passing ``as_index=False`` will not affect these transformation methods.
 
    For example: ``fillna, ffill, bfill, shift.``.
@@ -926,7 +926,7 @@ The dimension of the returned result can also change:
 
     In [11]: grouped.apply(f)
 
-``apply`` on a Series can operate on a returned value from the applied function, 
+``apply`` on a Series can operate on a returned value from the applied function,
 that is itself a series, and possibly upcast the result to a DataFrame:
 
 .. ipython:: python
@@ -984,20 +984,48 @@ will be (silently) dropped. Thus, this does not pose any problems:
 
    df.groupby('A').std()
 
-Note that ``df.groupby('A').colname.std().`` is more efficient than 
+Note that ``df.groupby('A').colname.std()`` is more efficient than
 ``df.groupby('A').std().colname``, so if the result of an aggregation function
-is only interesting over one column (here ``colname``), it may be filtered 
+is only interesting over one column (here ``colname``), it may be filtered
 *before* applying the aggregation function.
 
+.. _groupby.observed:
+
+Handling of (un)observed Categorical values
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When using a ``Categorical`` grouper (as a single grouper or as part of multiple groupers), the ``observed`` keyword
+controls whether to return a cartesian product of all possible grouper values (``observed=False``) or only those
+values that are actually observed (``observed=True``).
+
+Show all values:
+
+.. ipython:: python
+
+   pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count()
+
+Show only the observed values:
+
+.. ipython:: python
+
+   pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=True).count()
+
+The returned dtype of the grouped result will *always* include *all* of the categories that were grouped.
+
+.. ipython:: python
+
+   s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], categories=['a', 'b']), observed=False).count()
+   s.index.dtype
+
 .. _groupby.missing:
 
 NA and NaT group handling
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-If there are any NaN or NaT values in the grouping key, these will be 
-automatically excluded. In other words, there will never be an "NA group" or 
-"NaT group". This was not the case in older versions of pandas, but users were 
-generally discarding the NA group anyway (and supporting it was an 
+If there are any NaN or NaT values in the grouping key, these will be
+automatically excluded. In other words, there will never be an "NA group" or
+"NaT group". This was not the case in older versions of pandas, but users were
+generally discarding the NA group anyway (and supporting it was an
 implementation headache).
 
 Grouping with ordered factors
@@ -1084,8 +1112,8 @@ This shows the first or last n rows from each group.
 Taking the nth row of each group
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-To select from a DataFrame or Series the nth item, use 
-:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and 
+To select from a DataFrame or Series the nth item, use
+:meth:`~pd.core.groupby.DataFrameGroupBy.nth`. This is a reduction method, and
 will return a single row (or no row) per group if you pass an int for n:
 
 .. ipython:: python
@@ -1153,7 +1181,7 @@ Enumerate groups
 .. versionadded:: 0.20.2
 
 To see the ordering of the groups (as opposed to the order of rows
-within a group given by ``cumcount``) you can use 
+within a group given by ``cumcount``) you can use
 :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`.
 
 
@@ -1273,7 +1301,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
 Multi-column factorization
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract 
+By using :meth:`~pandas.core.groupby.DataFrameGroupBy.ngroup`, we can extract
 information about the groups in a way similar to :func:`factorize` (as described
 further in the :ref:`reshaping API <reshaping.factorize>`) but which applies
 naturally to multiple columns of mixed type and different
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index 7ea10deb65cef..d3bb28c2aee65 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -396,6 +396,58 @@ documentation. If you build an extension array, publicize it on our
 
 .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest/
 
+.. _whatsnew_0230.enhancements.categorical_grouping:
+
+Categorical Groupers have gained an observed keyword
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, grouping by 1 or more categorical columns would result in an index that was the cartesian product of all of the categories for
+each grouper, not just the observed values. ``.groupby()`` has gained the ``observed`` keyword to toggle this behavior. The default remains backward
+compatible (generate a cartesian product). (:issue:`14942`, :issue:`8138`, :issue:`15217`, :issue:`17594`, :issue:`8669`, :issue:`20583`)
+
+
+.. ipython:: python
+
+   cat1 = pd.Categorical(["a", "a", "b", "b"],
+                         categories=["a", "b", "z"], ordered=True)
+   cat2 = pd.Categorical(["c", "d", "c", "d"],
+                         categories=["c", "d", "y"], ordered=True)
+   df = pd.DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+   df['C'] = ['foo', 'bar'] * 2
+   df
+
+To show all values, the previous behavior:
+
+.. ipython:: python
+
+   df.groupby(['A', 'B', 'C'], observed=False).count()
+
+
+To show only observed values:
+
+.. ipython:: python
+
+   df.groupby(['A', 'B', 'C'], observed=True).count()
+
+For pivoting operations, this behavior is *already* controlled by the ``dropna`` keyword:
+
+.. ipython:: python
+
+   cat1 = pd.Categorical(["a", "a", "b", "b"],
+                         categories=["a", "b", "z"], ordered=True)
+   cat2 = pd.Categorical(["c", "d", "c", "d"],
+                         categories=["c", "d", "y"], ordered=True)
+   df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+   df
+
+.. ipython:: python
+
+   pd.pivot_table(df, values='values', index=['A', 'B'],
+                  dropna=True)
+   pd.pivot_table(df, values='values', index=['A', 'B'],
+                  dropna=False)
+
+
 .. _whatsnew_0230.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 559b5e44631b6..c4aab1b632b00 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -66,6 +66,17 @@ def ip():
     return InteractiveShell()
 
 
+@pytest.fixture(params=[True, False, None])
+def observed(request):
+    """ pass in the observed keyword to groupby for [True, False]
+    This indicates whether categoricals should return values for
+    values which are not in the grouper [False / None], or only values which
+    appear in the grouper [True]. [None] is supported for future compatibility
+    if we decide to change the default (and would need to warn if this
+    parameter is not passed)"""
+    return request.param
+
+
 @pytest.fixture(params=[None, 'gzip', 'bz2', 'zip',
                         pytest.param('xz', marks=td.skip_if_no_lzma)])
 def compression(request):
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index 517c21cc1bc3a..f91782459df67 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -647,8 +647,13 @@ def _set_categories(self, categories, fastpath=False):
 
         self._dtype = new_dtype
 
-    def _codes_for_groupby(self, sort):
+    def _codes_for_groupby(self, sort, observed):
         """
+        Code the categories to ensure we can groupby for categoricals.
+
+        If observed=True, we return a new Categorical with the observed
+        categories only.
+
         If sort=False, return a copy of self, coded with categories as
         returned by .unique(), followed by any categories not appearing in
         the data. If sort=True, return self.
@@ -661,6 +666,8 @@ def _codes_for_groupby(self, sort):
         ----------
         sort : boolean
             The value of the sort parameter groupby was called with.
+        observed : boolean
+            Account only for the observed values
 
         Returns
         -------
@@ -671,6 +678,26 @@ def _codes_for_groupby(self, sort):
             categories in the original order.
         """
 
+        # we only care about observed values
+        if observed:
+            unique_codes = unique1d(self.codes)
+            cat = self.copy()
+
+            take_codes = unique_codes[unique_codes != -1]
+            if self.ordered:
+                take_codes = np.sort(take_codes)
+
+            # we recode according to the uniques
+            categories = self.categories.take(take_codes)
+            codes = _recode_for_categories(self.codes,
+                                           self.categories,
+                                           categories)
+
+            # return a new categorical that maps our new codes
+            # and categories
+            dtype = CategoricalDtype(categories, ordered=self.ordered)
+            return type(self)(codes, dtype=dtype, fastpath=True)
+
         # Already sorted according to self.categories; all is fine
         if sort:
             return self
@@ -2161,7 +2188,7 @@ def unique(self):
         # exclude nan from indexer for categories
         take_codes = unique_codes[unique_codes != -1]
         if self.ordered:
-            take_codes = sorted(take_codes)
+            take_codes = np.sort(take_codes)
         return cat.set_categories(cat.categories.take(take_codes))
 
     def _values_for_factorize(self):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index af19acbb416ee..e68662037b43d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -6599,7 +6599,7 @@ def clip_lower(self, threshold, axis=None, inplace=False):
                                          axis=axis, inplace=inplace)
 
     def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
-                group_keys=True, squeeze=False, **kwargs):
+                group_keys=True, squeeze=False, observed=None, **kwargs):
         """
         Group series using mapper (dict or key function, apply given function
         to group, return result as series) or by a series of columns.
@@ -6632,6 +6632,13 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
         squeeze : boolean, default False
             reduce the dimensionality of the return type if possible,
             otherwise return a consistent type
+        observed : boolean, default None
+            if True: only show observed values for categorical groupers.
+            if False: show all values for categorical groupers.
+            if None: treated as False (show all values); reserved so that a
+                FutureWarning can be added if the default changes.
+
+            .. versionadded:: 0.23.0
 
         Returns
         -------
@@ -6665,7 +6672,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
         axis = self._get_axis_number(axis)
         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
                        sort=sort, group_keys=group_keys, squeeze=squeeze,
-                       **kwargs)
+                       observed=observed, **kwargs)
 
     def asfreq(self, freq, method=None, how=None, normalize=False,
                fill_value=None):
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 8c20d62117e25..8613ab4d8c59d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin):
 
     def __init__(self, obj, keys=None, axis=0, level=None,
                  grouper=None, exclusions=None, selection=None, as_index=True,
-                 sort=True, group_keys=True, squeeze=False, **kwargs):
+                 sort=True, group_keys=True, squeeze=False,
+                 observed=None, **kwargs):
 
         self._selection = selection
 
@@ -576,6 +577,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
         self.sort = sort
         self.group_keys = group_keys
         self.squeeze = squeeze
+        self.observed = observed
         self.mutated = kwargs.pop('mutated', False)
 
         if grouper is None:
@@ -583,6 +585,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
                                                     axis=axis,
                                                     level=level,
                                                     sort=sort,
+                                                    observed=observed,
                                                     mutated=self.mutated)
 
         self.obj = obj
@@ -1661,10 +1664,11 @@ def nth(self, n, dropna=None):
 
         if dropna not in ['any', 'all']:
             if isinstance(self._selected_obj, Series) and dropna is True:
-                warnings.warn("the dropna='%s' keyword is deprecated,"
+                warnings.warn("the dropna={dropna} keyword is deprecated,"
                               "use dropna='all' instead. "
                               "For a Series groupby, dropna must be "
-                              "either None, 'any' or 'all'." % (dropna),
+                              "either None, 'any' or 'all'.".format(
+                                  dropna=dropna),
                               FutureWarning,
                               stacklevel=2)
                 dropna = 'all'
@@ -2331,27 +2335,30 @@ def ngroups(self):
     def recons_labels(self):
         comp_ids, obs_ids, _ = self.group_info
         labels = (ping.labels for ping in self.groupings)
-        return decons_obs_group_ids(comp_ids,
-                                    obs_ids, self.shape, labels, xnull=True)
+        return decons_obs_group_ids(
+            comp_ids, obs_ids, self.shape, labels, xnull=True)
 
     @cache_readonly
     def result_index(self):
         if not self.compressed and len(self.groupings) == 1:
-            return self.groupings[0].group_index.rename(self.names[0])
-
-        return MultiIndex(levels=[ping.group_index for ping in self.groupings],
-                          labels=self.recons_labels,
-                          verify_integrity=False,
-                          names=self.names)
+            return self.groupings[0].result_index.rename(self.names[0])
+
+        labels = self.recons_labels
+        levels = [ping.result_index for ping in self.groupings]
+        result = MultiIndex(levels=levels,
+                            labels=labels,
+                            verify_integrity=False,
+                            names=self.names)
+        return result
 
     def get_group_levels(self):
         if not self.compressed and len(self.groupings) == 1:
-            return [self.groupings[0].group_index]
+            return [self.groupings[0].result_index]
 
         name_list = []
         for ping, labels in zip(self.groupings, self.recons_labels):
             labels = _ensure_platform_int(labels)
-            levels = ping.group_index.take(labels)
+            levels = ping.result_index.take(labels)
 
             name_list.append(levels)
 
@@ -2883,6 +2890,8 @@ class Grouping(object):
     obj :
     name :
     level :
+    observed : boolean, default None
+        If we are a Categorical, use the observed values
     in_axis : if the Grouping is a column in self.obj and hence among
         Groupby.exclusions list
 
@@ -2898,14 +2907,16 @@ class Grouping(object):
     """
 
     def __init__(self, index, grouper=None, obj=None, name=None, level=None,
-                 sort=True, in_axis=False):
+                 sort=True, observed=None, in_axis=False):
 
         self.name = name
         self.level = level
         self.grouper = _convert_grouper(index, grouper)
+        self.all_grouper = None
         self.index = index
         self.sort = sort
         self.obj = obj
+        self.observed = observed
         self.in_axis = in_axis
 
         # right place for this?
@@ -2953,17 +2964,30 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             # a passed Categorical
             elif is_categorical_dtype(self.grouper):
 
-                self.grouper = self.grouper._codes_for_groupby(self.sort)
+                # observed can be True/False/None
+                # we treat None as False. If in the future
+                # we need to warn if observed is not passed
+                # then we have this option
+                # gh-20583
+
+                self.all_grouper = self.grouper
+                self.grouper = self.grouper._codes_for_groupby(
+                    self.sort, observed)
+                categories = self.grouper.categories
 
                 # we make a CategoricalIndex out of the cat grouper
                 # preserving the categories / ordered attributes
                 self._labels = self.grouper.codes
+                if observed:
+                    codes = algorithms.unique1d(self.grouper.codes)
+                else:
+                    codes = np.arange(len(categories))
 
-                c = self.grouper.categories
                 self._group_index = CategoricalIndex(
-                    Categorical.from_codes(np.arange(len(c)),
-                                           categories=c,
-                                           ordered=self.grouper.ordered))
+                    Categorical.from_codes(
+                        codes=codes,
+                        categories=categories,
+                        ordered=self.grouper.ordered))
 
             # we are done
             if isinstance(self.grouper, Grouping):
@@ -3022,6 +3046,22 @@ def labels(self):
             self._make_labels()
         return self._labels
 
+    @cache_readonly
+    def result_index(self):
+        if self.all_grouper is not None:
+            all_categories = self.all_grouper.categories
+
+            # we re-order to the original category orderings
+            if self.sort:
+                return self.group_index.set_categories(all_categories)
+
+            # we are not sorting, so add unobserved to the end
+            categories = self.group_index.categories
+            return self.group_index.add_categories(
+                all_categories[~all_categories.isin(categories)])
+
+        return self.group_index
+
     @property
     def group_index(self):
         if self._group_index is None:
@@ -3048,7 +3088,7 @@ def groups(self):
 
 
 def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
-                 mutated=False, validate=True):
+                 observed=None, mutated=False, validate=True):
     """
     create and return a BaseGrouper, which is an internal
     mapping of how to create the grouper indexers.
@@ -3065,6 +3105,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
     are and then creates a Grouping for each one, combined into
     a BaseGrouper.
 
+    If observed & we have a categorical grouper, only show the observed
+    values
+
     If validate, then check for key/level overlaps
 
     """
@@ -3243,6 +3286,7 @@ def is_in_obj(gpr):
                         name=name,
                         level=level,
                         sort=sort,
+                        observed=observed,
                         in_axis=in_axis) \
             if not isinstance(gpr, Grouping) else gpr
 
@@ -4154,7 +4198,7 @@ def first_not_none(values):
                                         not_indexed_same=not_indexed_same)
         elif self.grouper.groupings is not None:
             if len(self.grouper.groupings) > 1:
-                key_index = MultiIndex.from_tuples(keys, names=key_names)
+                key_index = self.grouper.result_index
 
             else:
                 ping = self.grouper.groupings[0]
@@ -4244,8 +4288,9 @@ def first_not_none(values):
 
                         # normally use vstack as its faster than concat
                         # and if we have mi-columns
-                        if isinstance(v.index,
-                                      MultiIndex) or key_index is None:
+                        if (isinstance(v.index, MultiIndex) or
+                                key_index is None or
+                                isinstance(key_index, MultiIndex)):
                             stacked_values = np.vstack(map(np.asarray, values))
                             result = DataFrame(stacked_values, index=key_index,
                                                columns=index)
@@ -4696,6 +4741,14 @@ def _reindex_output(self, result):
 
         This can re-expand the output space
         """
+
+        # TODO(jreback): remove completely
+        # when observed parameter is defaulted to True
+        # gh-20583
+
+        if self.observed:
+            return result
+
         groupings = self.grouper.groupings
         if groupings is None:
             return result
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 71caa098c7a28..3ffef5804acf7 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -782,9 +782,9 @@ def _concat_same_dtype(self, to_concat, name):
         result.name = name
         return result
 
-    def _codes_for_groupby(self, sort):
+    def _codes_for_groupby(self, sort, observed):
         """ Return a Categorical adjusted for groupby """
-        return self.values._codes_for_groupby(sort)
+        return self.values._codes_for_groupby(sort, observed)
 
     @classmethod
     def _add_comparison_methods(cls):
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 74a9b59d3194a..39fb57e68c9c0 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -79,7 +79,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                 pass
         values = list(values)
 
-    grouped = data.groupby(keys)
+    grouped = data.groupby(keys, observed=dropna)
     agged = grouped.agg(aggfunc)
 
     table = agged
@@ -120,6 +120,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
             data = data[data.notna().all(axis=1)]
         table = _add_margins(table, data, values, rows=index,
                              cols=columns, aggfunc=aggfunc,
+                             observed=dropna,
                              margins_name=margins_name, fill_value=fill_value)
 
     # discard the top level
@@ -138,7 +139,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
 
 
 def _add_margins(table, data, values, rows, cols, aggfunc,
-                 margins_name='All', fill_value=None):
+                 observed=None, margins_name='All', fill_value=None):
     if not isinstance(margins_name, compat.string_types):
         raise ValueError('margins_name argument must be a string')
 
@@ -168,6 +169,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
     if values:
         marginal_result_set = _generate_marginal_results(table, data, values,
                                                          rows, cols, aggfunc,
+                                                         observed,
                                                          grand_margin,
                                                          margins_name)
         if not isinstance(marginal_result_set, tuple):
@@ -175,7 +177,7 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
         result, margin_keys, row_margin = marginal_result_set
     else:
         marginal_result_set = _generate_marginal_results_without_values(
-            table, data, rows, cols, aggfunc, margins_name)
+            table, data, rows, cols, aggfunc, observed, margins_name)
         if not isinstance(marginal_result_set, tuple):
             return marginal_result_set
         result, margin_keys, row_margin = marginal_result_set
@@ -230,6 +232,7 @@ def _compute_grand_margin(data, values, aggfunc,
 
 
 def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
+                               observed,
                                grand_margin,
                                margins_name='All'):
     if len(cols) > 0:
@@ -241,10 +244,13 @@ def _all_key(key):
             return (key, margins_name) + ('',) * (len(cols) - 1)
 
         if len(rows) > 0:
-            margin = data[rows + values].groupby(rows).agg(aggfunc)
+            margin = data[rows + values].groupby(
+                rows, observed=observed).agg(aggfunc)
             cat_axis = 1
 
-            for key, piece in table.groupby(level=0, axis=cat_axis):
+            for key, piece in table.groupby(level=0,
+                                            axis=cat_axis,
+                                            observed=observed):
                 all_key = _all_key(key)
 
                 # we are going to mutate this, so need to copy!
@@ -264,7 +270,9 @@ def _all_key(key):
         else:
             margin = grand_margin
             cat_axis = 0
-            for key, piece in table.groupby(level=0, axis=cat_axis):
+            for key, piece in table.groupby(level=0,
+                                            axis=cat_axis,
+                                            observed=observed):
                 all_key = _all_key(key)
                 table_pieces.append(piece)
                 table_pieces.append(Series(margin[key], index=[all_key]))
@@ -279,7 +287,8 @@ def _all_key(key):
         margin_keys = table.columns
 
     if len(cols) > 0:
-        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
+        row_margin = data[cols + values].groupby(
+            cols, observed=observed).agg(aggfunc)
         row_margin = row_margin.stack()
 
         # slight hack
@@ -293,7 +302,7 @@ def _all_key(key):
 
 def _generate_marginal_results_without_values(
         table, data, rows, cols, aggfunc,
-        margins_name='All'):
+        observed, margins_name='All'):
     if len(cols) > 0:
         # need to "interleave" the margins
         margin_keys = []
@@ -304,14 +313,17 @@ def _all_key():
             return (margins_name, ) + ('', ) * (len(cols) - 1)
 
         if len(rows) > 0:
-            margin = data[rows].groupby(rows).apply(aggfunc)
+            margin = data[rows].groupby(rows,
+                                        observed=observed).apply(aggfunc)
             all_key = _all_key()
             table[all_key] = margin
             result = table
             margin_keys.append(all_key)
 
         else:
-            margin = data.groupby(level=0, axis=0).apply(aggfunc)
+            margin = data.groupby(level=0,
+                                  axis=0,
+                                  observed=observed).apply(aggfunc)
             all_key = _all_key()
             table[all_key] = margin
             result = table
@@ -322,7 +334,7 @@ def _all_key():
         margin_keys = table.columns
 
     if len(cols):
-        row_margin = data[cols].groupby(cols).apply(aggfunc)
+        row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
     else:
         row_margin = Series(np.nan, index=result.columns)
 
diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py
index 5bd239f8a3034..b60eb89e87da5 100644
--- a/pandas/tests/frame/test_sorting.py
+++ b/pandas/tests/frame/test_sorting.py
@@ -573,7 +573,7 @@ def test_sort_index_intervalindex(self):
                     bins=[-3, -0.5, 0, 0.5, 3])
         model = pd.concat([y, x1, x2], axis=1, keys=['Y', 'X1', 'X2'])
 
-        result = model.groupby(['X1', 'X2']).mean().unstack()
+        result = model.groupby(['X1', 'X2'], observed=True).mean().unstack()
         expected = IntervalIndex.from_tuples(
             [(-3.0, -0.5), (-0.5, 0.0),
              (0.0, 0.5), (0.5, 3.0)],
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 80383c895a5e5..48a45e93e1e8e 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -158,35 +158,46 @@ def test__cython_agg_general(op, targop):
     ('min', np.min),
     ('max', np.max), ]
 )
-def test_cython_agg_empty_buckets(op, targop):
+def test_cython_agg_empty_buckets(op, targop, observed):
     df = pd.DataFrame([11, 12, 13])
     grps = range(0, 55, 5)
 
     # calling _cython_agg_general directly, instead of via the user API
     # which sets different values for min_count, so do that here.
-    result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
-    expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
+    g = df.groupby(pd.cut(df[0], grps), observed=observed)
+    result = g._cython_agg_general(op)
+
+    g = df.groupby(pd.cut(df[0], grps), observed=observed)
+    expected = g.agg(lambda x: targop(x))
     tm.assert_frame_equal(result, expected)
 
 
-def test_cython_agg_empty_buckets_nanops():
+def test_cython_agg_empty_buckets_nanops(observed):
     # GH-18869 can't call nanops on empty groups, so hardcode expected
     # for these
     df = pd.DataFrame([11, 12, 13], columns=['a'])
     grps = range(0, 25, 5)
     # add / sum
-    result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
+    result = df.groupby(pd.cut(df['a'], grps),
+                        observed=observed)._cython_agg_general('add')
     intervals = pd.interval_range(0, 20, freq=5)
     expected = pd.DataFrame(
         {"a": [0, 0, 36, 0]},
         index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+    if observed:
+        expected = expected[expected.a != 0]
+
     tm.assert_frame_equal(result, expected)
 
     # prod
-    result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
+    result = df.groupby(pd.cut(df['a'], grps),
+                        observed=observed)._cython_agg_general('prod')
     expected = pd.DataFrame(
         {"a": [1, 1, 1716, 1]},
         index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+    if observed:
+        expected = expected[expected.a != 1]
+
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index a10f7f6e46210..34489051efc18 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -488,15 +488,17 @@ def test_agg_structs_series(structure, expected):
 
 
 @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
-def test_agg_category_nansum():
+def test_agg_category_nansum(observed):
     categories = ['a', 'b', 'c']
     df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
                                            categories=categories),
                        'B': [1, 2, 3]})
-    result = df.groupby("A").B.agg(np.nansum)
+    result = df.groupby("A", observed=observed).B.agg(np.nansum)
     expected = pd.Series([3, 3, 0],
                          index=pd.CategoricalIndex(['a', 'b', 'c'],
                                                    categories=categories,
                                                    name='A'),
                          name='B')
+    if observed:
+        expected = expected[expected != 0]
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 160b60e69f39d..e0793b8e1bd64 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -5,16 +5,43 @@
 import pytest
 
 import numpy as np
-from numpy import nan
-
 import pandas as pd
 from pandas import (Index, MultiIndex, CategoricalIndex,
-                    DataFrame, Categorical, Series, Interval, qcut)
+                    DataFrame, Categorical, Series, qcut)
 from pandas.util.testing import assert_frame_equal, assert_series_equal
 import pandas.util.testing as tm
 
 
-def test_groupby():
+def cartesian_product_for_groupers(result, args, names):
+    """ Reindex to a cartesian product for the groupers,
+    preserving the nature (Categorical) of each grouper """
+
+    def f(a):
+        if isinstance(a, (CategoricalIndex, Categorical)):
+            categories = a.categories
+            a = Categorical.from_codes(np.arange(len(categories)),
+                                       categories=categories,
+                                       ordered=a.ordered)
+        return a
+
+    index = pd.MultiIndex.from_product(map(f, args), names=names)
+    return result.reindex(index).sort_index()
+
+
+def test_apply_use_categorical_name(df):
+    cats = qcut(df.C, 4)
+
+    def get_stats(group):
+        return {'min': group.min(),
+                'max': group.max(),
+                'count': group.count(),
+                'mean': group.mean()}
+
+    result = df.groupby(cats, observed=False).D.apply(get_stats)
+    assert result.index.names[0] == 'C'
+
+
+def test_basic():
 
     cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                        categories=["a", "b", "c", "d"], ordered=True)
@@ -22,56 +49,29 @@ def test_groupby():
 
     exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
     expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
-    result = data.groupby("b").mean()
+    result = data.groupby("b", observed=False).mean()
     tm.assert_frame_equal(result, expected)
 
-    raw_cat1 = Categorical(["a", "a", "b", "b"],
-                           categories=["a", "b", "z"], ordered=True)
-    raw_cat2 = Categorical(["c", "d", "c", "d"],
-                           categories=["c", "d", "y"], ordered=True)
-    df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})
+    cat1 = Categorical(["a", "a", "b", "b"],
+                       categories=["a", "b", "z"], ordered=True)
+    cat2 = Categorical(["c", "d", "c", "d"],
+                       categories=["c", "d", "y"], ordered=True)
+    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
 
     # single grouper
-    gb = df.groupby("A")
+    gb = df.groupby("A", observed=False)
     exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
     expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
     result = gb.sum()
     tm.assert_frame_equal(result, expected)
 
-    # multiple groupers
-    gb = df.groupby(['A', 'B'])
-    exp_index = pd.MultiIndex.from_product(
-        [Categorical(["a", "b", "z"], ordered=True),
-         Categorical(["c", "d", "y"], ordered=True)],
-        names=['A', 'B'])
-    expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan,
-                                     np.nan, np.nan, np.nan]},
-                         index=exp_index)
-    result = gb.sum()
-    tm.assert_frame_equal(result, expected)
-
-    # multiple groupers with a non-cat
-    df = df.copy()
-    df['C'] = ['foo', 'bar'] * 2
-    gb = df.groupby(['A', 'B', 'C'])
-    exp_index = pd.MultiIndex.from_product(
-        [Categorical(["a", "b", "z"], ordered=True),
-         Categorical(["c", "d", "y"], ordered=True),
-         ['foo', 'bar']],
-        names=['A', 'B', 'C'])
-    expected = DataFrame({'values': Series(
-        np.nan, index=exp_index)}).sort_index()
-    expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4]
-    result = gb.sum()
-    tm.assert_frame_equal(result, expected)
-
     # GH 8623
     x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                    [1, 'John P. Doe']],
                   columns=['person_id', 'person_name'])
     x['person_name'] = Categorical(x.person_name)
 
-    g = x.groupby(['person_id'])
+    g = x.groupby(['person_id'], observed=False)
     result = g.transform(lambda x: x)
     tm.assert_frame_equal(result, x[['person_name']])
 
@@ -93,36 +93,48 @@ def f(x):
     df = DataFrame({"a": [5, 15, 25]})
     c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
 
-    result = df.a.groupby(c).transform(sum)
+    result = df.a.groupby(c, observed=False).transform(sum)
     tm.assert_series_equal(result, df['a'])
 
     tm.assert_series_equal(
-        df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
-    tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
+        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
+        df['a'])
+    tm.assert_frame_equal(
+        df.groupby(c, observed=False).transform(sum),
+        df[['a']])
     tm.assert_frame_equal(
-        df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']])
+        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
+        df[['a']])
 
     # Filter
-    tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a'])
-    tm.assert_frame_equal(df.groupby(c).filter(np.all), df)
+    tm.assert_series_equal(
+        df.a.groupby(c, observed=False).filter(np.all),
+        df['a'])
+    tm.assert_frame_equal(
+        df.groupby(c, observed=False).filter(np.all),
+        df)
 
     # Non-monotonic
     df = DataFrame({"a": [5, 15, 25, -5]})
     c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
 
-    result = df.a.groupby(c).transform(sum)
+    result = df.a.groupby(c, observed=False).transform(sum)
     tm.assert_series_equal(result, df['a'])
 
     tm.assert_series_equal(
-        df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a'])
-    tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']])
+        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
+        df['a'])
     tm.assert_frame_equal(
-        df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']])
+        df.groupby(c, observed=False).transform(sum),
+        df[['a']])
+    tm.assert_frame_equal(
+        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
+        df[['a']])
 
     # GH 9603
     df = DataFrame({'a': [1, 0, 0, 0]})
     c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
-    result = df.groupby(c).apply(len)
+    result = df.groupby(c, observed=False).apply(len)
 
     exp_index = CategoricalIndex(
         c.values.categories, ordered=c.values.ordered)
@@ -130,36 +142,56 @@ def f(x):
     expected.index.name = 'a'
     tm.assert_series_equal(result, expected)
 
+    # more basic
+    levels = ['foo', 'bar', 'baz', 'qux']
+    codes = np.random.randint(0, 4, size=100)
 
-def test_groupby_sort():
+    cats = Categorical.from_codes(codes, levels, ordered=True)
 
-    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby
-    # This should result in a properly sorted Series so that the plot
-    # has a sorted x axis
-    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
+    data = DataFrame(np.random.randn(100, 4))
 
-    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
-    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
-    cat_labels = Categorical(labels, labels)
+    result = data.groupby(cats, observed=False).mean()
 
-    df = df.sort_values(by=['value'], ascending=True)
-    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
-                               right=False, labels=cat_labels)
+    expected = data.groupby(np.asarray(cats), observed=False).mean()
+    exp_idx = CategoricalIndex(levels, categories=cats.categories,
+                               ordered=True)
+    expected = expected.reindex(exp_idx)
 
-    res = df.groupby(['value_group'])['value_group'].count()
-    exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
-    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
-    tm.assert_series_equal(res, exp)
+    assert_frame_equal(result, expected)
 
+    grouped = data.groupby(cats, observed=False)
+    desc_result = grouped.describe()
 
-def test_level_groupby_get_group():
+    idx = cats.codes.argsort()
+    ord_labels = np.asarray(cats).take(idx)
+    ord_data = data.take(idx)
+
+    exp_cats = Categorical(ord_labels, ordered=True,
+                           categories=['foo', 'bar', 'baz', 'qux'])
+    expected = ord_data.groupby(
+        exp_cats, sort=False, observed=False).describe()
+    assert_frame_equal(desc_result, expected)
+
+    # GH 10460
+    expc = Categorical.from_codes(np.arange(4).repeat(8),
+                                  levels, ordered=True)
+    exp = CategoricalIndex(expc)
+    tm.assert_index_equal((desc_result.stack().index
+                           .get_level_values(0)), exp)
+    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
+                 '75%', 'max'] * 4)
+    tm.assert_index_equal((desc_result.stack().index
+                           .get_level_values(1)), exp)
+
+
+def test_level_get_group(observed):
     # GH15155
     df = DataFrame(data=np.arange(2, 22, 2),
                    index=MultiIndex(
                        levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
                        labels=[[0] * 5 + [1] * 5, range(10)],
                        names=["Index1", "Index2"]))
-    g = df.groupby(level=["Index1"])
+    g = df.groupby(level=["Index1"], observed=observed)
 
     # expected should equal test.loc[["a"]]
     # GH15166
@@ -173,94 +205,217 @@ def test_level_groupby_get_group():
     assert_frame_equal(result, expected)
 
 
-def test_apply_use_categorical_name(df):
-    cats = qcut(df.C, 4)
+@pytest.mark.parametrize('ordered', [True, False])
+def test_apply(ordered):
+    # GH 10138
 
-    def get_stats(group):
-        return {'min': group.min(),
-                'max': group.max(),
-                'count': group.count(),
-                'mean': group.mean()}
+    dense = Categorical(list('abc'), ordered=ordered)
+
+    # 'b' is in the categories but not in the list
+    missing = Categorical(
+        list('aaa'), categories=['a', 'b'], ordered=ordered)
+    values = np.arange(len(dense))
+    df = DataFrame({'missing': missing,
+                    'dense': dense,
+                    'values': values})
+    grouped = df.groupby(['missing', 'dense'], observed=True)
+
+    # observed=True: the unobserved category 'b' yields no rows in the output
+    idx = MultiIndex.from_arrays(
+        [missing, dense], names=['missing', 'dense'])
+    expected = DataFrame([0, 1, 2.],
+                         index=idx,
+                         columns=['values'])
+
+    result = grouped.apply(lambda x: np.mean(x))
+    assert_frame_equal(result, expected)
 
-    result = df.groupby(cats).D.apply(get_stats)
-    assert result.index.names[0] == 'C'
+    # we coerce back to ints
+    expected = expected.astype('int')
+    result = grouped.mean()
+    assert_frame_equal(result, expected)
 
+    result = grouped.agg(np.mean)
+    assert_frame_equal(result, expected)
 
-def test_apply_categorical_data():
-    # GH 10138
-    for ordered in [True, False]:
-        dense = Categorical(list('abc'), ordered=ordered)
-        # 'b' is in the categories but not in the list
-        missing = Categorical(
-            list('aaa'), categories=['a', 'b'], ordered=ordered)
-        values = np.arange(len(dense))
-        df = DataFrame({'missing': missing,
-                        'dense': dense,
-                        'values': values})
-        grouped = df.groupby(['missing', 'dense'])
-
-        # missing category 'b' should still exist in the output index
-        idx = MultiIndex.from_product(
-            [Categorical(['a', 'b'], ordered=ordered),
-             Categorical(['a', 'b', 'c'], ordered=ordered)],
-            names=['missing', 'dense'])
-        expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
-                             index=idx,
-                             columns=['values'])
-
-        assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
-        assert_frame_equal(grouped.mean(), expected)
-        assert_frame_equal(grouped.agg(np.mean), expected)
-
-        # but for transform we should still get back the original index
-        idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']],
-                                      names=['missing', 'dense'])
-        expected = Series(1, index=idx)
-        assert_series_equal(grouped.apply(lambda x: 1), expected)
-
-
-def test_groupby_categorical():
-    levels = ['foo', 'bar', 'baz', 'qux']
-    codes = np.random.randint(0, 4, size=100)
+    # but for transform we should still get back the original index
+    idx = MultiIndex.from_arrays([missing, dense],
+                                 names=['missing', 'dense'])
+    expected = Series(1, index=idx)
+    result = grouped.apply(lambda x: 1)
+    assert_series_equal(result, expected)
+
+
+def test_observed(observed):
+    # multiple groupers, don't re-expand the output space
+    # of the grouper
+    # gh-14942 (implement)
+    # gh-10132 (back-compat)
+    # gh-8138 (back-compat)
+    # gh-8869
+
+    cat1 = Categorical(["a", "a", "b", "b"],
+                       categories=["a", "b", "z"], ordered=True)
+    cat2 = Categorical(["c", "d", "c", "d"],
+                       categories=["c", "d", "y"], ordered=True)
+    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+    df['C'] = ['foo', 'bar'] * 2
 
-    cats = Categorical.from_codes(codes, levels, ordered=True)
+    # multiple groupers with a non-cat
+    gb = df.groupby(['A', 'B', 'C'], observed=observed)
+    exp_index = pd.MultiIndex.from_arrays(
+        [cat1, cat2, ['foo', 'bar'] * 2],
+        names=['A', 'B', 'C'])
+    expected = DataFrame({'values': Series(
+        [1, 2, 3, 4], index=exp_index)}).sort_index()
+    result = gb.sum()
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected,
+            [cat1, cat2, ['foo', 'bar']],
+            list('ABC'))
 
-    data = DataFrame(np.random.randn(100, 4))
+    tm.assert_frame_equal(result, expected)
 
-    result = data.groupby(cats).mean()
+    gb = df.groupby(['A', 'B'], observed=observed)
+    exp_index = pd.MultiIndex.from_arrays(
+        [cat1, cat2],
+        names=['A', 'B'])
+    expected = DataFrame({'values': [1, 2, 3, 4]},
+                         index=exp_index)
+    result = gb.sum()
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected,
+            [cat1, cat2],
+            list('AB'))
 
-    expected = data.groupby(np.asarray(cats)).mean()
-    exp_idx = CategoricalIndex(levels, categories=cats.categories,
-                               ordered=True)
-    expected = expected.reindex(exp_idx)
+    tm.assert_frame_equal(result, expected)
 
-    assert_frame_equal(result, expected)
+    # https://github.com/pandas-dev/pandas/issues/8138
+    d = {'cat':
+         pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
+                        ordered=True),
+         'ints': [1, 1, 2, 2],
+         'val': [10, 20, 30, 40]}
+    df = pd.DataFrame(d)
 
-    grouped = data.groupby(cats)
-    desc_result = grouped.describe()
+    # Grouping on a single column
+    groups_single_key = df.groupby("cat", observed=observed)
+    result = groups_single_key.mean()
 
-    idx = cats.codes.argsort()
-    ord_labels = np.asarray(cats).take(idx)
-    ord_data = data.take(idx)
+    exp_index = pd.CategoricalIndex(list('ab'), name="cat",
+                                    categories=list('abc'),
+                                    ordered=True)
+    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
+                         index=exp_index)
+    if not observed:
+        index = pd.CategoricalIndex(list('abc'), name="cat",
+                                    categories=list('abc'),
+                                    ordered=True)
+        expected = expected.reindex(index)
 
-    exp_cats = Categorical(ord_labels, ordered=True,
-                           categories=['foo', 'bar', 'baz', 'qux'])
-    expected = ord_data.groupby(exp_cats, sort=False).describe()
-    assert_frame_equal(desc_result, expected)
+    tm.assert_frame_equal(result, expected)
 
-    # GH 10460
-    expc = Categorical.from_codes(np.arange(4).repeat(8),
-                                  levels, ordered=True)
-    exp = CategoricalIndex(expc)
-    tm.assert_index_equal((desc_result.stack().index
-                           .get_level_values(0)), exp)
-    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
-                 '75%', 'max'] * 4)
-    tm.assert_index_equal((desc_result.stack().index
-                           .get_level_values(1)), exp)
+    # Grouping on two columns
+    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
+    result = groups_double_key.agg('mean')
+    expected = DataFrame(
+        {"val": [10, 30, 20, 40],
+         "cat": pd.Categorical(['a', 'a', 'b', 'b'],
+                               categories=['a', 'b', 'c'],
+                               ordered=True),
+         "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected,
+            [df.cat.values, [1, 2]],
+            ['cat', 'ints'])
+
+    tm.assert_frame_equal(result, expected)
 
+    # GH 10132
+    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
+        c, i = key
+        result = groups_double_key.get_group(key)
+        expected = df[(df.cat == c) & (df.ints == i)]
+        assert_frame_equal(result, expected)
+
+    # gh-8869
+    # with as_index
+    d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70],
+         'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
+    df = pd.DataFrame(d)
+    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
+    df['range'] = cat
+    groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
+    result = groups.agg('mean')
+
+    groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
+    expected = groups2.agg('mean').reset_index()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_observed_codes_remap(observed):
+    d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
+    df = pd.DataFrame(d)
+    values = pd.cut(df['C1'], [1, 2, 3, 6])
+    values.name = "cat"
+    groups_double_key = df.groupby([values, 'C2'], observed=observed)
+
+    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
+                                 names=["cat", "C2"])
+    expected = DataFrame({"C1": [3, 3, 4, 5],
+                          "C3": [10, 100, 200, 34]}, index=idx)
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected,
+            [values.values, [1, 2, 3, 4]],
+            ['cat', 'C2'])
+
+    result = groups_double_key.agg('mean')
+    tm.assert_frame_equal(result, expected)
+
+
+def test_observed_perf():
+    # we create a cartesian product, so this is
+    # non-performant if we don't use observed values
+    # gh-14942
+    df = DataFrame({
+        'cat': np.random.randint(0, 255, size=30000),
+        'int_id': np.random.randint(0, 255, size=30000),
+        'other_id': np.random.randint(0, 10000, size=30000),
+        'foo': 0})
+    df['cat'] = df.cat.astype(str).astype('category')
 
-def test_groupby_datetime_categorical():
+    grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True)
+    result = grouped.count()
+    assert result.index.levels[0].nunique() == df.cat.nunique()
+    assert result.index.levels[1].nunique() == df.int_id.nunique()
+    assert result.index.levels[2].nunique() == df.other_id.nunique()
+
+
+def test_observed_groups(observed):
+    # gh-20583
+    # test that we have the appropriate groups
+
+    cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
+    df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
+    g = df.groupby('cat', observed=observed)
+
+    result = g.groups
+    if observed:
+        expected = {'a': Index([0, 2], dtype='int64'),
+                    'c': Index([1], dtype='int64')}
+    else:
+        expected = {'a': Index([0, 2], dtype='int64'),
+                    'b': Index([], dtype='int64'),
+                    'c': Index([1], dtype='int64')}
+
+    tm.assert_dict_equal(result, expected)
+
+
+def test_datetime():
     # GH9049: ensure backward compatibility
     levels = pd.date_range('2014-01-01', periods=4)
     codes = np.random.randint(0, 4, size=100)
@@ -268,9 +423,9 @@ def test_groupby_datetime_categorical():
     cats = Categorical.from_codes(codes, levels, ordered=True)
 
     data = DataFrame(np.random.randn(100, 4))
-    result = data.groupby(cats).mean()
+    result = data.groupby(cats, observed=False).mean()
 
-    expected = data.groupby(np.asarray(cats)).mean()
+    expected = data.groupby(np.asarray(cats), observed=False).mean()
     expected = expected.reindex(levels)
     expected.index = CategoricalIndex(expected.index,
                                       categories=expected.index,
@@ -278,13 +433,13 @@ def test_groupby_datetime_categorical():
 
     assert_frame_equal(result, expected)
 
-    grouped = data.groupby(cats)
+    grouped = data.groupby(cats, observed=False)
     desc_result = grouped.describe()
 
     idx = cats.codes.argsort()
     ord_labels = cats.take_nd(idx)
     ord_data = data.take(idx)
-    expected = ord_data.groupby(ord_labels).describe()
+    expected = ord_data.groupby(ord_labels, observed=False).describe()
     assert_frame_equal(desc_result, expected)
     tm.assert_index_equal(desc_result.index, expected.index)
     tm.assert_index_equal(
@@ -303,7 +458,7 @@ def test_groupby_datetime_categorical():
                            .get_level_values(1)), exp)
 
 
-def test_groupby_categorical_index():
+def test_categorical_index():
 
     s = np.random.RandomState(12345)
     levels = ['foo', 'bar', 'baz', 'qux']
@@ -315,23 +470,23 @@ def test_groupby_categorical_index():
     df['cats'] = cats
 
     # with a cat index
-    result = df.set_index('cats').groupby(level=0).sum()
-    expected = df[list('abcd')].groupby(cats.codes).sum()
+    result = df.set_index('cats').groupby(level=0, observed=False).sum()
+    expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
     expected.index = CategoricalIndex(
         Categorical.from_codes(
             [0, 1, 2, 3], levels, ordered=True), name='cats')
     assert_frame_equal(result, expected)
 
     # with a cat column, should produce a cat index
-    result = df.groupby('cats').sum()
-    expected = df[list('abcd')].groupby(cats.codes).sum()
+    result = df.groupby('cats', observed=False).sum()
+    expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
     expected.index = CategoricalIndex(
         Categorical.from_codes(
             [0, 1, 2, 3], levels, ordered=True), name='cats')
     assert_frame_equal(result, expected)
 
 
-def test_groupby_describe_categorical_columns():
+def test_describe_categorical_columns():
     # GH 11558
     cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
                                categories=['foo', 'bar', 'baz', 'qux'],
@@ -343,14 +498,15 @@ def test_groupby_describe_categorical_columns():
     tm.assert_categorical_equal(result.stack().columns.values, cats.values)
 
 
-def test_groupby_unstack_categorical():
+def test_unstack_categorical():
     # GH11558 (example is taken from the original issue)
     df = pd.DataFrame({'a': range(10),
                        'medium': ['A', 'B'] * 5,
                        'artist': list('XYXXY') * 2})
     df['medium'] = df['medium'].astype('category')
 
-    gcat = df.groupby(['artist', 'medium'])['a'].count().unstack()
+    gcat = df.groupby(
+        ['artist', 'medium'], observed=False)['a'].count().unstack()
     result = gcat.describe()
 
     exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
@@ -363,7 +519,7 @@ def test_groupby_unstack_categorical():
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_bins_unequal_len():
+def test_bins_unequal_len():
     # GH3011
     series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
     bins = pd.cut(series.dropna().values, 4)
@@ -374,47 +530,45 @@ def f():
     pytest.raises(ValueError, f)
 
 
-def test_groupby_multi_categorical_as_index():
+def test_as_index():
     # GH13204
     df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                     'A': [10, 11, 11],
                     'B': [101, 102, 103]})
-    result = df.groupby(['cat', 'A'], as_index=False).sum()
-    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
-                          'A': [10, 11, 10, 11, 10, 11],
-                          'B': [101.0, nan, nan, 205.0, nan, nan]},
-                         columns=['cat', 'A', 'B'])
+    result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum()
+    expected = DataFrame(
+        {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
+         'A': [10, 11],
+         'B': [101, 205]},
+        columns=['cat', 'A', 'B'])
     tm.assert_frame_equal(result, expected)
 
     # function grouper
     f = lambda r: df.loc[r, 'A']
-    result = df.groupby(['cat', f], as_index=False).sum()
-    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
-                          'A': [10.0, nan, nan, 22.0, nan, nan],
-                          'B': [101.0, nan, nan, 205.0, nan, nan]},
-                         columns=['cat', 'A', 'B'])
+    result = df.groupby(['cat', f], as_index=False, observed=True).sum()
+    expected = DataFrame(
+        {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
+         'A': [10, 22],
+         'B': [101, 205]},
+        columns=['cat', 'A', 'B'])
     tm.assert_frame_equal(result, expected)
 
     # another not in-axis grouper
     s = Series(['a', 'b', 'b'], name='cat2')
-    result = df.groupby(['cat', s], as_index=False).sum()
-    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
-                          'A': [10.0, nan, nan, 22.0, nan, nan],
-                          'B': [101.0, nan, nan, 205.0, nan, nan]},
-                         columns=['cat', 'A', 'B'])
+    result = df.groupby(['cat', s], as_index=False, observed=True).sum()
     tm.assert_frame_equal(result, expected)
 
     # GH18872: conflicting names in desired index
-    pytest.raises(ValueError, lambda: df.groupby(['cat',
-                                                  s.rename('cat')]).sum())
+    with pytest.raises(ValueError):
+        df.groupby(['cat', s.rename('cat')], observed=True).sum()
 
     # is original index dropped?
-    expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
-                          'A': [10, 11, 10, 11, 10, 11],
-                          'B': [101.0, nan, nan, 205.0, nan, nan]},
-                         columns=['cat', 'A', 'B'])
-
     group_columns = ['cat', 'A']
+    expected = DataFrame(
+        {'cat': Categorical([1, 2], categories=df.cat.cat.categories),
+         'A': [10, 11],
+         'B': [101, 205]},
+        columns=['cat', 'A', 'B'])
 
     for name in [None, 'X', 'B', 'cat']:
         df.index = Index(list("abc"), name=name)
@@ -422,15 +576,17 @@ def test_groupby_multi_categorical_as_index():
         if name in group_columns and name in df.index.names:
             with tm.assert_produces_warning(FutureWarning,
                                             check_stacklevel=False):
-                result = df.groupby(group_columns, as_index=False).sum()
+                result = df.groupby(
+                    group_columns, as_index=False, observed=True).sum()
 
         else:
-            result = df.groupby(group_columns, as_index=False).sum()
+            result = df.groupby(
+                group_columns, as_index=False, observed=True).sum()
 
-        tm.assert_frame_equal(result, expected, check_index_type=True)
+        tm.assert_frame_equal(result, expected)
 
 
-def test_groupby_preserve_categories():
+def test_preserve_categories():
     # GH-13179
     categories = list('abc')
 
@@ -439,8 +595,10 @@ def test_groupby_preserve_categories():
                                         categories=categories,
                                         ordered=True)})
     index = pd.CategoricalIndex(categories, categories, ordered=True)
-    tm.assert_index_equal(df.groupby('A', sort=True).first().index, index)
-    tm.assert_index_equal(df.groupby('A', sort=False).first().index, index)
+    tm.assert_index_equal(
+        df.groupby('A', sort=True, observed=False).first().index, index)
+    tm.assert_index_equal(
+        df.groupby('A', sort=False, observed=False).first().index, index)
 
     # ordered=False
     df = DataFrame({'A': pd.Categorical(list('ba'),
@@ -449,13 +607,15 @@ def test_groupby_preserve_categories():
     sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
     nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
                                        ordered=False)
-    tm.assert_index_equal(df.groupby('A', sort=True).first().index,
-                          sort_index)
-    tm.assert_index_equal(df.groupby('A', sort=False).first().index,
-                          nosort_index)
+    tm.assert_index_equal(
+        df.groupby('A', sort=True, observed=False).first().index,
+        sort_index)
+    tm.assert_index_equal(
+        df.groupby('A', sort=False, observed=False).first().index,
+        nosort_index)
 
 
-def test_groupby_preserve_categorical_dtype():
+def test_preserve_categorical_dtype():
     # GH13743, GH13854
     df = DataFrame({'A': [1, 2, 1, 1, 2],
                     'B': [10, 16, 22, 28, 34],
@@ -475,38 +635,22 @@ def test_groupby_preserve_categorical_dtype():
                                             categories=list("bac"),
                                             ordered=True)})
     for col in ['C1', 'C2']:
-        result1 = df.groupby(by=col, as_index=False).mean()
-        result2 = df.groupby(by=col, as_index=True).mean().reset_index()
-        expected = exp_full.reindex(columns=result1.columns)
-        tm.assert_frame_equal(result1, expected)
-        tm.assert_frame_equal(result2, expected)
-
-    # multiple grouper
-    exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2],
-                          'B': [np.nan, 20.0, np.nan, 25.0, np.nan,
-                                np.nan],
-                          'C1': Categorical(list("bacbac"),
-                                            categories=list("bac"),
-                                            ordered=False),
-                          'C2': Categorical(list("bacbac"),
-                                            categories=list("bac"),
-                                            ordered=True)})
-    for cols in [['A', 'C1'], ['A', 'C2']]:
-        result1 = df.groupby(by=cols, as_index=False).mean()
-        result2 = df.groupby(by=cols, as_index=True).mean().reset_index()
+        result1 = df.groupby(by=col, as_index=False, observed=False).mean()
+        result2 = df.groupby(
+            by=col, as_index=True, observed=False).mean().reset_index()
         expected = exp_full.reindex(columns=result1.columns)
         tm.assert_frame_equal(result1, expected)
         tm.assert_frame_equal(result2, expected)
 
 
-def test_groupby_categorical_no_compress():
+def test_categorical_no_compress():
     data = Series(np.random.randn(9))
 
     codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
     cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
 
-    result = data.groupby(cats).mean()
-    exp = data.groupby(codes).mean()
+    result = data.groupby(cats, observed=False).mean()
+    exp = data.groupby(codes, observed=False).mean()
 
     exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                  ordered=cats.ordered)
@@ -515,8 +659,8 @@ def test_groupby_categorical_no_compress():
     codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
     cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
 
-    result = data.groupby(cats).mean()
-    exp = data.groupby(codes).mean().reindex(cats.categories)
+    result = data.groupby(cats, observed=False).mean()
+    exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
     exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                  ordered=cats.ordered)
     assert_series_equal(result, exp)
@@ -525,13 +669,34 @@ def test_groupby_categorical_no_compress():
                        categories=["a", "b", "c", "d"], ordered=True)
     data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
 
-    result = data.groupby("b").mean()
+    result = data.groupby("b", observed=False).mean()
     result = result["a"].values
     exp = np.array([1, 2, 4, np.nan])
     tm.assert_numpy_array_equal(result, exp)
 
 
-def test_groupby_sort_categorical():
+def test_sort():
+
+    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby  # noqa: flake8
+    # This should result in a properly sorted Series so that the plot
+    # has a sorted x axis
+    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
+
+    df = DataFrame({'value': np.random.randint(0, 10000, 100)})
+    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
+    cat_labels = Categorical(labels, labels)
+
+    df = df.sort_values(by=['value'], ascending=True)
+    df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
+                               right=False, labels=cat_labels)
+
+    res = df.groupby(['value_group'], observed=False)['value_group'].count()
+    exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
+    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
+    tm.assert_series_equal(res, exp)
+
+
+def test_sort2():
     # dataframe groupby sort was being ignored # GH 8868
     df = DataFrame([['(7.5, 10]', 10, 10],
                     ['(7.5, 10]', 8, 20],
@@ -543,35 +708,43 @@ def test_groupby_sort_categorical():
     df['range'] = Categorical(df['range'], ordered=True)
     index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                               '(7.5, 10]'], name='range', ordered=True)
-    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
-                            columns=['foo', 'bar'], index=index)
+    expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                              columns=['foo', 'bar'], index=index)
 
     col = 'range'
-    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    result_sort = df.groupby(col, sort=True, observed=False).first()
+    assert_frame_equal(result_sort, expected_sort)
+
     # when categories is ordered, group is ordered by category's order
-    assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+    expected_sort = result_sort
+    result_sort = df.groupby(col, sort=False, observed=False).first()
+    assert_frame_equal(result_sort, expected_sort)
 
     df['range'] = Categorical(df['range'], ordered=False)
     index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                               '(7.5, 10]'], name='range')
-    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
-                            columns=['foo', 'bar'], index=index)
+    expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                              columns=['foo', 'bar'], index=index)
 
     index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
                               '(0, 2.5]'],
                              categories=['(7.5, 10]', '(2.5, 5]',
                                          '(5, 7.5]', '(0, 2.5]'],
                              name='range')
-    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
-                              index=index, columns=['foo', 'bar'])
+    expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                                index=index, columns=['foo', 'bar'])
 
     col = 'range'
+
     # this is an unordered categorical, but we allow this ####
-    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
-    assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+    result_sort = df.groupby(col, sort=True, observed=False).first()
+    assert_frame_equal(result_sort, expected_sort)
+
+    result_nosort = df.groupby(col, sort=False, observed=False).first()
+    assert_frame_equal(result_nosort, expected_nosort)
 
 
-def test_groupby_sort_categorical_datetimelike():
+def test_sort_datetimelike():
     # GH10505
 
     # use same data as test_groupby_sort_categorical, which category is
@@ -600,9 +773,12 @@ def test_groupby_sort_categorical_datetimelike():
                                            name='dt', ordered=True)
 
     col = 'dt'
-    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    assert_frame_equal(
+        result_sort, df.groupby(col, sort=True, observed=False).first())
+
     # when categories is ordered, group is ordered by category's order
-    assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+    assert_frame_equal(
+        result_sort, df.groupby(col, sort=False, observed=False).first())
 
     # ordered = False
     df['dt'] = Categorical(df['dt'], ordered=False)
@@ -620,65 +796,10 @@ def test_groupby_sort_categorical_datetimelike():
                                            name='dt')
 
     col = 'dt'
-    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
-    assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
-
-
-def test_groupby_categorical_two_columns():
-
-    # https://github.com/pandas-dev/pandas/issues/8138
-    d = {'cat':
-         pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
-                        ordered=True),
-         'ints': [1, 1, 2, 2],
-         'val': [10, 20, 30, 40]}
-    test = pd.DataFrame(d)
-
-    # Grouping on a single column
-    groups_single_key = test.groupby("cat")
-    res = groups_single_key.agg('mean')
-
-    exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat",
-                                    ordered=True)
-    exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]},
-                    index=exp_index)
-    tm.assert_frame_equal(res, exp)
-
-    # Grouping on two columns
-    groups_double_key = test.groupby(["cat", "ints"])
-    res = groups_double_key.agg('mean')
-    exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan],
-                     "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"],
-                                           ordered=True),
-                     "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints"
-                                                             ])
-    tm.assert_frame_equal(res, exp)
-
-    # GH 10132
-    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
-        c, i = key
-        result = groups_double_key.get_group(key)
-        expected = test[(test.cat == c) & (test.ints == i)]
-        assert_frame_equal(result, expected)
-
-    d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
-    test = pd.DataFrame(d)
-    values = pd.cut(test['C1'], [1, 2, 3, 6])
-    values.name = "cat"
-    groups_double_key = test.groupby([values, 'C2'])
-
-    res = groups_double_key.agg('mean')
-    nan = np.nan
-    idx = MultiIndex.from_product(
-        [Categorical([Interval(1, 2), Interval(2, 3),
-                      Interval(3, 6)], ordered=True),
-         [1, 2, 3, 4]],
-        names=["cat", "C2"])
-    exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
-                            nan, nan, nan, nan, 4, 5],
-                     "C3": [nan, nan, nan, nan, 10, 100,
-                            nan, nan, nan, nan, 200, 34]}, index=idx)
-    tm.assert_frame_equal(res, exp)
+    assert_frame_equal(
+        result_sort, df.groupby(col, sort=True, observed=False).first())
+    assert_frame_equal(
+        result_nosort, df.groupby(col, sort=False, observed=False).first())
 
 
 def test_empty_sum():
@@ -689,22 +810,22 @@ def test_empty_sum():
     expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
 
     # 0 by default
-    result = df.groupby("A").B.sum()
+    result = df.groupby("A", observed=False).B.sum()
     expected = pd.Series([3, 1, 0], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
 
     # min_count=0
-    result = df.groupby("A").B.sum(min_count=0)
+    result = df.groupby("A", observed=False).B.sum(min_count=0)
     expected = pd.Series([3, 1, 0], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
 
     # min_count=1
-    result = df.groupby("A").B.sum(min_count=1)
+    result = df.groupby("A", observed=False).B.sum(min_count=1)
     expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
 
     # min_count>1
-    result = df.groupby("A").B.sum(min_count=2)
+    result = df.groupby("A", observed=False).B.sum(min_count=2)
     expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
 
@@ -718,16 +839,16 @@ def test_empty_prod():
     expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
 
     # 1 by default
-    result = df.groupby("A").B.prod()
+    result = df.groupby("A", observed=False).B.prod()
     expected = pd.Series([2, 1, 1], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
 
     # min_count=0
-    result = df.groupby("A").B.prod(min_count=0)
+    result = df.groupby("A", observed=False).B.prod(min_count=0)
     expected = pd.Series([2, 1, 1], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
 
     # min_count=1
-    result = df.groupby("A").B.prod(min_count=1)
+    result = df.groupby("A", observed=False).B.prod(min_count=1)
     expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index ba1371fe9f931..f1d678db4ff7f 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -313,14 +313,14 @@ def test_cython_median():
     tm.assert_frame_equal(rs, xp)
 
 
-def test_median_empty_bins():
+def test_median_empty_bins(observed):
     df = pd.DataFrame(np.random.randint(0, 44, 500))
 
     grps = range(0, 55, 5)
     bins = pd.cut(df[0], grps)
 
-    result = df.groupby(bins).median()
-    expected = df.groupby(bins).agg(lambda x: x.median())
+    result = df.groupby(bins, observed=observed).median()
+    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 743237f5b386c..c0f5c43b2fd35 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -251,7 +251,7 @@ def test_groupby_levels_and_columns(self):
         by_columns.columns = pd.Index(by_columns.columns, dtype=np.int64)
         tm.assert_frame_equal(by_levels, by_columns)
 
-    def test_groupby_categorical_index_and_columns(self):
+    def test_groupby_categorical_index_and_columns(self, observed):
         # GH18432
         columns = ['A', 'B', 'A', 'B']
         categories = ['B', 'A']
@@ -260,17 +260,26 @@ def test_groupby_categorical_index_and_columns(self):
                                        categories=categories,
                                        ordered=True)
         df = DataFrame(data=data, columns=cat_columns)
-        result = df.groupby(axis=1, level=0).sum()
+        result = df.groupby(axis=1, level=0, observed=observed).sum()
         expected_data = 2 * np.ones((5, 2), int)
-        expected_columns = CategoricalIndex(categories,
-                                            categories=categories,
-                                            ordered=True)
+
+        if observed:
+            # only the non-observed result undergoes a reindex, which is
+            # what the expected columns in the else branch assume; for
+            # the observed result the expected order is adjusted here
+            expected_columns = CategoricalIndex(['A', 'B'],
+                                                categories=categories,
+                                                ordered=True)
+        else:
+            expected_columns = CategoricalIndex(categories,
+                                                categories=categories,
+                                                ordered=True)
         expected = DataFrame(data=expected_data, columns=expected_columns)
         assert_frame_equal(result, expected)
 
         # test transposed version
         df = DataFrame(data.T, index=cat_columns)
-        result = df.groupby(axis=0, level=0).sum()
+        result = df.groupby(axis=0, level=0, observed=observed).sum()
         expected = DataFrame(data=expected_data.T, index=expected_columns)
         assert_frame_equal(result, expected)
 
@@ -572,11 +581,11 @@ def test_get_group(self):
         pytest.raises(ValueError,
                       lambda: g.get_group(('foo', 'bar', 'baz')))
 
-    def test_get_group_empty_bins(self):
+    def test_get_group_empty_bins(self, observed):
 
         d = pd.DataFrame([3, 1, 7, 6])
         bins = [0, 5, 10, 15]
-        g = d.groupby(pd.cut(d[0], bins))
+        g = d.groupby(pd.cut(d[0], bins), observed=observed)
 
         # TODO: should prob allow a str of Interval work as well
         # IOW '(0, 5]'
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 1004b40bfb4c1..76cdc1d2a195d 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -93,23 +93,24 @@ def test_pivot_table_dropna(self):
 
     def test_pivot_table_categorical(self):
 
-        raw_cat1 = Categorical(["a", "a", "b", "b"],
-                               categories=["a", "b", "z"], ordered=True)
-        raw_cat2 = Categorical(["c", "d", "c", "d"],
-                               categories=["c", "d", "y"], ordered=True)
-        df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]})
-        result = pd.pivot_table(df, values='values', index=['A', 'B'])
-
-        exp_index = pd.MultiIndex.from_product(
-            [Categorical(["a", "b", "z"], ordered=True),
-             Categorical(["c", "d", "y"], ordered=True)],
+        cat1 = Categorical(["a", "a", "b", "b"],
+                           categories=["a", "b", "z"], ordered=True)
+        cat2 = Categorical(["c", "d", "c", "d"],
+                           categories=["c", "d", "y"], ordered=True)
+        df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+        result = pd.pivot_table(df, values='values', index=['A', 'B'],
+                                dropna=True)
+
+        exp_index = pd.MultiIndex.from_arrays(
+            [cat1, cat2],
             names=['A', 'B'])
         expected = DataFrame(
-            {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]},
+            {'values': [1, 2, 3, 4]},
             index=exp_index)
         tm.assert_frame_equal(result, expected)
 
-    def test_pivot_table_dropna_categoricals(self):
+    @pytest.mark.parametrize('dropna', [True, False])
+    def test_pivot_table_dropna_categoricals(self, dropna):
         # GH 15193
         categories = ['a', 'b', 'c', 'd']
 
@@ -118,30 +119,23 @@ def test_pivot_table_dropna_categoricals(self):
                         'C': range(0, 9)})
 
         df['A'] = df['A'].astype(CDT(categories, ordered=False))
-        result_true = df.pivot_table(index='B', columns='A', values='C',
-                                     dropna=True)
+        result = df.pivot_table(index='B', columns='A', values='C',
+                                dropna=dropna)
         expected_columns = Series(['a', 'b', 'c'], name='A')
         expected_columns = expected_columns.astype(
             CDT(categories, ordered=False))
         expected_index = Series([1, 2, 3], name='B')
-        expected_true = DataFrame([[0.0, 3.0, 6.0],
-                                   [1.0, 4.0, 7.0],
-                                   [2.0, 5.0, 8.0]],
-                                  index=expected_index,
-                                  columns=expected_columns,)
-        tm.assert_frame_equal(expected_true, result_true)
-
-        result_false = df.pivot_table(index='B', columns='A', values='C',
-                                      dropna=False)
-        expected_columns = (
-            Series(['a', 'b', 'c', 'd'], name='A').astype('category')
-        )
-        expected_false = DataFrame([[0.0, 3.0, 6.0, np.NaN],
-                                    [1.0, 4.0, 7.0, np.NaN],
-                                    [2.0, 5.0, 8.0, np.NaN]],
-                                   index=expected_index,
-                                   columns=expected_columns,)
-        tm.assert_frame_equal(expected_false, result_false)
+        expected = DataFrame([[0, 3, 6],
+                              [1, 4, 7],
+                              [2, 5, 8]],
+                             index=expected_index,
+                             columns=expected_columns,)
+        if not dropna:
+            # add back the unobserved category 'd' as an all-NaN column
+            expected = expected.reindex(
+                columns=Categorical(categories)).astype('float')
+
+        tm.assert_frame_equal(result, expected)
 
     def test_pass_array(self):
         result = self.data.pivot_table(
@@ -1068,7 +1062,7 @@ def test_pivot_table_margins_name_with_aggfunc_list(self):
 
     @pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to '
                               'ints)')
-    def test_categorical_margins(self):
+    def test_categorical_margins(self, observed):
         # GH 10989
         df = pd.DataFrame({'x': np.arange(8),
                            'y': np.arange(8) // 4,
@@ -1078,12 +1072,12 @@ def test_categorical_margins(self):
         expected.index = Index([0, 1, 'All'], name='y')
         expected.columns = Index([0, 1, 'All'], name='z')
 
-        table = df.pivot_table('x', 'y', 'z', margins=True)
+        table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True)
         tm.assert_frame_equal(table, expected)
 
     @pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to '
                               'ints)')
-    def test_categorical_margins_category(self):
+    def test_categorical_margins_category(self, observed):
         df = pd.DataFrame({'x': np.arange(8),
                            'y': np.arange(8) // 4,
                            'z': np.arange(8) % 2})
@@ -1094,16 +1088,17 @@ def test_categorical_margins_category(self):
 
         df.y = df.y.astype('category')
         df.z = df.z.astype('category')
-        table = df.pivot_table('x', 'y', 'z', margins=True)
+        table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True)
         tm.assert_frame_equal(table, expected)
 
-    def test_categorical_aggfunc(self):
+    def test_categorical_aggfunc(self, observed):
         # GH 9534
         df = pd.DataFrame({"C1": ["A", "B", "C", "C"],
                            "C2": ["a", "a", "b", "b"],
                            "V": [1, 2, 3, 4]})
         df["C1"] = df["C1"].astype("category")
-        result = df.pivot_table("V", index="C1", columns="C2", aggfunc="count")
+        result = df.pivot_table("V", index="C1", columns="C2",
+                                dropna=observed, aggfunc="count")
 
         expected_index = pd.CategoricalIndex(['A', 'B', 'C'],
                                              categories=['A', 'B', 'C'],
@@ -1118,7 +1113,7 @@ def test_categorical_aggfunc(self):
                                 columns=expected_columns)
         tm.assert_frame_equal(result, expected)
 
-    def test_categorical_pivot_index_ordering(self):
+    def test_categorical_pivot_index_ordering(self, observed):
         # GH 8731
         df = pd.DataFrame({'Sales': [100, 120, 220],
                            'Month': ['January', 'January', 'January'],
@@ -1130,18 +1125,19 @@ def test_categorical_pivot_index_ordering(self):
         result = df.pivot_table(values='Sales',
                                 index='Month',
                                 columns='Year',
+                                dropna=observed,
                                 aggfunc='sum')
         expected_columns = pd.Int64Index([2013, 2014], name='Year')
-        expected_index = pd.CategoricalIndex(months,
+        expected_index = pd.CategoricalIndex(['January'],
                                              categories=months,
                                              ordered=False,
                                              name='Month')
-        expected_data = np.empty((12, 2))
-        expected_data.fill(np.nan)
-        expected_data[0, :] = [320., 120.]
-        expected = pd.DataFrame(expected_data,
+        expected = pd.DataFrame([[320, 120]],
                                 index=expected_index,
                                 columns=expected_columns)
+        if not observed:
+            result = result.dropna().astype(np.int64)
+
         tm.assert_frame_equal(result, expected)
 
     def test_pivot_table_not_series(self):