ENH: Add dropna in groupby to allow NaN in keys (pandas-dev#30584)
charlesdong1991 authored and rhshadrach committed May 10, 2020
1 parent 2cbac0c commit 61b57a7
Showing 10 changed files with 477 additions and 3 deletions.
27 changes: 27 additions & 0 deletions doc/source/user_guide/groupby.rst
@@ -199,6 +199,33 @@ For example, the groups created by ``groupby()`` below are in the order they app
df3.groupby(['X']).get_group('B')
.. _groupby.dropna:

.. versionadded:: 1.1.0

GroupBy dropna
^^^^^^^^^^^^^^

By default ``NA`` values are excluded from group keys during the ``groupby`` operation. However,
if you want to include ``NA`` values in group keys, you can pass ``dropna=False`` to achieve it.

.. ipython:: python

    df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
    df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
    df_dropna

.. ipython:: python

    # Default `dropna` is set to True, which will exclude NaNs in keys
    df_dropna.groupby(by=["b"], dropna=True).sum()

    # In order to allow NaN in keys, set `dropna` to False
    df_dropna.groupby(by=["b"], dropna=False).sum()
The default setting of the ``dropna`` argument is ``True``, which means ``NA`` values are not included in group keys.


.. _groupby.attributes:

31 changes: 31 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -36,6 +36,37 @@ For example:
ser["2014"]
ser.loc["May 2015"]
.. _whatsnew_110.groupby_key:

Allow NA in groupby key
^^^^^^^^^^^^^^^^^^^^^^^^

With :ref:`groupby <groupby.dropna>`, we've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to
allow ``NA`` values in group keys. Users can set ``dropna`` to ``False`` if they want to include
``NA`` values in group keys. The default of ``dropna`` is set to ``True`` to maintain backwards
compatibility (:issue:`3729`).

.. ipython:: python

    df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
    df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
    df_dropna

.. ipython:: python

    # Default `dropna` is set to True, which will exclude NaNs in keys
    df_dropna.groupby(by=["b"], dropna=True).sum()

    # In order to allow NaN in keys, set `dropna` to False
    df_dropna.groupby(by=["b"], dropna=False).sum()
The default setting of the ``dropna`` argument is ``True``, which means ``NA`` values are not included in group keys.

.. versionadded:: 1.1.0


.. _whatsnew_110.key_sorting:

Sorting with keys
14 changes: 13 additions & 1 deletion pandas/core/algorithms.py
@@ -523,7 +523,11 @@ def _factorize_array(
),
)
def factorize(
values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None
values,
sort: bool = False,
na_sentinel: int = -1,
size_hint: Optional[int] = None,
dropna: bool = True,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
"""
Encode the object as an enumerated type or categorical variable.
@@ -649,6 +653,14 @@ def factorize(
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
)

code_is_na = codes == na_sentinel
if not dropna and code_is_na.any():
    # na_value is set based on the dtype of uniques; compat is set to False
    # because we do not want na_value to be 0 for integers
    na_value = na_value_for_dtype(uniques.dtype, compat=False)
    uniques = np.append(uniques, [na_value])
    codes = np.where(code_is_na, len(uniques) - 1, codes)

uniques = _reconstruct_data(uniques, dtype, original)

# return original tenor
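The block above is the heart of the change: once ``factorize`` has computed ``codes`` and ``uniques`` with missing values marked by ``na_sentinel``, passing ``dropna=False`` gives the NA entries a real slot at the end of ``uniques`` so they can later act as a group key. A minimal NumPy sketch of that remapping, with illustrative values (the real code chooses the NA value via ``na_value_for_dtype`` based on the dtype of ``uniques``):

    import numpy as np

    # Illustrative result of factorizing [1.0, NaN, 2.0, 1.0] with na_sentinel=-1
    na_sentinel = -1
    codes = np.array([0, -1, 1, 0])
    uniques = np.array([1.0, 2.0])

    code_is_na = codes == na_sentinel
    if code_is_na.any():  # this branch corresponds to dropna=False
        na_value = np.nan  # stand-in for na_value_for_dtype(uniques.dtype, compat=False)
        uniques = np.append(uniques, [na_value])
        codes = np.where(code_is_na, len(uniques) - 1, codes)

    print(codes)    # [0 2 1 0] -> the NaN entry now has its own code
    print(uniques)  # [ 1.  2. nan]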
37 changes: 37 additions & 0 deletions pandas/core/frame.py
@@ -6134,6 +6134,41 @@ def update(
Type
Captive 210.0
Wild 185.0
We can also choose to include NA in group keys or not by setting
the `dropna` parameter; the default setting is `True`:
>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
>>> df.groupby(by=["b"]).sum()
a c
b
1.0 2 3
2.0 2 5
>>> df.groupby(by=["b"], dropna=False).sum()
a c
b
1.0 2 3
2.0 2 5
NaN 1 4
>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
>>> df.groupby(by="a").sum()
b c
a
a 13.0 13.0
b 12.3 123.0
>>> df.groupby(by="a", dropna=False).sum()
b c
a
a 13.0 13.0
b 12.3 123.0
NaN 12.3 33.0
"""
)
@Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
@@ -6147,6 +6182,7 @@ def groupby(
group_keys: bool = True,
squeeze: bool = False,
observed: bool = False,
dropna: bool = True,
) -> "DataFrameGroupBy":
from pandas.core.groupby.generic import DataFrameGroupBy

@@ -6164,6 +6200,7 @@ def groupby(
group_keys=group_keys,
squeeze=squeeze,
observed=observed,
dropna=dropna,
)

_shared_docs[
6 changes: 6 additions & 0 deletions pandas/core/generic.py
@@ -7475,6 +7475,12 @@ def clip(
If False: show all values for categorical groupers.
.. versionadded:: 0.23.0
dropna : bool, default True
If True, and if group keys contain NA values, NA values together
with the row/column will be dropped.
If False, NA values will also be treated as the key in groups.
.. versionadded:: 1.1.0
Returns
-------
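To make the ``dropna`` description above concrete: with the default ``dropna=True``, rows whose key is NA appear in no group at all, while ``dropna=False`` collects them into a group of their own. A small sketch, assuming pandas 1.1 or later where the keyword exists:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", None, "b", "a"], "val": [1, 2, 3, 4]})

    # Default: the row with a missing key is dropped from every group
    print([name for name, _ in df.groupby("key")])                # ['a', 'b']

    # dropna=False: the missing key becomes a group of its own
    print([name for name, _ in df.groupby("key", dropna=False)])  # ['a', 'b', nan]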
5 changes: 5 additions & 0 deletions pandas/core/groupby/groupby.py
@@ -474,6 +474,7 @@ def __init__(
squeeze: bool = False,
observed: bool = False,
mutated: bool = False,
dropna: bool = True,
):

self._selection = selection
@@ -496,6 +497,7 @@ def __init__(
self.squeeze = squeeze
self.observed = observed
self.mutated = mutated
self.dropna = dropna

if grouper is None:
from pandas.core.groupby.grouper import get_grouper
@@ -508,6 +510,7 @@ def __init__(
sort=sort,
observed=observed,
mutated=self.mutated,
dropna=self.dropna,
)

self.obj = obj
@@ -2649,6 +2652,7 @@ def get_groupby(
squeeze: bool = False,
observed: bool = False,
mutated: bool = False,
dropna: bool = True,
) -> GroupBy:

klass: Type[GroupBy]
@@ -2677,4 +2681,5 @@ def get_groupby(
squeeze=squeeze,
observed=observed,
mutated=mutated,
dropna=dropna,
)
14 changes: 12 additions & 2 deletions pandas/core/groupby/grouper.py
@@ -134,7 +134,9 @@ def __new__(cls, *args, **kwargs):
cls = TimeGrouper
return super().__new__(cls)

def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
def __init__(
self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True
):
self.key = key
self.level = level
self.freq = freq
@@ -146,6 +148,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
self.indexer = None
self.binner = None
self._grouper = None
self.dropna = dropna

@property
def ax(self):
@@ -171,6 +174,7 @@ def _get_grouper(self, obj, validate: bool = True):
level=self.level,
sort=self.sort,
validate=validate,
dropna=self.dropna,
)
return self.binner, self.grouper, self.obj

@@ -283,6 +287,7 @@ def __init__(
sort: bool = True,
observed: bool = False,
in_axis: bool = False,
dropna: bool = True,
):
self.name = name
self.level = level
@@ -293,6 +298,7 @@ def __init__(
self.obj = obj
self.observed = observed
self.in_axis = in_axis
self.dropna = dropna

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
@@ -446,7 +452,9 @@ def _make_codes(self) -> None:
codes = self.grouper.codes_info
uniques = self.grouper.result_index
else:
codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
codes, uniques = algorithms.factorize(
self.grouper, sort=self.sort, dropna=self.dropna
)
uniques = Index(uniques, name=self.name)
self._codes = codes
self._group_index = uniques
@@ -465,6 +473,7 @@ def get_grouper(
observed: bool = False,
mutated: bool = False,
validate: bool = True,
dropna: bool = True,
) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]":
"""
Create and return a BaseGrouper, which is an internal
@@ -655,6 +664,7 @@ def is_in_obj(gpr) -> bool:
sort=sort,
observed=observed,
in_axis=in_axis,
dropna=dropna,
)
if not isinstance(gpr, Grouping)
else gpr
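Because ``_make_codes`` now forwards ``dropna`` to ``factorize``, the NA key receives its own code and is appended to the group index after sorting, so it shows up last in the result. A sketch of the user-visible effect, assuming pandas 1.1 or later:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": [1.0, np.nan, 2.0, 1.0], "val": [10, 20, 30, 40]})

    print(df.groupby("key").size())
    # key
    # 1.0    2
    # 2.0    1
    # dtype: int64

    print(df.groupby("key", dropna=False).size())
    # key
    # 1.0    2
    # 2.0    1
    # NaN    1
    # dtype: int64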
30 changes: 30 additions & 0 deletions pandas/core/series.py
@@ -1603,6 +1603,34 @@ def _set_name(self, name, inplace=False) -> "Series":
Captive 210.0
Wild 185.0
Name: Max Speed, dtype: float64
We can also choose to include `NA` in group keys or not by setting
the `dropna` parameter; the default setting is `True`:
>>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
>>> ser.groupby(level=0).sum()
a 3
b 3
dtype: int64
>>> ser.groupby(level=0, dropna=False).sum()
a 3
b 3
NaN 3
dtype: int64
>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
>>> ser.groupby(["a", "b", "a", np.nan]).mean()
a 210.0
b 350.0
Name: Max Speed, dtype: float64
>>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
a 210.0
b 350.0
NaN 20.0
Name: Max Speed, dtype: float64
"""
)
@Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs)
@@ -1616,6 +1644,7 @@ def groupby(
group_keys: bool = True,
squeeze: bool = False,
observed: bool = False,
dropna: bool = True,
) -> "SeriesGroupBy":
from pandas.core.groupby.generic import SeriesGroupBy

@@ -1633,6 +1662,7 @@ def groupby(
group_keys=group_keys,
squeeze=squeeze,
observed=observed,
dropna=dropna,
)

# ----------------------------------------------------------------------