Skip to content

Commit

Permalink
ENH: Add dropna in groupby to allow NaN in keys (#30584)
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesdong1991 authored May 9, 2020
1 parent d09f20e commit 88d5f12
Show file tree
Hide file tree
Showing 10 changed files with 477 additions and 3 deletions.
27 changes: 27 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,33 @@ For example, the groups created by ``groupby()`` below are in the order they app
df3.groupby(['X']).get_group('B')
.. _groupby.dropna:

.. versionadded:: 1.1.0

GroupBy dropna
^^^^^^^^^^^^^^

By default ``NA`` values are excluded from group keys during the ``groupby`` operation. However,
if you want to include ``NA`` values in group keys, you can pass ``dropna=False``.

.. ipython:: python
df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
df_dropna
.. ipython:: python
# Default `dropna` is set to True, which will exclude NaNs in keys
df_dropna.groupby(by=["b"], dropna=True).sum()
# In order to allow NaN in keys, set `dropna` to False
df_dropna.groupby(by=["b"], dropna=False).sum()
The default setting of the ``dropna`` argument is ``True``, which means ``NA`` values are not included in group keys.


.. _groupby.attributes:

Expand Down
31 changes: 31 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,37 @@ For example:
ser["2014"]
ser.loc["May 2015"]
.. _whatsnew_110.groupby_key:

Allow NA in groupby key
^^^^^^^^^^^^^^^^^^^^^^^^

With :ref:`groupby <groupby.dropna>`, we've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to
allow ``NA`` values in group keys. Users can set ``dropna`` to ``False`` if they want to include
``NA`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards
compatibility (:issue:`3729`).

.. ipython:: python
df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"])
df_dropna
.. ipython:: python
# Default `dropna` is set to True, which will exclude NaNs in keys
df_dropna.groupby(by=["b"], dropna=True).sum()
# In order to allow NaN in keys, set `dropna` to False
df_dropna.groupby(by=["b"], dropna=False).sum()
The default setting of the ``dropna`` argument is ``True``, which means ``NA`` values are not included in group keys.

.. versionadded:: 1.1.0


.. _whatsnew_110.key_sorting:

Sorting with keys
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,11 @@ def _factorize_array(
),
)
def factorize(
values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None
values,
sort: bool = False,
na_sentinel: int = -1,
size_hint: Optional[int] = None,
dropna: bool = True,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
"""
Encode the object as an enumerated type or categorical variable.
Expand Down Expand Up @@ -649,6 +653,14 @@ def factorize(
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
)

code_is_na = codes == na_sentinel
if not dropna and code_is_na.any():
# na_value is set based on the dtype of uniques, and compat is set to False
# because we do not want na_value to be 0 for integers
na_value = na_value_for_dtype(uniques.dtype, compat=False)
uniques = np.append(uniques, [na_value])
codes = np.where(code_is_na, len(uniques) - 1, codes)

uniques = _reconstruct_data(uniques, dtype, original)

# return original tensor
Expand Down
37 changes: 37 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6134,6 +6134,41 @@ def update(
Type
Captive 210.0
Wild 185.0
We can also choose to include NA in group keys or not by setting
the `dropna` parameter; the default setting is `True`:
>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
>>> df.groupby(by=["b"]).sum()
a c
b
1.0 2 3
2.0 2 5
>>> df.groupby(by=["b"], dropna=False).sum()
a c
b
1.0 2 3
2.0 2 5
NaN 1 4
>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
>>> df.groupby(by="a").sum()
b c
a
a 13.0 13.0
b 12.3 123.0
>>> df.groupby(by="a", dropna=False).sum()
b c
a
a 13.0 13.0
b 12.3 123.0
NaN 12.3 33.0
"""
)
@Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
Expand All @@ -6147,6 +6182,7 @@ def groupby(
group_keys: bool = True,
squeeze: bool = False,
observed: bool = False,
dropna: bool = True,
) -> "DataFrameGroupBy":
from pandas.core.groupby.generic import DataFrameGroupBy

Expand All @@ -6164,6 +6200,7 @@ def groupby(
group_keys=group_keys,
squeeze=squeeze,
observed=observed,
dropna=dropna,
)

_shared_docs[
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7475,6 +7475,12 @@ def clip(
If False: show all values for categorical groupers.
.. versionadded:: 0.23.0
dropna : bool, default True
If True, and if group keys contain NA values, NA values together
with the row/column will be dropped.
If False, NA values will also be treated as the key in groups.
.. versionadded:: 1.1.0
Returns
-------
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,7 @@ def __init__(
squeeze: bool = False,
observed: bool = False,
mutated: bool = False,
dropna: bool = True,
):

self._selection = selection
Expand All @@ -496,6 +497,7 @@ def __init__(
self.squeeze = squeeze
self.observed = observed
self.mutated = mutated
self.dropna = dropna

if grouper is None:
from pandas.core.groupby.grouper import get_grouper
Expand All @@ -508,6 +510,7 @@ def __init__(
sort=sort,
observed=observed,
mutated=self.mutated,
dropna=self.dropna,
)

self.obj = obj
Expand Down Expand Up @@ -2649,6 +2652,7 @@ def get_groupby(
squeeze: bool = False,
observed: bool = False,
mutated: bool = False,
dropna: bool = True,
) -> GroupBy:

klass: Type[GroupBy]
Expand Down Expand Up @@ -2677,4 +2681,5 @@ def get_groupby(
squeeze=squeeze,
observed=observed,
mutated=mutated,
dropna=dropna,
)
14 changes: 12 additions & 2 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ def __new__(cls, *args, **kwargs):
cls = TimeGrouper
return super().__new__(cls)

def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
def __init__(
self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True
):
self.key = key
self.level = level
self.freq = freq
Expand All @@ -146,6 +148,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
self.indexer = None
self.binner = None
self._grouper = None
self.dropna = dropna

@property
def ax(self):
Expand All @@ -171,6 +174,7 @@ def _get_grouper(self, obj, validate: bool = True):
level=self.level,
sort=self.sort,
validate=validate,
dropna=self.dropna,
)
return self.binner, self.grouper, self.obj

Expand Down Expand Up @@ -283,6 +287,7 @@ def __init__(
sort: bool = True,
observed: bool = False,
in_axis: bool = False,
dropna: bool = True,
):
self.name = name
self.level = level
Expand All @@ -293,6 +298,7 @@ def __init__(
self.obj = obj
self.observed = observed
self.in_axis = in_axis
self.dropna = dropna

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
Expand Down Expand Up @@ -446,7 +452,9 @@ def _make_codes(self) -> None:
codes = self.grouper.codes_info
uniques = self.grouper.result_index
else:
codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
codes, uniques = algorithms.factorize(
self.grouper, sort=self.sort, dropna=self.dropna
)
uniques = Index(uniques, name=self.name)
self._codes = codes
self._group_index = uniques
Expand All @@ -465,6 +473,7 @@ def get_grouper(
observed: bool = False,
mutated: bool = False,
validate: bool = True,
dropna: bool = True,
) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]":
"""
Create and return a BaseGrouper, which is an internal
Expand Down Expand Up @@ -655,6 +664,7 @@ def is_in_obj(gpr) -> bool:
sort=sort,
observed=observed,
in_axis=in_axis,
dropna=dropna,
)
if not isinstance(gpr, Grouping)
else gpr
Expand Down
30 changes: 30 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1603,6 +1603,34 @@ def _set_name(self, name, inplace=False) -> "Series":
Captive 210.0
Wild 185.0
Name: Max Speed, dtype: float64
We can also choose to include `NA` in group keys or not by setting
the `dropna` parameter; the default setting is `True`:
>>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
>>> ser.groupby(level=0).sum()
a 3
b 3
dtype: int64
>>> ser.groupby(level=0, dropna=False).sum()
a 3
b 3
NaN 3
dtype: int64
>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
>>> ser.groupby(["a", "b", "a", np.nan]).mean()
a 210.0
b 350.0
Name: Max Speed, dtype: float64
>>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
a 210.0
b 350.0
NaN 20.0
Name: Max Speed, dtype: float64
"""
)
@Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs)
Expand All @@ -1616,6 +1644,7 @@ def groupby(
group_keys: bool = True,
squeeze: bool = False,
observed: bool = False,
dropna: bool = True,
) -> "SeriesGroupBy":
from pandas.core.groupby.generic import SeriesGroupBy

Expand All @@ -1633,6 +1662,7 @@ def groupby(
group_keys=group_keys,
squeeze=squeeze,
observed=observed,
dropna=dropna,
)

# ----------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 88d5f12

Please sign in to comment.