diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index c5f58425139ee..ddba3dc452e28 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -199,6 +199,33 @@ For example, the groups created by ``groupby()`` below are in the order they app df3.groupby(['X']).get_group('B') +.. _groupby.dropna: + +.. versionadded:: 1.1.0 + +GroupBy dropna +^^^^^^^^^^^^^^ + +By default ``NA`` values are excluded from group keys during the ``groupby`` operation. However, +in case you want to include ``NA`` values in group keys, you could pass ``dropna=False`` to achieve it. + +.. ipython:: python + + df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) + + df_dropna + +.. ipython:: python + + # Default `dropna` is set to True, which will exclude NaNs in keys + df_dropna.groupby(by=["b"], dropna=True).sum() + + # In order to allow NaN in keys, set `dropna` to False + df_dropna.groupby(by=["b"], dropna=False).sum() + +The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. + .. _groupby.attributes: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9c424f70b1ee0..55af0b218a2c7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -36,6 +36,37 @@ For example: ser["2014"] ser.loc["May 2015"] + +.. _whatsnew_110.groupby_key: + +Allow NA in groupby key +^^^^^^^^^^^^^^^^^^^^^^^^ + +With :ref:`groupby <groupby.dropna>`, we've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to +allow ``NA`` values in group keys. Users can define ``dropna`` to ``False`` if they want to include +``NA`` values in groupby keys. The default is set to ``True`` for ``dropna`` to keep backwards +compatibility (:issue:`3729`) + +.. 
ipython:: python + + df_list = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + df_dropna = pd.DataFrame(df_list, columns=["a", "b", "c"]) + + df_dropna + +.. ipython:: python + + # Default `dropna` is set to True, which will exclude NaNs in keys + df_dropna.groupby(by=["b"], dropna=True).sum() + + # In order to allow NaN in keys, set `dropna` to False + df_dropna.groupby(by=["b"], dropna=False).sum() + +The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. + +.. versionadded:: 1.1.0 + + .. _whatsnew_110.key_sorting: Sorting with keys diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c2115094918e5..aeb0c2d32c31c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -517,7 +517,11 @@ def _factorize_array( ), ) def factorize( - values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None + values, + sort: bool = False, + na_sentinel: int = -1, + size_hint: Optional[int] = None, + dropna: bool = True, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: """ Encode the object as an enumerated type or categorical variable. 
@@ -643,6 +647,14 @@ def factorize( uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) + code_is_na = codes == na_sentinel + if not dropna and code_is_na.any(): + # na_value is set based on the dtype of uniques, and compat set to False is + # because we do not want na_value to be 0 for integers + na_value = na_value_for_dtype(uniques.dtype, compat=False) + uniques = np.append(uniques, [na_value]) + codes = np.where(code_is_na, len(uniques) - 1, codes) + uniques = _reconstruct_data(uniques, dtype, original) # return original tenor diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4e86b3710a1bd..19caf42823fa3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6139,6 +6139,41 @@ def update( Type Captive 210.0 Wild 185.0 + +We can also choose to include NA in group keys or not by setting +`dropna` parameter, the default setting is `True`: + +>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] +>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + +>>> df.groupby(by=["b"]).sum() + a c +b +1.0 2 3 +2.0 2 5 + +>>> df.groupby(by=["b"], dropna=False).sum() + a c +b +1.0 2 3 +2.0 2 5 +NaN 1 4 + +>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] +>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) + +>>> df.groupby(by="a").sum() + b c +a +a 13.0 13.0 +b 12.3 123.0 + +>>> df.groupby(by="a", dropna=False).sum() + b c +a +a 13.0 13.0 +b 12.3 123.0 +NaN 12.3 33.0 """ ) @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) @@ -6152,6 +6187,7 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, + dropna: bool = True, ) -> "DataFrameGroupBy": from pandas.core.groupby.generic import DataFrameGroupBy @@ -6169,6 +6205,7 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, + dropna=dropna, ) _shared_docs[ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b550857252466..792e5a1228fe6 100644 --- a/pandas/core/generic.py +++ 
b/pandas/core/generic.py @@ -7475,6 +7475,12 @@ def clip( If False: show all values for categorical groupers. .. versionadded:: 0.23.0 + dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. + If False, NA values will also be treated as the key in groups. + + .. versionadded:: 1.1.0 Returns ------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 81c3fd7ad9e89..b92ff1c7c8ca4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -474,6 +474,7 @@ def __init__( squeeze: bool = False, observed: bool = False, mutated: bool = False, + dropna: bool = True, ): self._selection = selection @@ -496,6 +497,7 @@ def __init__( self.squeeze = squeeze self.observed = observed self.mutated = mutated + self.dropna = dropna if grouper is None: from pandas.core.groupby.grouper import get_grouper @@ -508,6 +510,7 @@ def __init__( sort=sort, observed=observed, mutated=self.mutated, + dropna=self.dropna, ) self.obj = obj @@ -2649,6 +2652,7 @@ def get_groupby( squeeze: bool = False, observed: bool = False, mutated: bool = False, + dropna: bool = True, ) -> GroupBy: klass: Type[GroupBy] @@ -2677,4 +2681,5 @@ def get_groupby( squeeze=squeeze, observed=observed, mutated=mutated, + dropna=dropna, ) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f84ca6c05f40f..948b4ba27f705 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -134,7 +134,9 @@ def __new__(cls, *args, **kwargs): cls = TimeGrouper return super().__new__(cls) - def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): + def __init__( + self, key=None, level=None, freq=None, axis=0, sort=False, dropna=True + ): self.key = key self.level = level self.freq = freq @@ -146,6 +148,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): self.indexer = None self.binner = None self._grouper = None + 
self.dropna = dropna @property def ax(self): @@ -171,6 +174,7 @@ def _get_grouper(self, obj, validate: bool = True): level=self.level, sort=self.sort, validate=validate, + dropna=self.dropna, ) return self.binner, self.grouper, self.obj @@ -283,6 +287,7 @@ def __init__( sort: bool = True, observed: bool = False, in_axis: bool = False, + dropna: bool = True, ): self.name = name self.level = level @@ -293,6 +298,7 @@ def __init__( self.obj = obj self.observed = observed self.in_axis = in_axis + self.dropna = dropna # right place for this? if isinstance(grouper, (Series, Index)) and name is None: @@ -446,7 +452,9 @@ def _make_codes(self) -> None: codes = self.grouper.codes_info uniques = self.grouper.result_index else: - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) + codes, uniques = algorithms.factorize( + self.grouper, sort=self.sort, dropna=self.dropna + ) uniques = Index(uniques, name=self.name) self._codes = codes self._group_index = uniques @@ -465,6 +473,7 @@ def get_grouper( observed: bool = False, mutated: bool = False, validate: bool = True, + dropna: bool = True, ) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ Create and return a BaseGrouper, which is an internal @@ -655,6 +664,7 @@ def is_in_obj(gpr) -> bool: sort=sort, observed=observed, in_axis=in_axis, + dropna=dropna, ) if not isinstance(gpr, Grouping) else gpr diff --git a/pandas/core/series.py b/pandas/core/series.py index eb409b432f89c..388395902c0f6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1609,6 +1609,34 @@ def _set_name(self, name, inplace=False) -> "Series": Captive 210.0 Wild 185.0 Name: Max Speed, dtype: float64 + +We can also choose to include `NA` in group keys or not by defining +`dropna` parameter, the default setting is `True`: + +>>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan]) +>>> ser.groupby(level=0).sum() +a 3 +b 3 +dtype: int64 + +>>> ser.groupby(level=0, dropna=False).sum() +a 3 +b 3 +NaN 3 +dtype: 
int64 + +>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot'] +>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed") +>>> ser.groupby(["a", "b", "a", np.nan]).mean() +a 210.0 +b 350.0 +Name: Max Speed, dtype: float64 + +>>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean() +a 210.0 +b 350.0 +NaN 20.0 +Name: Max Speed, dtype: float64 """ ) @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) @@ -1622,6 +1650,7 @@ def groupby( group_keys: bool = True, squeeze: bool = False, observed: bool = False, + dropna: bool = True, ) -> "SeriesGroupBy": from pandas.core.groupby.generic import SeriesGroupBy @@ -1639,6 +1668,7 @@ def groupby( group_keys=group_keys, squeeze=squeeze, observed=observed, + dropna=dropna, ) # ---------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py new file mode 100644 index 0000000000000..1a525d306e9f5 --- /dev/null +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -0,0 +1,244 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas.testing as tm + + +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"]], + { + "c": [13.0, 12.3, 123.23], + "d": [13.0, 233.0, 123.0], + "e": [13.0, 12.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_nan_in_one_group( + dropna, tuples, outputs, nulls_fixture +): + # GH 3729 this is to test that NA is in one group + df_list = [ + ["A", "B", 12, 12, 12], + ["A", nulls_fixture, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + ["A", "B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + grouped = df.groupby(["a", "b"], dropna=dropna).sum() + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + + # Since right now, by 
default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. + if not dropna: + mi = mi.set_levels(["A", "B", np.nan], level="b") + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]], + { + "c": [12.0, 13.3, 123.23, 1.0], + "d": [12.0, 234.0, 123.0, 1.0], + "e": [12.0, 13.0, 1.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( + dropna, tuples, outputs, nulls_fixture, nulls_fixture2 +): + # GH 3729 this is to test that NA in different groups with different representations + df_list = [ + ["A", "B", 12, 12, 12], + ["A", nulls_fixture, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + [nulls_fixture2, "B", 1, 1, 1.0], + ["A", nulls_fixture2, 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + grouped = df.groupby(["a", "b"], dropna=dropna).sum() + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. 
+ if not dropna: + mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]]) + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "dropna, idx, outputs", + [ + (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}), + ( + False, + ["A", "B", np.nan], + { + "b": [123.23, 13.0, 12.3], + "c": [123.0, 13.0, 233.0], + "d": [1.0, 13.0, 12.0], + }, + ), + ], +) +def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): + # GH 3729 + df_list = [ + ["B", 12, 12, 12], + [None, 12.3, 233.0, 12], + ["A", 123.23, 123, 1], + ["B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) + grouped = df.groupby("a", dropna=dropna).sum() + + expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + + tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "dropna, idx, expected", + [ + (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])), + ( + False, + ["a", "a", "b", np.nan], + pd.Series([3, 3, 3], index=["a", "b", np.nan]), + ), + ], +) +def test_groupby_dropna_series_level(dropna, idx, expected): + ser = pd.Series([1, 2, 3, 3], index=idx) + + result = ser.groupby(level=0, dropna=dropna).sum() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dropna, expected", + [ + (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")), + ( + False, + pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"), + ), + ], +) +def test_groupby_dropna_series_by(dropna, expected): + ser = pd.Series( + [390.0, 350.0, 30.0, 20.0], + index=["Falcon", "Falcon", "Parrot", "Parrot"], + name="Max Speed", + ) + + result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "dropna, tuples, outputs", + [ + ( + True, + [["A", "B"], ["B", "A"]], + {"c": [13.0, 123.23], "d": [12.0, 
123.0], "e": [1.0, 1.0]}, + ), + ( + False, + [["A", "B"], ["A", np.nan], ["B", "A"]], + { + "c": [13.0, 12.3, 123.23], + "d": [12.0, 233.0, 123.0], + "e": [1.0, 12.0, 1.0], + }, + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): + # GH 3729 + df_list = [ + ["A", "B", 12, 12, 12], + ["A", None, 12.3, 233.0, 12], + ["B", "A", 123.23, 123, 1], + ["A", "B", 1, 1, 1.0], + ] + df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"]) + agg_dict = {"c": sum, "d": max, "e": "min"} + grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict) + + mi = pd.MultiIndex.from_tuples(tuples, names=list("ab")) + + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. + if not dropna: + mi = mi.set_levels(["A", "B", np.nan], level="b") + expected = pd.DataFrame(outputs, index=mi) + + tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "datetime1, datetime2", + [ + (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")), + (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")), + (pd.Period("2020-01-01"), pd.Period("2020-02-01")), + ], +) +@pytest.mark.parametrize( + "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],)], +) +def test_groupby_dropna_datetime_like_data( + dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2 +): + # 3729 + df = pd.DataFrame( + { + "values": [1, 2, 3, 4, 5, 6], + "dt": [ + datetime1, + unique_nulls_fixture, + datetime2, + unique_nulls_fixture2, + datetime1, + datetime1, + ], + } + ) + + if dropna: + indexes = [datetime1, datetime2] + else: + indexes = [datetime1, datetime2, np.nan] + + grouped = df.groupby("dt", dropna=dropna).agg({"values": sum}) + expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) + + tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 
5f904241da485..d6228d031bfd5 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -325,6 +325,78 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): else: tm.assert_extension_array_equal(uniques, expected_uniques) + @pytest.mark.parametrize( + "data, dropna, expected_codes, expected_uniques", + [ + ( + ["a", None, "b", "a"], + True, + np.array([0, -1, 1, 0], dtype=np.int64), + np.array(["a", "b"], dtype=object), + ), + ( + ["a", np.nan, "b", "a"], + True, + np.array([0, -1, 1, 0], dtype=np.int64), + np.array(["a", "b"], dtype=object), + ), + ( + ["a", None, "b", "a"], + False, + np.array([0, 2, 1, 0], dtype=np.int64), + np.array(["a", "b", np.nan], dtype=object), + ), + ( + ["a", np.nan, "b", "a"], + False, + np.array([0, 2, 1, 0], dtype=np.int64), + np.array(["a", "b", np.nan], dtype=object), + ), + ], + ) + def test_object_factorize_dropna( + self, data, dropna, expected_codes, expected_uniques + ): + codes, uniques = algos.factorize(data, dropna=dropna) + + tm.assert_numpy_array_equal(uniques, expected_uniques) + tm.assert_numpy_array_equal(codes, expected_codes) + + @pytest.mark.parametrize( + "data, dropna, expected_codes, expected_uniques", + [ + ( + [1, None, 1, 2], + True, + np.array([0, -1, 0, 1], dtype=np.int64), + np.array([1, 2], dtype="O"), + ), + ( + [1, np.nan, 1, 2], + True, + np.array([0, -1, 0, 1], dtype=np.int64), + np.array([1, 2], dtype=np.float64), + ), + ( + [1, None, 1, 2], + False, + np.array([0, 2, 0, 1], dtype=np.int64), + np.array([1, 2, np.nan], dtype="O"), + ), + ( + [1, np.nan, 1, 2], + False, + np.array([0, 2, 0, 1], dtype=np.int64), + np.array([1, 2, np.nan], dtype=np.float64), + ), + ], + ) + def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques): + codes, uniques = algos.factorize(data, dropna=dropna) + + tm.assert_numpy_array_equal(uniques, expected_uniques) + tm.assert_numpy_array_equal(codes, expected_codes) + class TestUnique: def test_ints(self):