Add support for group_keys in groupby (#11659)
- [x] This PR adds support for `group_keys` in `groupby`. Starting with pandas 1.5.0, the issues around `group_keys` have been resolved:

pandas-dev/pandas#34998
pandas-dev/pandas#47185

- [x] This PR defaults `group_keys` to `False`, which is the same default pandas is moving to in a future version (see the sketch below this checklist).
- [x] Required to unblock `pandas-1.5.0` upgrade in cudf: #11617
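
For illustration, the behavioral difference controlled by `group_keys` looks roughly like the following (a minimal sketch, not part of this diff; the example frame mirrors the one used in the new docstring and test):

```python
import cudf

df = cudf.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})

# group_keys=False (the new cudf default): apply() keeps the original row index.
print(df.groupby("A", group_keys=False)[["B", "C"]].apply(lambda x: x / x.sum()))

# group_keys=True: the group labels are added to the result's index.
print(df.groupby("A", group_keys=True)[["B", "C"]].apply(lambda x: x / x.sum()))
```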

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: #11659
galipremsagar authored Sep 8, 2022
1 parent 0684ee1 commit d3e8f6d
Showing 6 changed files with 101 additions and 17 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/core/_compat.py
@@ -12,3 +12,4 @@
PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3")
PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4")
PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0")
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -3836,7 +3836,7 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
68 changes: 60 additions & 8 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -69,6 +69,12 @@ def _quantile_75(x):
``False`` for better performance. Note this does not influence
the order of observations within each group. Groupby preserves
the order of rows within each group.
group_keys : bool, optional
When calling apply and the ``by`` argument produces a like-indexed
result, add group keys to index to identify pieces. By default group
keys are not included when the result's index (and column) labels match
the inputs, and are included otherwise. This argument has no effect if
the result produced is not like-indexed with respect to the input.
{ret}
Examples
--------
@@ -135,6 +141,32 @@ def _quantile_75(x):
Type
Wild 185.0
Captive 210.0
>>> df = cudf.DataFrame({{'A': 'a a b'.split(),
... 'B': [1,2,3],
... 'C': [4,6,5]}})
>>> g1 = df.groupby('A', group_keys=False)
>>> g2 = df.groupby('A', group_keys=True)
Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
differ in their ``group_keys`` argument. Calling ``apply`` in various ways,
we can get different grouping results:
>>> g1[['B', 'C']].apply(lambda x: x / x.sum())
B C
0 0.333333 0.4
1 0.666667 0.6
2 1.000000 1.0
In the above, the groups are not part of the index. We can have them included
by using ``g2`` where ``group_keys=True``:
>>> g2[['B', 'C']].apply(lambda x: x / x.sum())
B C
A
a 0 0.333333 0.4
1 0.666667 0.6
b 2 1.000000 1.0
"""
)

@@ -174,7 +206,14 @@ class GroupBy(Serializable, Reducible, Scannable):
_MAX_GROUPS_BEFORE_WARN = 100

def __init__(
self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
self,
obj,
by=None,
level=None,
sort=False,
as_index=True,
dropna=True,
group_keys=True,
):
"""
Group a DataFrame or Series by a set of columns.
Expand Down Expand Up @@ -210,6 +249,7 @@ def __init__(
self._level = level
self._sort = sort
self._dropna = dropna
self._group_keys = group_keys

if isinstance(by, _Grouping):
by._obj = self.obj
@@ -544,7 +584,9 @@ def _grouped(self):
grouped_key_cols, grouped_value_cols, offsets = self._groupby.groups(
[*self.obj._index._columns, *self.obj._columns]
)
grouped_keys = cudf.core.index._index_from_columns(grouped_key_cols)
grouped_keys = cudf.core.index._index_from_columns(
grouped_key_cols, name=self.grouping.keys.name
)
grouped_values = self.obj._from_columns_like_self(
grouped_value_cols,
column_names=self.obj._column_names,
@@ -707,7 +749,7 @@ def mult(df):
"""
if not callable(function):
raise TypeError(f"type {type(function)} is not callable")
group_names, offsets, _, grouped_values = self._grouped()
group_names, offsets, group_keys, grouped_values = self._grouped()

ngroups = len(offsets) - 1
if ngroups > self._MAX_GROUPS_BEFORE_WARN:
@@ -726,14 +768,21 @@ def mult(df):
if cudf.api.types.is_scalar(chunk_results[0]):
result = cudf.Series(chunk_results, index=group_names)
result.index.names = self.grouping.names
elif isinstance(chunk_results[0], cudf.Series):
if isinstance(self.obj, cudf.DataFrame):
else:
if isinstance(chunk_results[0], cudf.Series) and isinstance(
self.obj, cudf.DataFrame
):
result = cudf.concat(chunk_results, axis=1).T
result.index.names = self.grouping.names
else:
result = cudf.concat(chunk_results)
else:
result = cudf.concat(chunk_results)
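            # When ``group_keys=True``, label the result with a MultiIndex that
            # pairs each group key with the original index of the grouped rows.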
if self._group_keys:
result.index = cudf.MultiIndex._from_data(
{
group_keys.name: group_keys._column,
None: grouped_values.index._column,
}
)

if self._sort:
result = result.sort_index()
@@ -1582,7 +1631,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

def __getitem__(self, key):
return self.obj[key].groupby(
by=self.grouping.keys, dropna=self._dropna, sort=self._sort
by=self.grouping.keys,
dropna=self._dropna,
sort=self._sort,
group_keys=self._group_keys,
)


10 changes: 4 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
@@ -3535,19 +3535,14 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
):
if axis not in (0, "index"):
raise NotImplementedError("axis parameter is not yet implemented")

if group_keys is not True:
raise NotImplementedError(
"The group_keys keyword is not yet implemented"
)

if squeeze is not False:
raise NotImplementedError(
"squeeze parameter is not yet implemented"
@@ -3562,6 +3557,8 @@ def groupby(
raise TypeError(
"groupby() requires either by or level to be specified."
)
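        # Treat ``group_keys=None`` the same as the default of ``False``.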
if group_keys is None:
group_keys = False

return (
self.__class__._resampler(self, by=by)
@@ -3573,6 +3570,7 @@
as_index=as_index,
dropna=dropna,
sort=sort,
group_keys=group_keys,
)
)

2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
@@ -3075,7 +3075,7 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
35 changes: 34 additions & 1 deletion python/cudf/cudf/tests/test_groupby.py
@@ -14,7 +14,12 @@

import cudf
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_130, PANDAS_LT_140
from cudf.core._compat import (
PANDAS_GE_110,
PANDAS_GE_130,
PANDAS_GE_150,
PANDAS_LT_140,
)
from cudf.testing._utils import (
DATETIME_TYPES,
SIGNED_TYPES,
@@ -2677,3 +2682,31 @@ def test_groupby_pct_change_empty_columns():
expected = pdf.groupby("id").pct_change()

assert_eq(expected, actual)


@pytest.mark.parametrize(
"group_keys",
[
None,
pytest.param(
True,
marks=pytest.mark.xfail(
condition=not PANDAS_GE_150,
reason="https://github.com/pandas-dev/pandas/pull/34998",
),
),
False,
],
)
def test_groupby_group_keys(group_keys):
gdf = cudf.DataFrame(
{"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}
)
pdf = gdf.to_pandas()

g_group = gdf.groupby("A", group_keys=group_keys)
p_group = pdf.groupby("A", group_keys=group_keys)

actual = g_group[["B", "C"]].apply(lambda x: x / x.sum())
expected = p_group[["B", "C"]].apply(lambda x: x / x.sum())
assert_eq(actual, expected)
