Add support for group_keys in groupby (#11659)
- [x] This PR adds support for `group_keys` in `groupby`. Starting with pandas 1.5.0, the issues around `group_keys` have been resolved:

pandas-dev/pandas#34998
pandas-dev/pandas#47185

- [x] This PR defaults `group_keys` to `False`, which is the same default pandas is moving to in a future version (see the sketch below this checklist).
- [x] Required to unblock `pandas-1.5.0` upgrade in cudf: #11617
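
For illustration, the behavioral difference controlled by `group_keys` looks roughly like the following (a minimal sketch, not part of this diff; the example frame mirrors the one used in the new docstring and test):

```python
import cudf

df = cudf.DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})

# group_keys=False (the new cudf default): apply() keeps the original row index.
print(df.groupby("A", group_keys=False)[["B", "C"]].apply(lambda x: x / x.sum()))

# group_keys=True: the group labels are added to the result's index.
print(df.groupby("A", group_keys=True)[["B", "C"]].apply(lambda x: x / x.sum()))
```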

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: #11659
galipremsagar authored Sep 8, 2022
1 parent 0684ee1 commit d3e8f6d
Showing 6 changed files with 101 additions and 17 deletions.
1 change: 1 addition & 0 deletions python/cudf/cudf/core/_compat.py
@@ -12,3 +12,4 @@
PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3")
PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4")
PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0")
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
@@ -3836,7 +3836,7 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
68 changes: 60 additions & 8 deletions python/cudf/cudf/core/groupby/groupby.py
@@ -69,6 +69,12 @@ def _quantile_75(x):
``False`` for better performance. Note this does not influence
the order of observations within each group. Groupby preserves
the order of rows within each group.
group_keys : bool, optional
When calling apply and the ``by`` argument produces a like-indexed
result, add group keys to index to identify pieces. By default group
keys are not included when the result's index (and column) labels match
the inputs, and are included otherwise. This argument has no effect if
the result produced is not like-indexed with respect to the input.
{ret}
Examples
--------
@@ -135,6 +141,32 @@ def _quantile_75(x):
Type
Wild 185.0
Captive 210.0
>>> df = cudf.DataFrame({{'A': 'a a b'.split(),
... 'B': [1,2,3],
... 'C': [4,6,5]}})
>>> g1 = df.groupby('A', group_keys=False)
>>> g2 = df.groupby('A', group_keys=True)
Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
differ in their ``group_keys`` argument. Calling ``apply`` in various ways,
we can get different grouping results:
>>> g1[['B', 'C']].apply(lambda x: x / x.sum())
B C
0 0.333333 0.4
1 0.666667 0.6
2 1.000000 1.0
In the above, the groups are not part of the index. We can have them included
by using ``g2`` where ``group_keys=True``:
>>> g2[['B', 'C']].apply(lambda x: x / x.sum())
B C
A
a 0 0.333333 0.4
1 0.666667 0.6
b 2 1.000000 1.0
"""
)

@@ -174,7 +206,14 @@ class GroupBy(Serializable, Reducible, Scannable):
_MAX_GROUPS_BEFORE_WARN = 100

def __init__(
self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
self,
obj,
by=None,
level=None,
sort=False,
as_index=True,
dropna=True,
group_keys=True,
):
"""
Group a DataFrame or Series by a set of columns.
Expand Down Expand Up @@ -210,6 +249,7 @@ def __init__(
self._level = level
self._sort = sort
self._dropna = dropna
self._group_keys = group_keys

if isinstance(by, _Grouping):
by._obj = self.obj
@@ -544,7 +584,9 @@ def _grouped(self):
grouped_key_cols, grouped_value_cols, offsets = self._groupby.groups(
[*self.obj._index._columns, *self.obj._columns]
)
grouped_keys = cudf.core.index._index_from_columns(grouped_key_cols)
grouped_keys = cudf.core.index._index_from_columns(
grouped_key_cols, name=self.grouping.keys.name
)
grouped_values = self.obj._from_columns_like_self(
grouped_value_cols,
column_names=self.obj._column_names,
@@ -707,7 +749,7 @@ def mult(df):
"""
if not callable(function):
raise TypeError(f"type {type(function)} is not callable")
group_names, offsets, _, grouped_values = self._grouped()
group_names, offsets, group_keys, grouped_values = self._grouped()

ngroups = len(offsets) - 1
if ngroups > self._MAX_GROUPS_BEFORE_WARN:
@@ -726,14 +768,21 @@ def mult(df):
if cudf.api.types.is_scalar(chunk_results[0]):
result = cudf.Series(chunk_results, index=group_names)
result.index.names = self.grouping.names
elif isinstance(chunk_results[0], cudf.Series):
if isinstance(self.obj, cudf.DataFrame):
else:
if isinstance(chunk_results[0], cudf.Series) and isinstance(
self.obj, cudf.DataFrame
):
result = cudf.concat(chunk_results, axis=1).T
result.index.names = self.grouping.names
else:
result = cudf.concat(chunk_results)
else:
result = cudf.concat(chunk_results)
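            # When ``group_keys=True``, label the result with a MultiIndex that
            # pairs each group key with the original index of the grouped rows.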
if self._group_keys:
result.index = cudf.MultiIndex._from_data(
{
group_keys.name: group_keys._column,
None: grouped_values.index._column,
}
)

if self._sort:
result = result.sort_index()
@@ -1582,7 +1631,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

def __getitem__(self, key):
return self.obj[key].groupby(
by=self.grouping.keys, dropna=self._dropna, sort=self._sort
by=self.grouping.keys,
dropna=self._dropna,
sort=self._sort,
group_keys=self._group_keys,
)


10 changes: 4 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
@@ -3535,19 +3535,14 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
):
if axis not in (0, "index"):
raise NotImplementedError("axis parameter is not yet implemented")

if group_keys is not True:
raise NotImplementedError(
"The group_keys keyword is not yet implemented"
)

if squeeze is not False:
raise NotImplementedError(
"squeeze parameter is not yet implemented"
@@ -3562,6 +3557,8 @@ def groupby(
raise TypeError(
"groupby() requires either by or level to be specified."
)
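        # Treat ``group_keys=None`` the same as the default of ``False``.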
if group_keys is None:
group_keys = False

return (
self.__class__._resampler(self, by=by)
@@ -3573,6 +3570,7 @@
as_index=as_index,
dropna=dropna,
sort=sort,
group_keys=group_keys,
)
)

2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
@@ -3075,7 +3075,7 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
35 changes: 34 additions & 1 deletion python/cudf/cudf/tests/test_groupby.py
@@ -14,7 +14,12 @@

import cudf
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_130, PANDAS_LT_140
from cudf.core._compat import (
PANDAS_GE_110,
PANDAS_GE_130,
PANDAS_GE_150,
PANDAS_LT_140,
)
from cudf.testing._utils import (
DATETIME_TYPES,
SIGNED_TYPES,
@@ -2677,3 +2682,31 @@ def test_groupby_pct_change_empty_columns():
expected = pdf.groupby("id").pct_change()

assert_eq(expected, actual)


@pytest.mark.parametrize(
"group_keys",
[
None,
pytest.param(
True,
marks=pytest.mark.xfail(
condition=not PANDAS_GE_150,
reason="https://github.com/pandas-dev/pandas/pull/34998",
),
),
False,
],
)
def test_groupby_group_keys(group_keys):
gdf = cudf.DataFrame(
{"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}
)
pdf = gdf.to_pandas()

g_group = gdf.groupby("A", group_keys=group_keys)
p_group = pdf.groupby("A", group_keys=group_keys)

actual = g_group[["B", "C"]].apply(lambda x: x / x.sum())
expected = p_group[["B", "C"]].apply(lambda x: x / x.sum())
assert_eq(actual, expected)
