Make sure all dask-cudf supported aggs are handled in _tree_node_agg #9487

Merged: 6 commits, Nov 9, 2021

Changes from all commits
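In effect, the PR replaces several duplicated hard-coded aggregation whitelists with a single `SUPPORTED_AGGS` constant and makes the tree-reduction combine step (`_tree_node_agg`) accept every aggregation in it. A minimal sketch of the user-facing behavior this enables (hypothetical data; assumes a CUDA environment with cudf and dask_cudf installed):

```python
import cudf
import dask_cudf

df = cudf.DataFrame({"x": [0, 1] * 500, "y": list(range(1000))})
ddf = dask_cudf.from_cudf(df, npartitions=8)

# Before this change, the combine step only recognized
# count/sum/min/max/collect, so aggregations such as "first" and
# "last" could fail with "Unexpected aggregation" when partial
# per-partition results were merged.
out = ddf.groupby("x").agg({"y": ["first", "last", "mean"]}).compute()
print(out)
```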
115 changes: 73 additions & 42 deletions python/dask_cudf/dask_cudf/groupby.py
@@ -19,6 +19,19 @@

import cudf

+SUPPORTED_AGGS = (
+    "count",
+    "mean",
+    "std",
+    "var",
+    "sum",
+    "min",
+    "max",
+    "collect",
+    "first",
+    "last",
+)


class CudfDataFrameGroupBy(DataFrameGroupBy):
    def __init__(self, *args, **kwargs):
@@ -60,23 +73,24 @@ def mean(self, split_every=None, split_out=1):
            as_index=self.as_index,
        )

+    def collect(self, split_every=None, split_out=1):
+        return groupby_agg(
+            self.obj,
+            self.index,
+            {c: "collect" for c in self.obj.columns if c not in self.index},
+            split_every=split_every,
+            split_out=split_out,
+            dropna=self.dropna,
+            sep=self.sep,
+            sort=self.sort,
+            as_index=self.as_index,
+        )

    def aggregate(self, arg, split_every=None, split_out=1):
        if arg == "size":
            return self.size()
        arg = _redirect_aggs(arg)

-        _supported = {
-            "count",
-            "mean",
-            "std",
-            "var",
-            "sum",
-            "min",
-            "max",
-            "collect",
-            "first",
-            "last",
-        }
        if (
            isinstance(self.obj, DaskDataFrame)
            and (
@@ -86,7 +100,7 @@ def aggregate(self, arg, split_every=None, split_out=1):
                    and all(isinstance(x, str) for x in self.index)
                )
            )
-            and _is_supported(arg, _supported)
+            and _is_supported(arg, SUPPORTED_AGGS)
        ):
            if isinstance(self._meta.grouping.keys, cudf.MultiIndex):
                keys = self._meta.grouping.keys.names
@@ -129,33 +143,62 @@ def mean(self, split_every=None, split_out=1):
            as_index=self.as_index,
        )[self._slice]

+    def std(self, split_every=None, split_out=1):
+        return groupby_agg(
+            self.obj,
+            self.index,
+            {self._slice: "std"},
+            split_every=split_every,
+            split_out=split_out,
+            dropna=self.dropna,
+            sep=self.sep,
+            sort=self.sort,
+            as_index=self.as_index,
+        )[self._slice]

+    def var(self, split_every=None, split_out=1):
Inline review comment (PR author):
I added in overrides for var and std because it seems like trying to use upstream Dask's implementations here fails:

python/dask_cudf/dask_cudf/tests/test_groupby.py ............F
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> traceback >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

aggregation = 'std'

    @pytest.mark.parametrize("aggregation", SUPPORTED_AGGS)
    def test_groupby_basic_series(aggregation):
        pdf = pd.DataFrame(
            {
                "x": np.random.randint(0, 5, size=10000),
                "y": np.random.normal(size=10000),
            }
        )
    
        gdf = cudf.DataFrame.from_pandas(pdf)
    
        ddf = dask_cudf.from_cudf(gdf, npartitions=5)
    
        a = getattr(gdf.groupby("x").x, aggregation)()
>       b = getattr(ddf.groupby("x").x, aggregation)().compute()

python/dask_cudf/dask_cudf/tests/test_groupby.py:61: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../dask/dask/dataframe/groupby.py:1445: in std
    v = self.var(ddof, split_every=split_every, split_out=split_out)
../dask/dask/dataframe/groupby.py:1439: in var
    result = result[self._slice]
../dask/dask/dataframe/core.py:4062: in __getitem__
    meta = self._meta[_extract_meta(key)]
../compose/etc/conda/cuda_11.2/envs/rapids/lib/python3.8/contextlib.py:75: in inner
    return func(*args, **kwds)
python/cudf/cudf/core/dataframe.py:1007: in __getitem__
    return self._get_columns_by_label(arg, downcast=True)
python/cudf/cudf/core/dataframe.py:1863: in _get_columns_by_label
    new_data = super()._get_columns_by_label(labels, downcast)
python/cudf/cudf/core/frame.py:507: in _get_columns_by_label
    return self._data.select_by_label(labels)
python/cudf/cudf/core/column_accessor.py:344: in select_by_label
    return self._select_by_label_grouped(key)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = ColumnAccessor(multiindex=False, level_names=[None])
y: float64, key = 'x'

    def _select_by_label_grouped(self, key: Any) -> ColumnAccessor:
>       result = self._grouped_data[key]
E       KeyError: 'x'

python/cudf/cudf/core/column_accessor.py:406: KeyError

Instead of adding these functions individually, is it possible / would it make sense to add them programmatically based on SUPPORTED_AGGS?
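For illustration, such programmatic generation might look roughly like the sketch below (not code from this PR; `_make_series_agg` is an invented name, and a real version would decide which aggregations need bespoke handling):

```python
def _make_series_agg(agg):
    # Build a CudfSeriesGroupBy method that funnels one single-column
    # aggregation through groupby_agg, mirroring the hand-written overrides.
    def _agg_method(self, split_every=None, split_out=1):
        return groupby_agg(
            self.obj,
            self.index,
            {self._slice: agg},
            split_every=split_every,
            split_out=split_out,
            dropna=self.dropna,
            sep=self.sep,
            sort=self.sort,
            as_index=self.as_index,
        )[self._slice]

    _agg_method.__name__ = agg
    return _agg_method


# Attach one method per supported aggregation to the class
# (a real version might skip names that need custom handling).
for _agg in SUPPORTED_AGGS:
    setattr(CudfSeriesGroupBy, _agg, _make_series_agg(_agg))
```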

+        return groupby_agg(
+            self.obj,
+            self.index,
+            {self._slice: "var"},
+            split_every=split_every,
+            split_out=split_out,
+            dropna=self.dropna,
+            sep=self.sep,
+            sort=self.sort,
+            as_index=self.as_index,
+        )[self._slice]

+    def collect(self, split_every=None, split_out=1):
+        return groupby_agg(
+            self.obj,
+            self.index,
+            {self._slice: "collect"},
+            split_every=split_every,
+            split_out=split_out,
+            dropna=self.dropna,
+            sep=self.sep,
+            sort=self.sort,
+            as_index=self.as_index,
+        )[self._slice]

    def aggregate(self, arg, split_every=None, split_out=1):
        if arg == "size":
            return self.size()
        arg = _redirect_aggs(arg)

-        _supported = {
-            "count",
-            "mean",
-            "std",
-            "var",
-            "sum",
-            "min",
-            "max",
-            "collect",
-            "first",
-            "last",
-        }
+        if not isinstance(arg, dict):
+            arg = {self._slice: arg}

        if (
            isinstance(self.obj, DaskDataFrame)
            and isinstance(self.index, (str, list))
-            and _is_supported({self._slice: arg}, _supported)
+            and _is_supported(arg, SUPPORTED_AGGS)
        ):
            return groupby_agg(
                self.obj,
                self.index,
-                {self._slice: arg},
+                arg,
                split_every=split_every,
                split_out=split_out,
                dropna=self.dropna,
@@ -201,21 +244,9 @@ def groupby_agg(
"""
# Assert that aggregations are supported
aggs = _redirect_aggs(aggs_in)
-    _supported = {
-        "count",
-        "mean",
-        "std",
-        "var",
-        "sum",
-        "min",
-        "max",
-        "collect",
-        "first",
-        "last",
-    }
-    if not _is_supported(aggs, _supported):
+    if not _is_supported(aggs, SUPPORTED_AGGS):
        raise ValueError(
-            f"Supported aggs include {_supported} for groupby_agg API. "
+            f"Supported aggs include {SUPPORTED_AGGS} for groupby_agg API. "
            f"Aggregations must be specified with dict or list syntax."
        )
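For reference, `_is_supported` (imported by the tests below) checks whether every aggregation named in a dict- or list-style spec belongs to the given set. A simplified sketch of what such a check does (assumed normalization; not the actual implementation):

```python
def _is_supported_sketch(arg, supported):
    # Flatten a list spec, or a dict spec mapping column -> agg | [aggs],
    # into a set of aggregation names.
    if isinstance(arg, list):
        aggs = set(arg)
    elif isinstance(arg, dict):
        aggs = set()
        for value in arg.values():
            aggs |= set(value) if isinstance(value, list) else {value}
    else:
        return False  # bare strings must use dict/list syntax here
    return bool(aggs) and aggs.issubset(set(supported))
```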

@@ -475,7 +506,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
        agg = col.split(sep)[-1]
        if agg in ("count", "sum"):
            agg_dict[col] = ["sum"]
elif agg in ("min", "max", "collect"):
elif agg in SUPPORTED_AGGS:
            agg_dict[col] = [agg]
        else:
            raise ValueError(f"Unexpected aggregation: {agg}")
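The combine step above works on partial-aggregation columns named like `f"y{sep}count"`: partial counts and sums merge by summation, while every other supported aggregation merges with itself. A rough standalone illustration of that idea (simplified; not the actual dask_cudf code path):

```python
import cudf

sep = "__"

# Two per-partition partial results, shaped like what _tree_node_agg sees.
part1 = cudf.DataFrame(
    {"x": [0, 1], f"y{sep}count": [3, 2], f"y{sep}min": [0.1, 0.5]}
)
part2 = cudf.DataFrame(
    {"x": [0, 1], f"y{sep}count": [1, 4], f"y{sep}min": [0.2, 0.3]}
)

# Partial counts combine by summing; partial mins combine by taking min.
combined = cudf.concat([part1, part2]).groupby("x").agg(
    {f"y{sep}count": "sum", f"y{sep}min": "min"}
)
print(combined)
```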
46 changes: 13 additions & 33 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -11,11 +11,12 @@
from cudf.core._compat import PANDAS_GE_120

import dask_cudf
-from dask_cudf.groupby import _is_supported
+from dask_cudf.groupby import SUPPORTED_AGGS, _is_supported


@pytest.mark.parametrize("aggregation", ["sum", "mean", "count", "min", "max"])
def test_groupby_basic_aggs(aggregation):
@pytest.mark.parametrize("aggregation", SUPPORTED_AGGS)
@pytest.mark.parametrize("series", [False, True])
def test_groupby_basic(series, aggregation):
    pdf = pd.DataFrame(
        {
            "x": np.random.randint(0, 5, size=10000),
@@ -24,19 +25,23 @@ def test_groupby_basic_aggs(aggregation):
    )

    gdf = cudf.DataFrame.from_pandas(pdf)
+    gdf_grouped = gdf.groupby("x")
+    ddf_grouped = dask_cudf.from_cudf(gdf, npartitions=5).groupby("x")

-    ddf = dask_cudf.from_cudf(gdf, npartitions=5)
+    if series:
+        gdf_grouped = gdf_grouped.x
+        ddf_grouped = ddf_grouped.x

-    a = getattr(gdf.groupby("x"), aggregation)()
-    b = getattr(ddf.groupby("x"), aggregation)().compute()
+    a = getattr(gdf_grouped, aggregation)()
+    b = getattr(ddf_grouped, aggregation)().compute()

if aggregation == "count":
dd.assert_eq(a, b, check_dtype=False)
else:
dd.assert_eq(a, b)

a = gdf.groupby("x").agg({"x": aggregation})
b = ddf.groupby("x").agg({"x": aggregation}).compute()
a = gdf_grouped.agg({"x": aggregation})
b = ddf_grouped.agg({"x": aggregation}).compute()

if aggregation == "count":
dd.assert_eq(a, b, check_dtype=False)
@@ -117,31 +122,6 @@ def test_groupby_std(func):
    dd.assert_eq(a, b)


-@pytest.mark.parametrize(
-    "func",
-    [
-        lambda df: df.groupby("x").agg({"y": "collect"}),
-        lambda df: df.groupby("x").y.agg("collect"),
-    ],
-)
-def test_groupby_collect(func):
-    pdf = pd.DataFrame(
-        {
-            "x": np.random.randint(0, 5, size=10000),
-            "y": np.random.normal(size=10000),
-        }
-    )
-
-    gdf = cudf.DataFrame.from_pandas(pdf)
-
-    ddf = dask_cudf.from_cudf(gdf, npartitions=5)
-
-    a = func(gdf).to_pandas()
-    b = func(ddf).compute().to_pandas()
-
-    dd.assert_eq(a, b)


# reason: getattr in cudf
@pytest.mark.parametrize(
    "func",