You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
cuDF recently implemented groupby `collect_set` and exposed it as `groupby.unique`. I'd like to be able to use this with Dask, as I can on the CPU. We may want to add support in upstream Dask, or special-case it in Dask-cuDF. It looks like we fail doing a type check on a `ListDtype`. Complement to #7812.
import cudf
import dask_cudf
import pandas as pd
import dask.dataframe as dd
# Reproduce groupby("a").b.unique(): works through upstream dask on pandas,
# but raises a TypeError through dask_cudf on the GPU.
df = pd.DataFrame(
    {"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}
)
ddf = dd.from_pandas(df, npartitions=2)

# Mirror the same frame on the GPU, split into the same number of partitions.
gdf = cudf.from_pandas(df)
gddf = dask_cudf.from_cudf(gdf, npartitions=2)

print(ddf.groupby("a").b.unique().compute()) # works as expected
print(gddf.groupby("a").b.unique().compute())
a
0 [10]
1 [7, 8, 9]
Name: b, dtype: object
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-21-f8b1b8dfa717> in <module>
     16
     17 print(ddf.groupby("a").b.unique().compute()) # works as expected
---> 18 print(gddf.groupby("a").b.unique().compute())
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
281 dask.base.compute
282"""
--> 283 (result,) = compute(self, traverse=False, **kwargs)
284return result
285
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/base.py in compute(*args, **kwargs)
563 postcomputes.append(x.__dask_postcompute__())
564
--> 565 results = schedule(dsk, keys, **kwargs)
566return repack([f(r, *a) for r, (f, a) inzip(results, postcomputes)])
567
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/local.py in get_sync(dsk, keys, **kwargs)
    526 """
    527 kwargs.pop("num_workers", None)  # if num_workers present, remove it
--> 528 return get_async(apply_sync, 1, dsk, keys, **kwargs)
    529
    530
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    470 # Seed initial tasks into the thread pool
    471 while state["ready"] and len(state["running"]) < num_workers:
--> 472     fire_task()
    473
    474 # Main loop, wait on tasks to finish, insert new ones
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/local.py in fire_task()
465 pack_exception,
466 ),
--> 467 callback=queue.put,
468 )
469
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/local.py in apply_sync(func, args, kwds, callback)
515defapply_sync(func, args=(), kwds={}, callback=None):
516""" A naive synchronous version of apply_async """
--> 517 res = func(*args, **kwds)
518if callback isnotNone:
519 callback(res)
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
225 failed =False226exceptBaseExceptionas e:
--> 227 result = pack_exception(e, dumps)
228 failed =True229return key, result, failed
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
220try:
221 task, data = loads(task_info)
--> 222 result = _execute_task(task, data)
223id= get_id()
224 result = dumps((result, id))
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/core.py in _execute_task(arg, cache, dsk)
119# temporaries by their reference count and can execute certain120# operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122elifnot ishashable(arg):
123return arg
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/utils.py in apply(func, args, kwargs)
33defapply(func, args, kwargs=None):
34if kwargs:
---> 35 return func(*args, **kwargs)
36else:
37return func(*args)
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/dataframe/groupby.py in _apply_chunk(df, dropna, observed, *index, **kwargs)
301ifisinstance(columns, (tuple, list, set, pd.Index)):
302 columns =list(columns)
--> 303 return func(g[columns], **kwargs)
304305
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/dask/utils.py in __call__(self, obj, *args, **kwargs)
899900def__call__(self, obj, *args, **kwargs):
--> 901 return getattr(obj, self.method)(*args, **kwargs)
902903def__reduce__(self):
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/cudf/core/groupby/groupby.py in _agg_func_name_with_args(self, func_name, *args, **kwargs)
279280 func.__name__= func_name
--> 281 return self.agg(func)
282283def_normalize_aggs(self, aggs):
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/cudf/core/groupby/groupby.py in agg(self, func)
786iflen(result._data):
787if result.shape[1] ==1andnot pd.api.types.is_list_like(func):
--> 788 return result.iloc[:, 0]
789790# drop the first level if we have a multiindex
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/cudf/core/indexing.py in __getitem__(self, arg)
214ifnotisinstance(arg, tuple):
215 arg = (arg, slice(None))
--> 216 return self._getitem_tuple_arg(arg)
217218def__setitem__(self, key, value):
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/contextlib.py in inner(*args, **kwds)
72definner(*args, **kwds):
73withself._recreate_cm():
---> 74 return func(*args, **kwds)
75return inner
76
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/cudf/core/indexing.py in _getitem_tuple_arg(self, arg)
468# Iloc Step 4:469# Downcast
--> 470 if self._can_downcast_to_series(df, arg):
471returnself._downcast_to_series(df, arg)
472
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/cudf/core/indexing.py in _can_downcast_to_series(self, df, arg)
250 dtypes = df.dtypes.values.tolist()
251 all_numeric =all(
--> 252 [pd.api.types.is_numeric_dtype(t) for t in dtypes]
253 )
254if all_numeric:
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/cudf/core/indexing.py in <listcomp>(.0)
250 dtypes = df.dtypes.values.tolist()
251 all_numeric =all(
--> 252 [pd.api.types.is_numeric_dtype(t) for t in dtypes]
253 )
254if all_numeric:
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/pandas/core/dtypes/common.py in is_numeric_dtype(arr_or_dtype)
1270 """
1271 return _is_dtype_type(
-> 1272 arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)
1273 )
1274
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/pandas/core/dtypes/common.py in _is_dtype_type(arr_or_dtype, condition)
1649 return False
1650
-> 1651 return condition(tipo)
1652
1653
/raid/nicholasb/miniconda3/envs/rapids-gpubdb-20210331/lib/python3.7/site-packages/pandas/core/dtypes/common.py in <lambda>(tipo)
194"""195returnlambdatipo: (
--> 196 issubclass(tipo, klasses)
197andnotissubclass(tipo, (np.datetime64, np.timedelta64))
198 )
TypeError: issubclass() arg 1 must be a class
So we would need some code changes (both in dask-cudf and in upstream dask) to generalize things related to the `unique` groupby aggregation, but these are currently blocked by the following libcudf bug: #7611
cc: @mythrocks would we be able to get #7611 addressed as part of 0.20 ?
cuDF recently implemented groupby `collect_set` and exposed it as `groupby.unique`. I'd like to be able to use this with Dask, as I can on the CPU. We may want to add support in upstream Dask, or special-case it in Dask-cuDF. It looks like we fail doing a type check on a `ListDtype`. Complement to #7812.

The text was updated successfully, but these errors were encountered: