You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Today, in cuDF Python, I can manipulate struct columns, but I can't do so with Dask. One hurdle appears to be that we cannot do column selection on a struct-typed column.
I believe this is best framed as a feature request, as this is a new type.
import cudf
import dask_cudf
df=cudf.DataFrame(
{"col": [
{"a":5, "b":10},
{"a":3, "b":7},
{"a":-3, "b":11}
]}
)
ddf=dask_cudf.from_cudf(df, 2)
ddf.col
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_22770/1289136161.pyin<module>10 )
11ddf=dask_cudf.from_cudf(df, 2)
--->12ddf.col/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask/dataframe/core.pyin__getattr__(self, key)
3991def__getattr__(self, key):
3992ifkeyinself.columns:
->3993returnself[key]
3994else:
3995raiseAttributeError("'DataFrame' object has no attribute %r"%key)
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask/dataframe/core.pyin__getitem__(self, key)
3912dsk=partitionwise_graph(operator.getitem, name, self, key)
3913graph=HighLevelGraph.from_collections(name, dsk, dependencies=[self])
->3914returnnew_dd_object(graph, name, meta, self.divisions)
3915elifisinstance(key, slice):
3916frompandas.api.typesimportis_float_dtype/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask/dataframe/core.pyinnew_dd_object(dsk, name, meta, divisions, parent_meta)
6802""" 6803 if has_parallel_type(meta):-> 6804 return get_parallel_type(meta)(dsk, name, meta, divisions) 6805 elif is_arraylike(meta) and meta.shape: 6806 import dask.array as da/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask_cudf/core.py in __init__(self, dsk, name, meta, divisions) 63 self.dask = dsk 64 self._name = name---> 65 meta = dask_make_meta(meta) 66 if not isinstance(meta, self._partition_type): 67 raise TypeError(/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask/dataframe/dispatch.py in make_meta(x, index, parent_meta) 125 126 try:--> 127 return make_meta_dispatch(x, index=index) 128 except TypeError: 129 if parent_meta is not None:/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask/utils.py in __call__(self, arg, *args, **kwargs) 573 """574meth=self.dispatch(type(arg))
-->575returnmeth(arg, *args, **kwargs)
576577 @property/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/dask_cudf/backends.pyinmake_meta_cudf(x, index)
130 @make_meta_dispatch.register((cudf.Series, cudf.DataFrame))
131defmake_meta_cudf(x, index=None):
-->132returnx.head(0)
133134/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/series.pyinhead(self, n)
1169dtype: object1170 """
-> 1171 return self.iloc[:n]
1172
1173 def tail(self, n=5):
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/indexing.py in __getitem__(self, arg)
91 if isinstance(arg, tuple):
92 arg = list(arg)
---> 93 data = self._sr._column[arg]
94
95 if (
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/column/struct.py in __getitem__(self, args)
82
83 def __getitem__(self, args):
---> 84 result = super().__getitem__(args)
85 if isinstance(result, dict):
86 return {
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/column/column.py in __getitem__(self, arg)
511 elif isinstance(arg, slice):
512 start, stop, stride = arg.indices(len(self))
--> 513 return self.slice(start, stop, stride)
514 else:
515 arg = as_column(arg)
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/column/column.py in slice(self, start, stop, stride)
495 stop = stop + len(self)
496 if (stride > 0 and start >= stop) or (stride < 0 and start <= stop):
--> 497 return column_empty(0, self.dtype, masked=True)
498 # compute mask slice
499 if stride == 1:
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/column/column.py in column_empty(row_count, dtype, masked)
1365 mask = None
1366
-> 1367 return build_column(
1368 data, dtype, mask=mask, size=row_count, children=children
1369 )
/raid/nicholasb/miniconda3/envs/rapids-21.08/lib/python3.8/site-packages/cudf/core/column/column.py in build_column(data, dtype, size, mask, offset, null_count, children)
1474 if size is None:
1475 raise TypeError("Must specify size")
->1476returncudf.core.column.StructColumn(
1477data=data,
1478dtype=dtype,
cudf/_lib/column.pyxincudf._lib.column.Column.__init__()
cudf/_lib/column.pyxincudf._lib.column.Column.set_base_mask()
ValueError: The Buffer for mask is smaller than expected, got 0 bytes, expected 64 bytes.
Note: The mask is expected to be sized according to the base allocation as opposed to the offsetted or sized allocation.
Today, in cuDF Python, I can manipulate struct columns, but I can't do so with Dask. One hurdle appears to be that we cannot do column selection on a struct-typed column.
I believe this is best framed as a feature request, as this is a new type.
This does not happen for list-typed columns.
The text was updated successfully, but these errors were encountered: