Skip to content

Commit

Permalink
Fix cudf.Series constructor to handle list of sequences (#8735)
Browse files Browse the repository at this point in the history
Fixes: #7840
Dependent on: dask/dask#7892

This PR introduces the ability to construct a `list` Series by passing a sequence of array-like objects to `cudf.Series`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Ashwin Srinath (https://github.com/shwina)

URL: #8735
  • Loading branch information
galipremsagar authored Jul 20, 2021
1 parent 7ee347c commit 456f088
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 2 deletions.
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -2027,7 +2027,6 @@ def as_column(
mask = bools_to_mask(as_column(mask).unary_operator("not"))

data = data.set_mask(mask)

else:
try:
data = as_column(
Expand Down Expand Up @@ -2099,6 +2098,17 @@ def as_column(
elif is_interval_dtype(dtype):
sr = pd.Series(arbitrary, dtype="interval")
data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
elif (
isinstance(arbitrary, Sequence)
and len(arbitrary) > 0
and any(
cudf.utils.dtypes.is_column_like(arb)
for arb in arbitrary
)
):
return cudf.core.column.ListColumn.from_sequences(
arbitrary
)
else:
data = as_column(
_construct_array(arbitrary, dtype),
Expand Down
37 changes: 37 additions & 0 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import pickle
from typing import Sequence

import numpy as np
import pyarrow as pa
Expand Down Expand Up @@ -278,6 +279,42 @@ def leaves(self):
else:
return self.elements

@classmethod
def from_sequences(
    cls, arbitrary: Sequence[ColumnLike]
) -> "cudf.core.column.ListColumn":
    """
    Build a list column out of a sequence of column-like objects.

    Every element of ``arbitrary`` becomes one row of the result. An
    element for which ``cudf._lib.scalar._is_null_host_scalar`` is true
    produces a null row (it contributes no values and is marked invalid
    in the mask); any other element is converted with ``as_column`` and
    its values are appended to the shared child column.

    Parameters
    ----------
    arbitrary : Sequence[ColumnLike]
        The per-row inputs, each either a null scalar or something
        ``as_column`` accepts and ``len`` can be taken of.

    Returns
    -------
    ListColumn
        A column with ``len(arbitrary)`` rows whose children are the
        int32 offsets column and the concatenated values column.
    """
    values = column.column_empty(0)
    validity = []
    offsets = [0]
    running_total = 0

    # Walk the rows once, accumulating values, validity bits and the
    # cumulative end offset of every row.
    for item in arbitrary:
        is_valid = not cudf._lib.scalar._is_null_host_scalar(item)
        validity.append(is_valid)
        if is_valid:
            values = values.append(as_column(item))
            running_total += len(item)
        # A null row repeats the previous offset, i.e. has zero length.
        offsets.append(running_total)

    offsets_col = column.as_column(offsets, dtype="int32")

    # NOTE(review): null_count is passed as 0 even when the mask marks
    # null rows -- presumably the constructor recomputes it from the
    # mask; confirm.
    return cls(
        size=len(arbitrary),
        dtype=cudf.ListDtype(values.dtype),
        mask=cudf._lib.transform.bools_to_mask(as_column(validity)),
        offset=0,
        null_count=0,
        children=(offsets_col, values),
    )


class ListMethods(ColumnMethods):
"""
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def __init__(
if isinstance(data, dict):
index = data.keys()
data = column.as_column(
data.values(), nan_as_null=nan_as_null, dtype=dtype
list(data.values()), nan_as_null=nan_as_null, dtype=dtype
)

if data is None:
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from string import ascii_letters, digits

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1203,3 +1204,29 @@ def test_explode(data, ignore_index, p_index):
assert_eq(expect, got, check_dtype=False)
else:
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    "data, expected",
    [
        # Only non-null cudf.Series elements, of differing lengths.
        (
            [cudf.Series([1, 2, 3]), cudf.Series([10, 20])],
            cudf.Series([[1, 2, 3], [10, 20]]),
        ),
        # A None between two series; the last series contains a NaN.
        (
            [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])],
            cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]),
        ),
        # cupy arrays as elements, with cudf.NA as the null marker.
        (
            [cp.array([5, 6]), cudf.NA, cp.array([1])],
            cudf.Series([[5, 6], None, [1]]),
        ),
        # Several leading nulls before the first real element.
        (
            [None, None, None, None, None, cudf.Series([10, 20])],
            cudf.Series([None, None, None, None, None, [10, 20]]),
        ),
    ],
)
def test_nested_series_from_sequence_data(data, expected):
    """Check that cudf.Series built from a list of sequences matches the
    equivalent Series built from nested Python lists."""
    assert_eq(cudf.Series(data), expected)
14 changes: 14 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,17 @@ def test_groupby_agg_redirect(aggregations):
)
def test_is_supported(arg):
assert _is_supported(arg, {"supported"}) is False


def test_groupby_unique_lists():
    """Check that groupby(...).unique() on a dask_cudf frame agrees with
    both the dask-on-pandas result and the plain cudf result."""
    pdf = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
    ddf = dd.from_pandas(pdf, 2)
    gdf = cudf.from_pandas(pdf)
    gddf = dask_cudf.from_cudf(gdf, 2)

    # dask (pandas-backed) versus dask_cudf.
    from_dask = ddf.groupby("a").b.unique().compute()
    from_dask_cudf = gddf.groupby("a").b.unique().compute()
    dd.assert_eq(from_dask, from_dask_cudf)

    # cudf versus dask_cudf (computed afresh, as a separate graph run).
    from_cudf = gdf.groupby("a").b.unique()
    from_dask_cudf_again = gddf.groupby("a").b.unique().compute()
    dd.assert_eq(from_cudf, from_dask_cudf_again)

0 comments on commit 456f088

Please sign in to comment.