Fix cudf.Series constructor to handle list of sequences (#8735)

Fixes: #7840
Dependent on: dask/dask#7892

This PR introduces the ability to construct a `list` Series by passing a sequence of array-like objects to `cudf.Series`.

Authors:
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- Richard (Rick) Zamora (https://github.com/rjzamora)
- Charles Blackmon-Luca (https://github.com/charlesbluca)
- Ashwin Srinath (https://github.com/shwina)

URL: #8735
rapidsai · Jul 20, 2021 · 456f088 · 456f088
1 parent 7ee347c
commit 456f088
Show file tree

Hide file tree

Showing 5 changed files with 90 additions and 2 deletions.
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -2027,7 +2027,6 @@ def as_column(
         mask = bools_to_mask(as_column(mask).unary_operator("not"))
 
         data = data.set_mask(mask)
-
     else:
         try:
             data = as_column(
@@ -2099,6 +2098,17 @@ def as_column(
                 elif is_interval_dtype(dtype):
                     sr = pd.Series(arbitrary, dtype="interval")
                     data = as_column(sr, nan_as_null=nan_as_null, dtype=dtype)
+                elif (
+                    isinstance(arbitrary, Sequence)
+                    and len(arbitrary) > 0
+                    and any(
+                        cudf.utils.dtypes.is_column_like(arb)
+                        for arb in arbitrary
+                    )
+                ):
+                    return cudf.core.column.ListColumn.from_sequences(
+                        arbitrary
+                    )
                 else:
                     data = as_column(
                         _construct_array(arbitrary, dtype),

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2021, NVIDIA CORPORATION.
 
 import pickle
+from typing import Sequence
 
 import numpy as np
 import pyarrow as pa
@@ -278,6 +279,42 @@ def leaves(self):
         else:
             return self.elements
 
+    @classmethod
+    def from_sequences(
+        cls, arbitrary: Sequence[ColumnLike]
+    ) -> "cudf.core.column.ListColumn":
+        """
+        Create a list column for list of column-like sequences
+        """
+        data_col = column.column_empty(0)
+        mask_col = []
+        offset_col = [0]
+        offset = 0
+
+        # Build data, mask & offsets; a null entry repeats the offset (empty row)
+        for data in arbitrary:
+            if cudf._lib.scalar._is_null_host_scalar(data):
+                mask_col.append(False)
+                offset_col.append(offset)
+            else:
+                mask_col.append(True)
+                data_col = data_col.append(as_column(data))
+                offset += len(data)
+                offset_col.append(offset)
+
+        offset_col = column.as_column(offset_col, dtype="int32")
+
+        # Build ListColumn; children are ordered (offsets, flattened data)
+        res = cls(
+            size=len(arbitrary),
+            dtype=cudf.ListDtype(data_col.dtype),
+            mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
+            offset=0,
+            null_count=0,  # NOTE(review): mask_col may hold False rows - confirm 0 is intended
+            children=(offset_col, data_col),
+        )
+        return res
+
 
 class ListMethods(ColumnMethods):
     """

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -241,7 +241,7 @@ def __init__(
         if isinstance(data, dict):
             index = data.keys()
             data = column.as_column(
-                data.values(), nan_as_null=nan_as_null, dtype=dtype
+                list(data.values()), nan_as_null=nan_as_null, dtype=dtype
             )
 
         if data is None:

diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -4,6 +4,7 @@
 import re
 from string import ascii_letters, digits
 
+import cupy as cp
 import numpy as np
 import pandas as pd
 import pytest
@@ -1203,3 +1204,29 @@ def test_explode(data, ignore_index, p_index):
             assert_eq(expect, got, check_dtype=False)
     else:
         assert_eq(expect, got, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        (
+            [cudf.Series([1, 2, 3]), cudf.Series([10, 20])],
+            cudf.Series([[1, 2, 3], [10, 20]]),
+        ),
+        (
+            [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])],
+            cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]),
+        ),
+        (
+            [cp.array([5, 6]), cudf.NA, cp.array([1])],
+            cudf.Series([[5, 6], None, [1]]),
+        ),
+        (
+            [None, None, None, None, None, cudf.Series([10, 20])],
+            cudf.Series([None, None, None, None, None, [10, 20]]),
+        ),
+    ],
+)
+def test_nested_series_from_sequence_data(data, expected):
+    actual = cudf.Series(data)
+    assert_eq(actual, expected)
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -580,3 +580,17 @@ def test_groupby_agg_redirect(aggregations):
 )
 def test_is_supported(arg):
     assert _is_supported(arg, {"supported"}) is False
+
+
+def test_groupby_unique_lists():
+    df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
+    ddf = dd.from_pandas(df, 2)
+    gdf = cudf.from_pandas(df)
+    gddf = dask_cudf.from_cudf(gdf, 2)
+    dd.assert_eq(
+        ddf.groupby("a").b.unique().compute(),
+        gddf.groupby("a").b.unique().compute(),
+    )
+    dd.assert_eq(
+        gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(),
+    )