Skip to content

Commit

Permalink
Adds explode API (#7607)
Browse files Browse the repository at this point in the history
Closes #2975 

This PR introduces the `explode` API, which flattens list columns by turning each list element into its own row. Example:

```python
>>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]])
>>> s
0    [1, 2, 3]
1           []
2         None
3       [4, 5]
dtype: list
>>> s.explode()
0       1
0       2
0       3
1    <NA>
2    <NA>
3       4
3       5
dtype: int64
```

Supersedes #7538

Authors:
  - Michael Wang (@isVoid)

Approvers:
  - Keith Kraus (@kkraus14)
  - GALI PREM SAGAR (@galipremsagar)

URL: #7607
  • Loading branch information
isVoid authored Mar 18, 2021
1 parent f5a4214 commit ec5364c
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 1 deletion.
13 changes: 13 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/explode.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type

# Declarations for the list-explode entry point exposed by the C++ header
# "cudf/lists/explode.hpp" (namespace ``cudf``). The block is ``nogil`` so the
# declared function may be called while the GIL is released.
cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
    # Takes a table_view plus the position of the column to explode and
    # returns a unique_ptr owning the resulting table. ``except +`` lets
    # Cython convert a raised C++ exception into a Python exception.
    cdef unique_ptr[table] explode_outer(
        const table_view,
        size_type explode_column_idx,
    ) except +
28 changes: 27 additions & 1 deletion python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr, shared_ptr, make_shared
from libcpp.utility cimport move

from cudf._lib.cpp.lists.count_elements cimport (
count_elements as cpp_count_elements
)
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.column cimport Column
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table

from cudf.core.dtypes import ListDtype

Expand All @@ -32,3 +40,21 @@ def count_elements(Column col):

result = Column.from_unique_ptr(move(c_result))
return result


def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
    """Explode one column of ``tbl`` through ``cpp_explode_outer``.

    Parameters
    ----------
    tbl : Table
        Table holding the column to explode.
    explode_column_idx : int
        Position of the column to explode within the view handed to
        ``cpp_explode_outer`` (``tbl.data_view()`` when ``ignore_index`` is
        true, ``tbl.view()`` otherwise).
    ignore_index : bool, default False
        When true, ``tbl.data_view()`` is used and the result carries no
        index names; otherwise ``tbl.view()`` is used and ``tbl``'s index
        names are reused.

    Returns
    -------
    Table
        Built from the C++ result, using ``tbl``'s column names.
    """
    cdef table_view c_source_view
    cdef size_type c_column_idx = explode_column_idx
    cdef unique_ptr[table] c_exploded

    # NOTE(review): presumably data_view() leaves the index columns out while
    # view() includes them -- confirm against Table.
    if ignore_index:
        c_source_view = tbl.data_view()
    else:
        c_source_view = tbl.view()

    # The C++ call does not need the GIL.
    with nogil:
        c_exploded = move(cpp_explode_outer(c_source_view, c_column_idx))

    if ignore_index:
        result_index_names = None
    else:
        result_index_names = tbl._index_names

    return Table.from_unique_ptr(
        move(c_exploded),
        column_names=tbl._column_names,
        index_names=result_index_names
    )
46 changes: 46 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7705,6 +7705,52 @@ def equals(self, other):
return False
return super().equals(other)

def explode(self, column, ignore_index=False):
    """
    Transform each element of a list-like to a row, replicating index
    values.

    Parameters
    ----------
    column : str or tuple
        Column to explode.
    ignore_index : bool, default False
        If True, the resulting index will be labeled 0, 1, …, n - 1.

    Returns
    -------
    DataFrame

    Raises
    ------
    KeyError
        If ``column`` is not one of this frame's column names.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame(
    ...     {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]})
    >>> df
               a   b
    0  [1, 2, 3]  11
    1         []  22
    2       None  33
    3     [4, 5]  44
    >>> df.explode('a')
          a   b
    0     1  11
    0     2  11
    0     3  11
    1  <NA>  22
    2  <NA>  33
    3     4  44
    3     5  44
    """
    if column not in self._column_names:
        raise KeyError(column)

    # A column that is not of list dtype has nothing to flatten: hand back
    # a deep copy of the data, passing no index when ``ignore_index`` is set.
    if not is_list_dtype(self._data[column].dtype):
        data = self._data.copy(deep=True)
        idx = None if ignore_index else self._index.copy(deep=True)
        return self.__class__._from_data(data, index=idx)

    return super()._explode(column, ignore_index)

_accessors = set() # type: Set[Any]


Expand Down
22 changes: 22 additions & 0 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,28 @@ def equals(self, other, **kwargs):
else:
return self._index.equals(other._index)

def _explode(self, explode_column: Any, ignore_index: bool):
    """Shared implementation behind ``explode`` on `Series` and `DataFrame`.

    Explodes the nested column labelled ``explode_column``; the matching
    rows of the other columns are duplicated. If ``ignore_index`` is set,
    the original index is not exploded and will be replaced with a
    `RangeIndex`.
    """
    column_position = self._column_names.index(explode_column)
    keep_index = not ignore_index and self._index is not None

    # When the index is kept, the column position is shifted by the number
    # of index levels before it is passed to libcudf.
    if keep_index:
        column_position += self._index.nlevels

    exploded = self.__class__._from_table(
        libcudf.lists.explode_outer(self, column_position, ignore_index)
    )

    # Carry the column-label metadata over from the source frame.
    exploded._data.multiindex = self._data.multiindex
    exploded._data._level_names = self._data._level_names

    if keep_index:
        exploded.index.names = self._index.names
    return exploded

def _get_columns_by_label(self, labels, downcast):
"""
Returns columns of the Frame specified by `labels`
Expand Down
41 changes: 41 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -6362,6 +6362,47 @@ def keys(self):
"""
return self.index

def explode(self, ignore_index=False):
    """
    Transform each element of a list-like to a row, replicating index
    values.

    Parameters
    ----------
    ignore_index : bool, default False
        If True, the resulting index will be labeled 0, 1, …, n - 1.

    Returns
    -------
    Series

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]])
    >>> s
    0    [1, 2, 3]
    1           []
    2         None
    3       [4, 5]
    dtype: list
    >>> s.explode()
    0       1
    0       2
    0       3
    1    <NA>
    2    <NA>
    3       4
    3       5
    dtype: int64
    """
    # A column that is not of list dtype has nothing to flatten: hand back
    # a deep copy of the data, passing no index when ``ignore_index`` is set.
    if not is_list_dtype(self._column.dtype):
        data = self._data.copy(deep=True)
        idx = None if ignore_index else self._index.copy(deep=True)
        return self.__class__._from_data(data, index=idx)

    return super()._explode(self._column_names[0], ignore_index)

_accessors = set() # type: Set[Any]


Expand Down
53 changes: 53 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -8442,3 +8442,56 @@ def test_rename_for_level_is_None_MC():
got = gdf.rename(columns={"a": "f"}, level=None)

assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        # First column holds lists, including a null and an empty list.
        [
            [[1, 2, 3], 11, "a"],
            [None, 22, "e"],
            [[4], 33, "i"],
            [[], 44, "o"],
            [[5, 6], 55, "u"],
        ],
        # Scalars only: no list column to explode.
        [
            [1, 11, "a"],
            [2, 22, "e"],
            [3, 33, "i"],
            [4, 44, "o"],
            [5, 55, "u"],
        ],
    ],
)
@pytest.mark.parametrize(
    "labels, label_to_explode",
    [
        # Default column labels.
        (None, 0),
        # Flat string labels.
        (pd.Index(["a", "b", "c"]), "a"),
        # Multi-level column labels, addressed with a tuple.
        (
            pd.MultiIndex.from_tuples(
                [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"]
            ),
            (0, "a"),
        ),
    ],
)
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize(
    "p_index",
    [
        None,
        ["ia", "ib", "ic", "id", "ie"],
        pd.MultiIndex.from_tuples(
            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
        ),
    ],
)
def test_explode(data, labels, ignore_index, p_index, label_to_explode):
    """Check that ``DataFrame.explode`` in cudf matches pandas.

    Covers list and non-list data, three kinds of column labels, three
    kinds of row index, and both ``ignore_index`` settings.
    """
    pandas_frame = pd.DataFrame(data, index=p_index, columns=labels)
    cudf_frame = cudf.from_pandas(pandas_frame)

    expected = pandas_frame.explode(
        label_to_explode, ignore_index=ignore_index
    )
    actual = cudf_frame.explode(label_to_explode, ignore_index=ignore_index)

    assert_eq(expected, actual, check_dtype=False)
29 changes: 29 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,3 +1118,32 @@ def test_series_drop_raises():
actual = gs.drop("p", errors="ignore")

assert_eq(actual, expect)


@pytest.mark.parametrize(
    "data",
    [
        [[1, 2, 3], None, [4], [], [5, 6]],
        [1, 2, 3, 4, 5],
    ],
)
@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize(
    "p_index",
    [
        None,
        ["ia", "ib", "ic", "id", "ie"],
        pd.MultiIndex.from_tuples(
            [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")]
        ),
    ],
)
def test_explode(data, ignore_index, p_index):
    """Check that ``Series.explode`` in cudf matches pandas.

    One combination (non-list data, ``ignore_index`` set, explicit index) is
    expected to compare unequal; see the pandas issue linked below.
    """
    pandas_series = pd.Series(data, index=p_index, name="someseries")
    cudf_series = cudf.from_pandas(pandas_series)

    expected = pandas_series.explode(ignore_index)
    actual = cudf_series.explode(ignore_index)

    # https://github.com/pandas-dev/pandas/issues/40487
    known_pandas_mismatch = (
        data == [1, 2, 3, 4, 5] and ignore_index and p_index is not None
    )
    if not known_pandas_mismatch:
        assert_eq(expected, actual, check_dtype=False)
    else:
        with pytest.raises(AssertionError, match="different"):
            assert_eq(expected, actual, check_dtype=False)

0 comments on commit ec5364c

Please sign in to comment.