diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd new file mode 100644 index 00000000000..cd2d44d2e42 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/explode.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type + +cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil: + cdef unique_ptr[table] explode_outer( + const table_view, + size_type explode_column_idx, + ) except + diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index aba13580912..0f0ee35556a 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -1,17 +1,25 @@ # Copyright (c) 2021, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr, shared_ptr, make_shared from libcpp.utility cimport move from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements ) +from cudf._lib.cpp.lists.explode cimport ( + explode_outer as cpp_explode_outer +) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.column.column cimport column -from cudf._lib.column cimport Column +from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type +from cudf._lib.column cimport Column +from cudf._lib.table cimport Table from cudf.core.dtypes import ListDtype @@ -32,3 +40,21 @@ def count_elements(Column col): result = Column.from_unique_ptr(move(c_result)) return result + + +def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): + cdef table_view c_table_view = ( + tbl.data_view() if ignore_index else tbl.view() + ) + cdef size_type c_explode_column_idx = 
explode_column_idx + + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) + + return Table.from_unique_ptr( + move(c_result), + column_names=tbl._column_names, + index_names=None if ignore_index else tbl._index_names + ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9672ab3002f..4fb5762e098 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7705,6 +7705,52 @@ def equals(self, other): return False return super().equals(other) + def explode(self, column, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index + values. + + Parameters + ---------- + column : str or tuple + Column to explode. + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf + >>> cudf.DataFrame( + {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) + a b + 0 [1, 2, 3] 11 + 1 [] 22 + 2 None 33 + 3 [4, 5] 44 + >>> df.explode('a') + a b + 0 1 11 + 0 2 11 + 0 3 11 + 1 22 + 2 33 + 3 4 44 + 3 5 44 + """ + if column not in self._column_names: + raise KeyError(column) + + if not is_list_dtype(self._data[column].dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) + + return super()._explode(column, ignore_index) + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fab5936f94d..bfcc2d125db 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -573,6 +573,28 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + def _explode(self, explode_column: Any, ignore_index: bool): + """Helper function for `explode` in `Series` and `DataFrame`, explodes + a specified 
nested column. Other columns' corresponding rows are + duplicated. If ignore_index is set, the original index is not exploded + and will be replaced with a `RangeIndex`. + """ + explode_column_num = self._column_names.index(explode_column) + if not ignore_index and self._index is not None: + explode_column_num += self._index.nlevels + + res_tbl = libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) + res = self.__class__._from_table(res_tbl) + + res._data.multiindex = self._data.multiindex + res._data._level_names = self._data._level_names + + if not ignore_index and self._index is not None: + res.index.names = self._index.names + return res + def _get_columns_by_label(self, labels, downcast): """ Returns columns of the Frame specified by `labels` diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b06fef178f6..7ed2157277c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6362,6 +6362,47 @@ def keys(self): """ return self.index + def explode(self, ignore_index=False): + """ + Transform each element of a list-like to a row, replicating index + values. + + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. 
+ + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]]) + >>> s + 0 [1, 2, 3] + 1 [] + 2 None + 3 [4, 5] + dtype: list + >>> s.explode() + 0 1 + 0 2 + 0 3 + 1 + 2 + 3 4 + 3 5 + dtype: int64 + """ + if not is_list_dtype(self._column.dtype): + data = self._data.copy(deep=True) + idx = None if ignore_index else self._index.copy(deep=True) + return self.__class__._from_data(data, index=idx) + + return super()._explode(self._column_names[0], ignore_index) + _accessors = set() # type: Set[Any] diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5f4d571e8c5..b3ba439cb15 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8442,3 +8442,56 @@ def test_rename_for_level_is_None_MC(): got = gdf.rename(columns={"a": "f"}, level=None) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + [[1, 2, 3], 11, "a"], + [None, 22, "e"], + [[4], 33, "i"], + [[], 44, "o"], + [[5, 6], 55, "u"], + ], # nested + [ + [1, 11, "a"], + [2, 22, "e"], + [3, 33, "i"], + [4, 44, "o"], + [5, 55, "u"], + ], # non-nested + ], +) +@pytest.mark.parametrize( + ("labels", "label_to_explode"), + [ + (None, 0), + (pd.Index(["a", "b", "c"]), "a"), + ( + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] + ), + (0, "a"), + ), + ], +) +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(data, labels, ignore_index, p_index, label_to_explode): + pdf = pd.DataFrame(data, index=p_index, columns=labels) + gdf = cudf.from_pandas(pdf) + + expect = pdf.explode(label_to_explode, ignore_index) + got = gdf.explode(label_to_explode, ignore_index) + + assert_eq(expect, got, check_dtype=False) diff --git 
a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a1b4236719d..beda14934ca 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1118,3 +1118,32 @@ def test_series_drop_raises(): actual = gs.drop("p", errors="ignore") assert_eq(actual, expect) + + +@pytest.mark.parametrize( + "data", [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], +) +@pytest.mark.parametrize("ignore_index", [True, False]) +@pytest.mark.parametrize( + "p_index", + [ + None, + ["ia", "ib", "ic", "id", "ie"], + pd.MultiIndex.from_tuples( + [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] + ), + ], +) +def test_explode(data, ignore_index, p_index): + pdf = pd.Series(data, index=p_index, name="someseries") + gdf = cudf.from_pandas(pdf) + + expect = pdf.explode(ignore_index) + got = gdf.explode(ignore_index) + + if data == [1, 2, 3, 4, 5] and ignore_index and p_index is not None: + # https://github.com/pandas-dev/pandas/issues/40487 + with pytest.raises(AssertionError, match="different"): + assert_eq(expect, got, check_dtype=False) + else: + assert_eq(expect, got, check_dtype=False)