diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 57a4f724c2d..3397cb0ca1d 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ namespace cudf { */ class lists_column_view : private column_view { public: + lists_column_view() = default; /** * @brief Construct a new lists column view object from a column view. * diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 5d406f5c85f..0ad09dba717 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,10 +9,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( - contains, - index_of as cpp_index_of, -) from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( count_elements as cpp_count_elements, ) @@ -26,7 +22,6 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, @@ -34,11 +29,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( order, size_type, ) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf +from cudf._lib.pylibcudf cimport Scalar + @acquire_spill_lock() def count_elements(Column col): @@ -153,64 +149,36 @@ def 
extract_element_column(Column col, Column index): @acquire_spill_lock() -def contains_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) +def contains_scalar(Column col, py_search_key): + return Column.from_pylibcudf( + pylibcudf.lists.contains( + col.to_pylibcudf(mode="read"), + py_search_key.device_value.c_value, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(contains( - list_view.get()[0], - search_key_value[0], - )) - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.index_of( + col.to_pylibcudf(mode="read"), + py_search_key.device_value.c_value, + True, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_index_of( - list_view.get()[0], - search_key_value[0], - )) - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def index_of_column(Column col, Column search_keys): - - cdef column_view keys_view = search_keys.view() - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.index_of( + col.to_pylibcudf(mode="read"), + search_keys.to_pylibcudf(mode="read"), + True, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_index_of( - list_view.get()[0], - keys_view, - )) - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def concatenate_rows(list source_columns): diff --git 
a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index e121e856865..d13791d95cf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -8,6 +8,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview @@ -56,3 +59,4 @@ cdef class ListColumnView: cdef Column _column cpdef child(self) cpdef offsets(self) + cdef lists_column_view view(self) nogil diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e726eca154f..e0cf8b7ee32 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -348,6 +348,15 @@ cdef class ListColumnView: """The offsets column of the underlying list column.""" return self._column.child(1) + cdef lists_column_view view(self) nogil: + """Generate a libcudf lists_column_view to pass to libcudf algorithms. + + This method is for pylibcudf's functions to use to generate inputs when + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + return lists_column_view(self._column.view()) + @functools.cache def _datatype_from_dtype_desc(desc): diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 721679f35c7..82aed7d70a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from cudf._lib.exception_handler cimport cudf_exception_handler @@ -12,17 +13,33 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: + + cpdef enum class duplicate_find_option(int32_t): + FIND_FIRST + FIND_LAST + cdef unique_ptr[column] contains( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains( + const lists_column_view& lists, + const column_view& search_keys, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains_nulls( + const lists_column_view& lists, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, + duplicate_find_option find_option, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( - lists_column_view lists, - column_view search_keys, + const lists_column_view& lists, + const column_view& search_keys, + duplicate_find_option find_option, ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index dbafc415e45..fd21e7b334b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): + lists_column_view() except + lists_column_view(const column_view& lists_column) except + column_view parent() except + column_view offsets() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd 
b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2d2a5b2a9ea..2ccf0139e90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -5,11 +5,21 @@ from libcpp cimport bool from cudf._lib.pylibcudf.libcudf.types cimport size_type from .column cimport Column +from .scalar cimport Scalar from .table cimport Table +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) cpdef Column concatenate_list_elements(Column, bool dropna) + +cpdef Column contains(Column, ColumnOrScalar) + +cpdef Column contains_nulls(Column) + +cpdef Column index_of(Column, ColumnOrScalar, bool) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 069c9da31c2..a94d940accd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -1,11 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode +from cudf._lib.pylibcudf.libcudf.lists cimport ( + contains as cpp_contains, + explode as cpp_explode, +) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, @@ -13,8 +17,10 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( ) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.lists cimport ColumnOrScalar -from .column cimport Column +from .column cimport Column, ListColumnView +from .scalar cimport Scalar from .table cimport Table @@ -71,15 +77,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column + dropna : bool + If true, null list elements will be ignored + from concatenation. Otherwise any input null values will result in + the corresponding output row being set to null. Returns ------- Column A new Column of concatenated list elements - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. """ cdef concatenate_null_policy null_policy = ( concatenate_null_policy.IGNORE if dropna @@ -94,3 +100,109 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): )) return Column.from_libcudf(move(c_result)) + + +cpdef Column contains(Column input, ColumnOrScalar search_key): + """Create a column of bool values indicating whether + the search_key is contained in the input. + + ``search_key`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. 
+ + For details, see :cpp:func:`contains`. + + Parameters + ---------- + input : Column + The input column. + search_key : Union[Column, Scalar] + The search key. + + Returns + ------- + Column + A new Column of bools indicating if the search_key was + found in the list column. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + if not isinstance(search_key, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = move(cpp_contains.contains( + list_view.view(), + search_key.view() if ColumnOrScalar is Column else dereference( + search_key.get() + ), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column contains_nulls(Column input): + """Create a column of bool values indicating whether + each row in the lists column contains a null value. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column of bools indicating if the list column + contains a null value. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + with nogil: + c_result = move(cpp_contains.contains_nulls(list_view.view())) + return Column.from_libcudf(move(c_result)) + + +cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option): + """Create a column of index values indicating the position of a search + key row within the corresponding list row in the lists column. + + ``search_key`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + + For details, see :cpp:func:`index_of`. + + Parameters + ---------- + input : Column + The input column. + search_key : Union[Column, Scalar] + The search key. + find_first_option : bool + If true, index_of returns the first match. + Otherwise the last match is returned. 
+ + Returns + ------- + Column + A new Column of index values that indicate where in the + list column the search_key was found. An index value + of -1 indicates that the search_key was not found. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + cdef cpp_contains.duplicate_find_option find_option = ( + cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option + else cpp_contains.duplicate_find_option.FIND_LAST + ) + + with nogil: + c_result = move(cpp_contains.index_of( + list_view.view(), + search_key.view() if ColumnOrScalar is Column else dereference( + search_key.get() + ), + find_option, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index b21af8ea11c..c781126e388 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -7,15 +7,28 @@ from cudf._lib import pylibcudf as plc -def test_concatenate_rows(): - test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]] +@pytest.fixture +def test_data(): + return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]] - arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"]) + +@pytest.fixture +def scalar(): + return pa.scalar(1) + + +@pytest.fixture +def column(): + return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) + + +def test_concatenate_rows(test_data): + arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) res = plc.lists.concatenate_rows(plc_tbl) - expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) + expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data[0])]) assert_column_eq(expect, res) @@ -44,3 +57,80 @@ def test_concatenate_list_elements(test_data, dropna, expected): expect = pa.array(expected) assert_column_eq(expect, res) + + +def 
test_contains_scalar(test_data, scalar): + list_column = test_data[0][0] + arr = pa.array(list_column) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.contains(plc_column, plc_scalar) + + expect = pa.array([True, False, False, False]) + + assert_column_eq(expect, res) + + +def test_contains_list_column(test_data): + list_column1 = test_data[0][0] + list_column2 = [1, 3, 5, 1] + arr1 = pa.array(list_column1) + arr2 = pa.array(list_column2) + + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = plc.lists.contains(plc_column1, plc_column2) + + expect = pa.array([True, False, True, False]) + + assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "list_column, expected", + [ + ( + [[1, None], [1, 3, 4], [5, None]], + [True, False, True], + ), + ( + [[1, None], None, [5]], + [True, None, False], + ), + ], +) +def test_contains_nulls(list_column, expected): + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.contains_nulls(plc_column) + + expect = pa.array(expected) + + assert_column_eq(expect, res) + + +def test_index_of_scalar(test_data, scalar): + list_column = test_data[0][0] + arr = pa.array(list_column) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.index_of(plc_column, plc_scalar, True) + + expect = pa.array([1, -1, -1, -1], type=pa.int32()) + + assert_column_eq(expect, res) + + +def test_index_of_list_column(test_data, column): + list_column = test_data[0][0] + arr1 = pa.array(list_column) + arr2, expect = column + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = plc.lists.index_of(plc_column1, plc_column2, True) + + expect = pa.array(column[1], type=pa.int32()) + + assert_column_eq(expect, res)