diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index c986418c790..44067ff87c0 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,11 +33,11 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr lower_bound( - table_view const& t, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -46,33 +46,29 @@ std::unique_ptr lower_bound( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr upper_bound( - table_view const& t, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::contains(column_view const&, scalar const&, - * rmm::mr::device_memory_resource*) + * @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -bool contains(column_view const& col, - scalar const& value, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream); /** - * @copydoc cudf::contains(column_view const&, column_view const&, - * rmm::mr::device_memory_resource*) + * @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr contains( column_view const& haystack, column_view const& needles, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index 56a31891e27..3b68923ee93 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,134 +32,123 @@ namespace cudf { */ /** - * @brief Find smallest indices in a sorted table where values should be - * inserted to maintain order + * @brief Find smallest indices in a sorted table where values should be inserted to maintain order. * - * For each row v in @p values, find the first index in @p t where - * inserting the row will maintain the sort order of @p t + * For each row in `needles`, find the first index in `haystack` where inserting the row still + * maintains its sort order. 
* * @code{.pseudo} * Example: * * Single column: - * idx 0 1 2 3 4 - * column = { 10, 20, 20, 30, 50 } - * values = { 20 } - * result = { 1 } + * idx 0 1 2 3 4 + * haystack = { 10, 20, 20, 30, 50 } + * needles = { 20 } + * result = { 1 } * * Multi Column: - * idx 0 1 2 3 4 - * t = {{ 10, 20, 20, 20, 20 }, - * { 5.0, .5, .5, .7, .7 }, - * { 90, 77, 78, 61, 61 }} - * values = {{ 20 }, - * { .7 }, - * { 61 }} - * result = { 3 } + * idx 0 1 2 3 4 + * haystack = {{ 10, 20, 20, 20, 20 }, + * { 5.0, .5, .5, .7, .7 }, + * { 90, 77, 78, 61, 61 }} + * needles = {{ 20 }, + * { .7 }, + * { 61 }} + * result = { 3 } * @endcode * - * @param t Table to search - * @param values Find insert locations for these values - * @param column_order Vector of column sort order - * @param null_precedence Vector of null_precedence enums values - * @param mr Device memory resource used to allocate the returned column's device - * memory + * @param haystack The table containing search space. + * @param needles Values for which to find the insert locations in the search space. + * @param column_order Vector of column sort order. + * @param null_precedence Vector of null_precedence enum values. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return A non-nullable column of cudf::size_type elements containing the insertion points. */ std::unique_ptr lower_bound( - table_view const& t, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Find largest indices in a sorted table where values should be - * inserted to maintain order + * @brief Find largest indices in a sorted table where values should be inserted to maintain order. 
* - * For each row v in @p values, find the last index in @p t where - * inserting the row will maintain the sort order of @p t + * For each row in `needles`, find the last index in `haystack` where inserting the row still + * maintains its sort order. * * @code{.pseudo} * Example: * * Single Column: - * idx 0 1 2 3 4 - * column = { 10, 20, 20, 30, 50 } - * values = { 20 } - * result = { 3 } + * idx 0 1 2 3 4 + * haystack = { 10, 20, 20, 30, 50 } + * needles = { 20 } + * result = { 3 } * * Multi Column: - * idx 0 1 2 3 4 - * t = {{ 10, 20, 20, 20, 20 }, - * { 5.0, .5, .5, .7, .7 }, - * { 90, 77, 78, 61, 61 }} - * values = {{ 20 }, - * { .7 }, - * { 61 }} - * result = { 5 } + * idx 0 1 2 3 4 + * haystack = {{ 10, 20, 20, 20, 20 }, + * { 5.0, .5, .5, .7, .7 }, + * { 90, 77, 78, 61, 61 }} + * needles = {{ 20 }, + * { .7 }, + * { 61 }} + * result = { 5 } * @endcode * - * @param search_table Table to search - * @param values Find insert locations for these values - * @param column_order Vector of column sort order - * @param null_precedence Vector of null_precedence enums values - * @param mr Device memory resource used to allocate the returned column's device - * memory + * @param haystack The table containing search space. + * @param needles Values for which to find the insert locations in the search space. + * @param column_order Vector of column sort order. + * @param null_precedence Vector of null_precedence enum values. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return A non-nullable column of cudf::size_type elements containing the insertion points. 
*/ std::unique_ptr upper_bound( - table_view const& search_table, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Find if the `value` is present in the `col` + * @brief Check if the given `needle` value exists in the `haystack` column. * - * @throws cudf::logic_error - * If `col.type() != values.type()` + * @throws cudf::logic_error If `haystack.type() != needle.type()`. * * @code{.pseudo} * Single Column: - * idx 0 1 2 3 4 - * col = { 10, 20, 20, 30, 50 } - * Scalar: - * value = { 20 } - * result = true + * idx 0 1 2 3 4 + * haystack = { 10, 20, 20, 30, 50 } + * needle = { 20 } + * result = true * @endcode * - * @param col A column object - * @param value A scalar value to search for in `col` - * - * @return bool If `value` is found in `column` true, else false. + * @param haystack The column containing search space. + * @param needle A scalar value to check for existence in the search space. + * @return true if the given `needle` value exists in the `haystack` column. */ -bool contains(column_view const& col, scalar const& value); +bool contains(column_view const& haystack, scalar const& needle); /** - * @brief Returns a new column of type bool identifying for each element of @p haystack column, - * if that element is contained in @p needles column. + * @brief Check if the given `needles` values exist in the `haystack` column. * - * The new column will have the same dimension and null status as the @p haystack column. That is, - * any element that is invalid in the @p haystack column will be invalid in the returned column. + * The new column will have type BOOL and have the same size and null mask as the input `needles` + * column. That is, any null row in the `needles` column will result in a null row in the output + * column. 
* - * @throws cudf::logic_error - * If `haystack.type() != needles.type()` + * @throws cudf::logic_error If `haystack.type() != needles.type()` * * @code{.pseudo} * haystack = { 10, 20, 30, 40, 50 } * needles = { 20, 40, 60, 80 } - * - * result = { false, true, false, true, false } + * result = { true, true, false, false } * @endcode * - * @param haystack A column object - * @param needles A column of values to search for in `col` - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @return A column of bool elements containing true if the corresponding entry in haystack - * appears in needles and false if it does not. + * @param haystack The column containing search space. + * @param needles A column of values to check for existence in the search space. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A BOOL column indicating if each element in `needles` exists in the search space. */ std::unique_ptr contains( column_view const& haystack, diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index c4b3bbc00e4..a98e69149af 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -158,7 +158,7 @@ std::unique_ptr remove_keys( CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match"); // locate keys to remove by searching the keys column - auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr); + auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr); auto d_matches = matches->view().data(); // call common utility method to keep the keys not matched to keys_to_remove auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; }; @@ -181,7 +181,7 @@ std::unique_ptr remove_unused_keys( thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in 
contains() column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data()); - return cudf::detail::contains(keys_positions_view, indices_view, stream, mr); + return cudf::detail::contains(indices_view, keys_positions_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index dfc6cbb78cc..25c46837e9f 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -138,7 +138,7 @@ std::unique_ptr set_keys( std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls - auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); + auto matches = cudf::detail::contains(keys_column->view(), keys, stream, mr); auto d_matches = matches->view().data(); auto indices_itr = cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices()); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 29eddf703df..491ad49e020 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -43,40 +43,9 @@ namespace cudf { namespace { -template -void launch_search(DataIterator it_data, - ValuesIterator it_vals, - size_type data_size, - size_type values_size, - OutputIterator it_output, - Comparator comp, - bool find_first, - rmm::cuda_stream_view stream) -{ - if (find_first) { - thrust::lower_bound(rmm::exec_policy(stream), - it_data, - it_data + data_size, - it_vals, - it_vals + values_size, - it_output, - comp); - } else { - thrust::upper_bound(rmm::exec_policy(stream), - it_data, - it_data + data_size, - it_vals, - it_vals + values_size, - it_output, - comp); - } -} -std::unique_ptr search_ordered(table_view const& t, - table_view const& values, +std::unique_ptr search_ordered(table_view const& haystack, + table_view const& needles, bool find_first, std::vector const& column_order, std::vector const& null_precedence, @@ -84,30 +53,30 @@ std::unique_ptr 
search_ordered(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS( - column_order.empty() or static_cast(t.num_columns()) == column_order.size(), + column_order.empty() or static_cast(haystack.num_columns()) == column_order.size(), "Mismatch between number of columns and column order."); - CUDF_EXPECTS( - null_precedence.empty() or static_cast(t.num_columns()) == null_precedence.size(), - "Mismatch between number of columns and null precedence."); + CUDF_EXPECTS(null_precedence.empty() or + static_cast(haystack.num_columns()) == null_precedence.size(), + "Mismatch between number of columns and null precedence."); // Allocate result column auto result = make_numeric_column( - data_type{type_to_id()}, values.num_rows(), mask_state::UNALLOCATED, stream, mr); - auto const result_out = result->mutable_view().data(); + data_type{type_to_id()}, needles.num_rows(), mask_state::UNALLOCATED, stream, mr); + auto const out_it = result->mutable_view().data(); // Handle empty inputs - if (t.num_rows() == 0) { + if (haystack.num_rows() == 0) { CUDF_CUDA_TRY( - cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value())); + cudaMemsetAsync(out_it, 0, needles.num_rows() * sizeof(size_type), stream.value())); return result; } // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. - auto const matched = dictionary::detail::match_dictionaries({t, values}, stream); + auto const matched = dictionary::detail::match_dictionaries({haystack, needles}, stream); // Prepare to flatten the structs column - auto const has_null_elements = has_nested_nulls(t) or has_nested_nulls(values); + auto const has_null_elements = has_nested_nulls(haystack) or has_nested_nulls(needles); auto const flatten_nullability = has_null_elements ? 
structs::detail::column_nullability::FORCE : structs::detail::column_nullability::MATCH_INCOMING; @@ -135,37 +104,50 @@ std::unique_ptr search_ordered(table_view const& t, rhs, column_order_dv.data(), null_precedence_dv.data()); - launch_search( - count_it, count_it, t.num_rows(), values.num_rows(), result_out, comp, find_first, stream); + + auto const do_search = [find_first](auto&&... args) { + if (find_first) { + thrust::lower_bound(std::forward(args)...); + } else { + thrust::upper_bound(std::forward(args)...); + } + }; + do_search(rmm::exec_policy(stream), + count_it, + count_it + haystack.num_rows(), + count_it, + count_it + needles.num_rows(), + out_it, + comp); return result; } struct contains_scalar_dispatch { template - bool operator()(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) + bool operator()(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); + CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); using Type = device_storage_type_t; using ScalarType = cudf::scalar_type_t; - auto d_col = column_device_view::create(col, stream); - auto s = static_cast(&value); + auto d_haystack = column_device_view::create(haystack, stream); + auto s = static_cast(&needle); - if (col.has_nulls()) { + if (haystack.has_nulls()) { auto found_iter = thrust::find(rmm::exec_policy(stream), - d_col->pair_begin(), - d_col->pair_end(), + d_haystack->pair_begin(), + d_haystack->pair_end(), thrust::make_pair(s->value(stream), true)); - return found_iter != d_col->pair_end(); + return found_iter != d_haystack->pair_end(); } else { auto found_iter = thrust::find(rmm::exec_policy(stream), // - d_col->begin(), - d_col->end(), + d_haystack->begin(), + d_haystack->end(), s->value(stream)); - return found_iter != d_col->end(); + return found_iter != d_haystack->end(); } } }; @@ -179,66 +161,69 @@ bool 
contains_scalar_dispatch::operator()(column_view const&, } template <> -bool contains_scalar_dispatch::operator()(column_view const& col, - scalar const& value, +bool contains_scalar_dispatch::operator()(column_view const& haystack, + scalar const& needle, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); + CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); - auto const scalar_table = static_cast(&value)->view(); - CUDF_EXPECTS(col.num_children() == scalar_table.num_columns(), + auto const scalar_table = static_cast(&needle)->view(); + CUDF_EXPECTS(haystack.num_children() == scalar_table.num_columns(), "struct scalar and structs column must have the same number of children"); - for (size_type i = 0; i < col.num_children(); ++i) { - CUDF_EXPECTS(col.child(i).type() == scalar_table.column(i).type(), + for (size_type i = 0; i < haystack.num_children(); ++i) { + CUDF_EXPECTS(haystack.child(i).type() == scalar_table.column(i).type(), "scalar and column children types must match"); } // Prepare to flatten the structs column and scalar. - auto const has_null_elements = - has_nested_nulls(table_view{std::vector{col.child_begin(), col.child_end()}}) || - has_nested_nulls(scalar_table); + auto const has_null_elements = has_nested_nulls(table_view{std::vector{ + haystack.child_begin(), haystack.child_end()}}) || + has_nested_nulls(scalar_table); auto const flatten_nullability = has_null_elements ? structs::detail::column_nullability::FORCE : structs::detail::column_nullability::MATCH_INCOMING; // Flatten the input structs column, only materialize the bitmask if there is null in the input. 
- auto const col_flattened = - structs::detail::flatten_nested_columns(table_view{{col}}, {}, {}, flatten_nullability); - auto const val_flattened = + auto const haystack_flattened = + structs::detail::flatten_nested_columns(table_view{{haystack}}, {}, {}, flatten_nullability); + auto const needle_flattened = structs::detail::flatten_nested_columns(scalar_table, {}, {}, flatten_nullability); // The struct scalar only contains the struct member columns. // Thus, if there is any null in the input, we must exclude the first column in the flattened // table of the input column from searching because that column is the materialized bitmask of // the input structs column. - auto const col_flattened_content = col_flattened.flattened_columns(); - auto const col_flattened_children = table_view{std::vector{ - col_flattened_content.begin() + static_cast(has_null_elements), - col_flattened_content.end()}}; + auto const haystack_flattened_content = haystack_flattened.flattened_columns(); + auto const haystack_flattened_children = table_view{std::vector{ + haystack_flattened_content.begin() + static_cast(has_null_elements), + haystack_flattened_content.end()}}; - auto const d_col_children_ptr = table_device_view::create(col_flattened_children, stream); - auto const d_val_ptr = table_device_view::create(val_flattened, stream); + auto const d_haystack_children_ptr = + table_device_view::create(haystack_flattened_children, stream); + auto const d_needle_ptr = table_device_view::create(needle_flattened, stream); auto const start_iter = thrust::make_counting_iterator(0); - auto const end_iter = start_iter + col.size(); - auto const comp = row_equality_comparator( - nullate::DYNAMIC{has_null_elements}, *d_col_children_ptr, *d_val_ptr, null_equality::EQUAL); + auto const end_iter = start_iter + haystack.size(); + auto const comp = row_equality_comparator(nullate::DYNAMIC{has_null_elements}, + *d_haystack_children_ptr, + *d_needle_ptr, + null_equality::EQUAL); auto const found_iter = 
thrust::find_if( rmm::exec_policy(stream), start_iter, end_iter, [comp] __device__(auto const idx) { - return comp(idx, 0); // compare col[idx] == val[0]. + return comp(idx, 0); // compare haystack[idx] == val[0]. }); return found_iter != end_iter; } template <> -bool contains_scalar_dispatch::operator()(column_view const& col, - scalar const& value, +bool contains_scalar_dispatch::operator()(column_view const& haystack, + scalar const& needle, rmm::cuda_stream_view stream) { - auto dict_col = cudf::dictionary_column_view(col); - // first, find the value in the dictionary's key set - auto index = cudf::dictionary::detail::get_index(dict_col, value, stream); + auto dict_col = cudf::dictionary_column_view(haystack); + // first, find the needle in the dictionary's key set + auto index = cudf::dictionary::detail::get_index(dict_col, needle, stream); // if found, check the index is actually in the indices column return index->is_valid(stream) ? cudf::type_dispatcher(dict_col.indices().type(), contains_scalar_dispatch{}, @@ -251,12 +236,13 @@ bool contains_scalar_dispatch::operator()(column_view const& } // namespace namespace detail { -bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) +bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) { - if (col.is_empty()) { return false; } - if (not value.is_valid(stream)) { return col.has_nulls(); } + if (haystack.is_empty()) { return false; } + if (not needle.is_valid(stream)) { return haystack.has_nulls(); } - return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream); + return cudf::type_dispatcher( + haystack.type(), contains_scalar_dispatch{}, haystack, needle, stream); } struct multi_contains_dispatch { @@ -267,44 +253,44 @@ struct multi_contains_dispatch { rmm::mr::device_memory_resource* mr) { std::unique_ptr result = make_numeric_column(data_type{type_to_id()}, - haystack.size(), - copy_bitmask(haystack), - 
haystack.null_count(), + needles.size(), + copy_bitmask(needles), + needles.null_count(), stream, mr); - if (haystack.is_empty()) { return result; } + if (needles.is_empty()) { return result; } mutable_column_view result_view = result.get()->mutable_view(); - if (needles.is_empty()) { + if (haystack.is_empty()) { thrust::fill( rmm::exec_policy(stream), result_view.begin(), result_view.end(), false); return result; } - auto hash_set = cudf::detail::unordered_multiset::create(needles, stream); + auto hash_set = cudf::detail::unordered_multiset::create(haystack, stream); auto device_hash_set = hash_set.to_device(); - auto d_haystack_ptr = column_device_view::create(haystack, stream); - auto d_haystack = *d_haystack_ptr; + auto d_needles_ptr = column_device_view::create(needles, stream); + auto d_needles = *d_needles_ptr; - if (haystack.has_nulls()) { + if (needles.has_nulls()) { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(haystack.size()), + thrust::make_counting_iterator(needles.size()), result_view.begin(), - [device_hash_set, d_haystack] __device__(size_t index) { - return d_haystack.is_null_nocheck(index) || - device_hash_set.contains(d_haystack.element(index)); + [device_hash_set, d_needles] __device__(size_t index) { + return d_needles.is_null_nocheck(index) || + device_hash_set.contains(d_needles.element(index)); }); } else { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(haystack.size()), + thrust::make_counting_iterator(needles.size()), result_view.begin(), - [device_hash_set, d_haystack] __device__(size_t index) { - return device_hash_set.contains(d_haystack.element(index)); + [device_hash_set, d_needles] __device__(size_t index) { + return device_hash_set.contains(d_needles.element(index)); }); } @@ -336,10 +322,10 @@ std::unique_ptr multi_contains_dispatch::operator()( dictionary_column_view const haystack(haystack_in); 
dictionary_column_view const needles(needles_in); // first combine keys so both dictionaries have the same set - auto haystack_matched = dictionary::detail::add_keys(haystack, needles.keys(), stream); - auto const haystack_view = dictionary_column_view(haystack_matched->view()); - auto needles_matched = dictionary::detail::set_keys(needles, haystack_view.keys(), stream); + auto needles_matched = dictionary::detail::add_keys(needles, haystack.keys(), stream); auto const needles_view = dictionary_column_view(needles_matched->view()); + auto haystack_matched = dictionary::detail::set_keys(haystack, needles_view.keys(), stream); + auto const haystack_view = dictionary_column_view(haystack_matched->view()); // now just use the indices for the contains column_view const haystack_indices = haystack_view.get_indices_annotated(); @@ -363,56 +349,56 @@ std::unique_ptr contains(column_view const& haystack, haystack.type(), multi_contains_dispatch{}, haystack, needles, stream, mr); } -std::unique_ptr lower_bound(table_view const& t, - table_view const& values, +std::unique_ptr lower_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, true, column_order, null_precedence, stream, mr); + return search_ordered(haystack, needles, true, column_order, null_precedence, stream, mr); } -std::unique_ptr upper_bound(table_view const& t, - table_view const& values, +std::unique_ptr upper_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, false, column_order, null_precedence, stream, mr); + return search_ordered(haystack, needles, false, column_order, null_precedence, stream, mr); } } // namespace detail // external APIs 
-std::unique_ptr lower_bound(table_view const& t, - table_view const& values, +std::unique_ptr lower_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::lower_bound( - t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); + haystack, needles, column_order, null_precedence, rmm::cuda_stream_default, mr); } -std::unique_ptr upper_bound(table_view const& t, - table_view const& values, +std::unique_ptr upper_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::upper_bound( - t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); + haystack, needles, column_order, null_precedence, rmm::cuda_stream_default, mr); } -bool contains(column_view const& col, scalar const& value) +bool contains(column_view const& haystack, scalar const& needle) { CUDF_FUNC_RANGE(); - return detail::contains(col, value, rmm::cuda_stream_default); + return detail::contains(haystack, needle, rmm::cuda_stream_default); } std::unique_ptr contains(column_view const& haystack, diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp index 6b1caa5ed6f..9eba259ee39 100644 --- a/cpp/tests/search/search_dictionary_test.cpp +++ b/cpp/tests/search/search_dictionary_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -88,7 +88,7 @@ TEST_F(DictionarySearchTest, contains_dictionary) EXPECT_FALSE(cudf::contains(column, string_scalar{"28"})); cudf::test::dictionary_column_wrapper needles({"00", "17", "23", "27"}); - fixed_width_column_wrapper expect{1, 1, 1, 1, 1, 1, 0}; + fixed_width_column_wrapper expect{1, 1, 1, 0}; auto result = cudf::contains(column, needles); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect); } @@ -101,7 +101,7 @@ TEST_F(DictionarySearchTest, contains_nullable_dictionary) EXPECT_FALSE(cudf::contains(column, numeric_scalar{28})); cudf::test::dictionary_column_wrapper needles({0, 17, 23, 27}); - fixed_width_column_wrapper expect({1, 0, 1, 1, 1, 1, 0}, {1, 0, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper expect{1, 1, 1, 0}; auto result = cudf::contains(column, needles); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect); } diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 0a2533cd5f3..169eaffa41a 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -1627,7 +1627,7 @@ TEST_F(SearchTest, multi_contains_some) fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; fixed_width_column_wrapper needles{17, 19, 45, 72}; - fixed_width_column_wrapper expect{0, 0, 1, 1, 0, 0, 0}; + fixed_width_column_wrapper expect{1, 1, 0, 0}; auto result = cudf::contains(haystack, needles); @@ -1641,7 +1641,7 @@ TEST_F(SearchTest, multi_contains_none) fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; fixed_width_column_wrapper needles{2, 3}; - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{0, 0}; auto result = cudf::contains(haystack, needles); @@ -1657,7 +1657,7 @@ TEST_F(SearchTest, multi_contains_some_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{0, 0, 1, 1, 0, 0, 0}; + fixed_width_column_wrapper expect{1, 1, 0, 0}; auto result = cudf::contains(haystack, 
needles); @@ -1673,7 +1673,7 @@ TEST_F(SearchTest, multi_contains_none_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{0, 0}; auto result = cudf::contains(haystack, needles); @@ -1688,7 +1688,7 @@ TEST_F(SearchTest, multi_contains_some_with_nulls) {1, 1, 0, 1, 1, 1, 1}}; fixed_width_column_wrapper needles{{17, 19, 23, 72}, {1, 0, 1, 1}}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 0, 1, 0}, {1, 0, 1, 1}}; auto result = cudf::contains(haystack, needles); @@ -1703,7 +1703,7 @@ TEST_F(SearchTest, multi_contains_none_with_nulls) {1, 1, 0, 1, 1, 1, 1}}; fixed_width_column_wrapper needles{{17, 19, 24, 72}, {1, 0, 1, 1}}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 0, 0, 0}, {1, 0, 1, 1}}; auto result = cudf::contains(haystack, needles); @@ -1715,7 +1715,7 @@ TEST_F(SearchTest, multi_contains_some_string_with_nulls) std::vector h_haystack_strings{"0", "1", nullptr, "19", "23", "29", "71"}; std::vector h_needles_strings{"17", "23", nullptr, "72"}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 1, 0, 0}, {1, 1, 0, 1}}; cudf::test::strings_column_wrapper haystack( h_haystack_strings.begin(), @@ -1739,7 +1739,7 @@ TEST_F(SearchTest, multi_contains_none_string_with_nulls) std::vector h_haystack_strings{"0", "1", nullptr, "19", "23", "29", "71"}; std::vector h_needles_strings{"2", nullptr}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 0}, {1, 0}}; cudf::test::strings_column_wrapper haystack( h_haystack_strings.begin(), @@ -1765,7 +1765,7 @@ TEST_F(SearchTest, multi_contains_empty_column) fixed_width_column_wrapper 
haystack{}; fixed_width_column_wrapper needles{2, 3}; - fixed_width_column_wrapper expect{}; + fixed_width_column_wrapper expect{0, 0}; auto result = cudf::contains(haystack, needles); @@ -1781,7 +1781,7 @@ TEST_F(SearchTest, multi_contains_empty_column_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{}; + fixed_width_column_wrapper expect{0, 0, 0, 0}; auto result = cudf::contains(haystack, needles); @@ -1795,7 +1795,7 @@ TEST_F(SearchTest, multi_contains_empty_input_set) fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; fixed_width_column_wrapper needles{}; - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{}; auto result = cudf::contains(haystack, needles); @@ -1811,7 +1811,7 @@ TEST_F(SearchTest, multi_contains_empty_input_set_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{}; auto result = cudf::contains(haystack, needles); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index e871da18966..9f07b130a83 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1769,22 +1769,23 @@ public boolean contains(Scalar needle) { } /** - * Returns a new ColumnVector of {@link DType#BOOL8} elements containing true if the corresponding - * entry in haystack is contained in needles and false if it is not. The caller will be responsible - * for the lifecycle of the new vector. + * Returns a new column of {@link DType#BOOL8} elements having the same size as this column, + * each row value is true if the corresponding entry in this column is contained in the + * given searchSpace column and false if it is not. 
+ * The caller will be responsible for the lifecycle of the new vector. * * example: * - * haystack = { 10, 20, 30, 40, 50 } - * needles = { 20, 40, 60, 80 } + * col = { 10, 20, 30, 40, 50 } + * searchSpace = { 20, 40, 60, 80 } * * result = { false, true, false, true, false } * - * @param needles + * @param searchSpace * @return A new ColumnVector of type {@link DType#BOOL8} */ - public final ColumnVector contains(ColumnView needles) { - return new ColumnVector(containsVector(getNativeView(), needles.getNativeView())); + public final ColumnVector contains(ColumnView searchSpace) { + return new ColumnVector(containsVector(getNativeView(), searchSpace.getNativeView())); } /** @@ -4080,7 +4081,7 @@ private static native long segmentedGather(long sourceColumnHandle, long gatherM private static native boolean containsScalar(long columnViewHaystack, long scalarHandle) throws CudfException; - private static native long containsVector(long columnViewHaystack, long columnViewNeedles) throws CudfException; + private static native long containsVector(long valuesHandle, long searchSpaceHandle) throws CudfException; private static native long transform(long viewHandle, String udf, boolean isPtx); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index e074180c312..b33769bdc1b 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1166,15 +1166,18 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ColumnView_containsScalar(JNIEnv } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsVector(JNIEnv *env, jobject j_object, - jlong j_haystack_handle, - jlong j_needle_handle) { - JNI_NULL_CHECK(env, j_haystack_handle, "haystack vector is null", false); - JNI_NULL_CHECK(env, j_needle_handle, "needle vector is null", false); + jlong j_values_handle, + jlong j_search_space_handle) { + JNI_NULL_CHECK(env, j_values_handle, "values vector is null", false); + 
JNI_NULL_CHECK(env, j_search_space_handle, "search_space vector is null", false); try { cudf::jni::auto_set_device(env); - cudf::column_view *haystack = reinterpret_cast(j_haystack_handle); - cudf::column_view *needle = reinterpret_cast(j_needle_handle); - return release_as_jlong(cudf::contains(*haystack, *needle)); + auto const search_space_ptr = + reinterpret_cast(j_search_space_handle); + auto const values_ptr = reinterpret_cast(j_values_handle); + + // The C++ API `cudf::contains` requires that the search space is the first parameter. + return release_as_jlong(cudf::contains(*search_space_ptr, *values_ptr)); } CATCH_STD(env, 0); } diff --git a/python/cudf/cudf/_lib/cpp/search.pxd b/python/cudf/cudf/_lib/cpp/search.pxd index 4df73881ea5..8baef0aa1b9 100644 --- a/python/cudf/cudf/_lib/cpp/search.pxd +++ b/python/cudf/cudf/_lib/cpp/search.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -12,15 +12,15 @@ from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/search.hpp" namespace "cudf" nogil: cdef unique_ptr[column] lower_bound( - table_view t, - table_view values, + table_view haystack, + table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, ) except + cdef unique_ptr[column] upper_bound( - table_view t, - table_view values, + table_view haystack, + table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, ) except + diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 70097f15372..09a4754f519 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5407,7 +5407,7 @@ def fillna( def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: found_indices = libcudf.search.contains( - self, 
column.as_column([value], dtype=self.dtype) + column.as_column([value], dtype=self.dtype), self ) found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(np.int32(1))