Skip to content

Commit

Permalink
Refactor cudf::contains, renaming and switching parameters role (#1…
Browse files Browse the repository at this point in the history
…0802)

This PR does the following:
 * Renaming parameters of `cudf::contains`, changing from `t`/`values`, `col`/`value`, etc... into `haystack`/`needle` in a consistent way.
 * Switching the role of `haystack` and `needles` parameters of the overload `cudf::contains(column_view, column_view)`, which incorrectly searches for `haystack` inside `needles`.
 * Update the internal usage of that overloads in cudf.
 * Update unit tests.
 * Rewriting all `cudf::contains` doxygen.
 * And some minor code cleanup/refactor.

Since the role of parameters is switched, this causes breaking changes.

Closes #10725. In addition, this is also a foundation for more changes in `search.cu` to support nested types in #10656.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Jason Lowe (https://github.com/jlowe)
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #10802
  • Loading branch information
ttnghia authored May 10, 2022
1 parent 0fcd364 commit 4539e5e
Show file tree
Hide file tree
Showing 11 changed files with 227 additions and 252 deletions.
26 changes: 11 additions & 15 deletions cpp/include/cudf/detail/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,11 +33,11 @@ namespace detail {
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> lower_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -46,33 +46,29 @@ std::unique_ptr<column> lower_bound(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> upper_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::contains(column_view const&, scalar const&,
* rmm::mr::device_memory_resource*)
* @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
bool contains(column_view const& col,
scalar const& value,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream);

/**
* @copydoc cudf::contains(column_view const&, column_view const&,
* rmm::mr::device_memory_resource*)
* @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> contains(
column_view const& haystack,
column_view const& needles,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
Expand Down
141 changes: 65 additions & 76 deletions cpp/include/cudf/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,134 +32,123 @@ namespace cudf {
*/

/**
* @brief Find smallest indices in a sorted table where values should be
* inserted to maintain order
* @brief Find smallest indices in a sorted table where values should be inserted to maintain order.
*
* For each row v in @p values, find the first index in @p t where
* inserting the row will maintain the sort order of @p t
* For each row in `needles`, find the first index in `haystack` where inserting the row still
* maintains its sort order.
*
* @code{.pseudo}
* Example:
*
* Single column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 1 }
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needles = { 20 }
* result = { 1 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
* idx 0 1 2 3 4
* haystack = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* needles = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
* @endcode
*
* @param t Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order
* @param null_precedence Vector of null_precedence enums values
* @param mr Device memory resource used to allocate the returned column's device
* memory
* @param haystack The table containing search space.
* @param needles Values for which to find the insert locations in the search space.
* @param column_order Vector of column sort order.
* @param null_precedence Vector of null_precedence enums needles.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A non-nullable column of cudf::size_type elements containing the insertion points.
*/
std::unique_ptr<column> lower_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Find largest indices in a sorted table where values should be
* inserted to maintain order
* @brief Find largest indices in a sorted table where values should be inserted to maintain order.
*
* For each row v in @p values, find the last index in @p t where
* inserting the row will maintain the sort order of @p t
* For each row in `needles`, find the last index in `haystack` where inserting the row still
* maintains its sort order.
*
* @code{.pseudo}
* Example:
*
* Single Column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 3 }
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needles = { 20 }
* result = { 3 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
* idx 0 1 2 3 4
* haystack = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* needles = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
* @endcode
*
* @param search_table Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order
* @param null_precedence Vector of null_precedence enums values
* @param mr Device memory resource used to allocate the returned column's device
* memory
* @param haystack The table containing search space.
* @param needles Values for which to find the insert locations in the search space.
* @param column_order Vector of column sort order.
* @param null_precedence Vector of null_precedence enums needles.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A non-nullable column of cudf::size_type elements containing the insertion points.
*/
std::unique_ptr<column> upper_bound(
table_view const& search_table,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Find if the `value` is present in the `col`
* @brief Check if the given `needle` value exists in the `haystack` column.
*
* @throws cudf::logic_error
* If `col.type() != values.type()`
* @throws cudf::logic_error If `haystack.type() != needle.type()`.
*
* @code{.pseudo}
* Single Column:
* idx 0 1 2 3 4
* col = { 10, 20, 20, 30, 50 }
* Scalar:
* value = { 20 }
* result = true
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needle = { 20 }
* result = true
* @endcode
*
* @param col A column object
* @param value A scalar value to search for in `col`
*
* @return bool If `value` is found in `column` true, else false.
* @param haystack The column containing search space.
* @param needle A scalar value to check for existence in the search space.
* @return true if the given `needle` value exists in the `haystack` column.
*/
bool contains(column_view const& col, scalar const& value);
bool contains(column_view const& haystack, scalar const& needle);

/**
* @brief Returns a new column of type bool identifying for each element of @p haystack column,
* if that element is contained in @p needles column.
* @brief Check if the given `needles` values exists in the `haystack` column.
*
* The new column will have the same dimension and null status as the @p haystack column. That is,
* any element that is invalid in the @p haystack column will be invalid in the returned column.
* The new column will have type BOOL and have the same size and null mask as the input `needles`
* column. That is, any null row in the `needles` column will result in a nul row in the output
* column.
*
* @throws cudf::logic_error
* If `haystack.type() != needles.type()`
* @throws cudf::logic_error If `haystack.type() != needles.type()`
*
* @code{.pseudo}
* haystack = { 10, 20, 30, 40, 50 }
* needles = { 20, 40, 60, 80 }
*
* result = { false, true, false, true, false }
* result = { true, true, false, false }
* @endcode
*
* @param haystack A column object
* @param needles A column of values to search for in `col`
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @return A column of bool elements containing true if the corresponding entry in haystack
* appears in needles and false if it does not.
* @param haystack The column containing search space.
* @param needles A column of values to check for existence in the search space.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A BOOL column indicating if each element in `needles` exists in the search space.
*/
std::unique_ptr<column> contains(
column_view const& haystack,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/remove_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ std::unique_ptr<column> remove_keys(
CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match");

// locate keys to remove by searching the keys column
auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr);
auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr);
auto d_matches = matches->view().data<bool>();
// call common utility method to keep the keys not matched to keys_to_remove
auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; };
Expand All @@ -181,7 +181,7 @@ std::unique_ptr<column> remove_unused_keys(
thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end());
// wrap the indices for comparison in contains()
column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data());
return cudf::detail::contains(keys_positions_view, indices_view, stream, mr);
return cudf::detail::contains(indices_view, keys_positions_view, stream, mr);
}();
auto d_matches = matches->view().data<bool>();

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/set_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ std::unique_ptr<column> set_keys(
std::unique_ptr<column> keys_column(std::move(sorted_keys.front()));

// compute the new nulls
auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr);
auto matches = cudf::detail::contains(keys_column->view(), keys, stream, mr);
auto d_matches = matches->view().data<bool>();
auto indices_itr =
cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices());
Expand Down
Loading

0 comments on commit 4539e5e

Please sign in to comment.