Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor cudf::contains, renaming and switching parameters role #10802

Merged
merged 22 commits into from
May 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions cpp/include/cudf/detail/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,11 +33,11 @@ namespace detail {
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> lower_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand All @@ -46,33 +46,29 @@ std::unique_ptr<column> lower_bound(
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> upper_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::contains(column_view const&, scalar const&,
* rmm::mr::device_memory_resource*)
* @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
bool contains(column_view const& col,
scalar const& value,
rmm::cuda_stream_view stream = rmm::cuda_stream_default);
bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream);

/**
* @copydoc cudf::contains(column_view const&, column_view const&,
* rmm::mr::device_memory_resource*)
* @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> contains(
column_view const& haystack,
column_view const& needles,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
Expand Down
141 changes: 65 additions & 76 deletions cpp/include/cudf/search.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,134 +32,123 @@ namespace cudf {
*/

/**
* @brief Find smallest indices in a sorted table where values should be
* inserted to maintain order
* @brief Find smallest indices in a sorted table where values should be inserted to maintain order.
*
* For each row v in @p values, find the first index in @p t where
* inserting the row will maintain the sort order of @p t
* For each row in `needles`, find the first index in `haystack` where inserting the row still
* maintains its sort order.
*
* @code{.pseudo}
* Example:
*
* Single column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 1 }
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needles = { 20 }
* result = { 1 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
* idx 0 1 2 3 4
* haystack = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* needles = {{ 20 },
* { .7 },
* { 61 }}
* result = { 3 }
* @endcode
*
* @param t Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order
* @param null_precedence Vector of null_precedence enums values
* @param mr Device memory resource used to allocate the returned column's device
* memory
* @param haystack The table containing search space.
* @param needles Values for which to find the insert locations in the search space.
* @param column_order Vector of column sort order.
* @param null_precedence Vector of null_precedence enums needles.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A non-nullable column of cudf::size_type elements containing the insertion points.
*/
std::unique_ptr<column> lower_bound(
table_view const& t,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Find largest indices in a sorted table where values should be
* inserted to maintain order
* @brief Find largest indices in a sorted table where values should be inserted to maintain order.
*
* For each row v in @p values, find the last index in @p t where
* inserting the row will maintain the sort order of @p t
* For each row in `needles`, find the last index in `haystack` where inserting the row still
* maintains its sort order.
*
* @code{.pseudo}
* Example:
*
* Single Column:
* idx 0 1 2 3 4
* column = { 10, 20, 20, 30, 50 }
* values = { 20 }
* result = { 3 }
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needles = { 20 }
* result = { 3 }
*
* Multi Column:
* idx 0 1 2 3 4
* t = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* values = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
* idx 0 1 2 3 4
* haystack = {{ 10, 20, 20, 20, 20 },
* { 5.0, .5, .5, .7, .7 },
* { 90, 77, 78, 61, 61 }}
* needles = {{ 20 },
* { .7 },
* { 61 }}
* result = { 5 }
* @endcode
*
* @param search_table Table to search
* @param values Find insert locations for these values
* @param column_order Vector of column sort order
* @param null_precedence Vector of null_precedence enums values
* @param mr Device memory resource used to allocate the returned column's device
* memory
* @param haystack The table containing search space.
* @param needles Values for which to find the insert locations in the search space.
* @param column_order Vector of column sort order.
* @param null_precedence Vector of null_precedence enums needles.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A non-nullable column of cudf::size_type elements containing the insertion points.
*/
std::unique_ptr<column> upper_bound(
table_view const& search_table,
table_view const& values,
table_view const& haystack,
table_view const& needles,
std::vector<order> const& column_order,
std::vector<null_order> const& null_precedence,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Find if the `value` is present in the `col`
* @brief Check if the given `needle` value exists in the `haystack` column.
*
* @throws cudf::logic_error
* If `col.type() != values.type()`
* @throws cudf::logic_error If `haystack.type() != needle.type()`.
*
* @code{.pseudo}
* Single Column:
* idx 0 1 2 3 4
* col = { 10, 20, 20, 30, 50 }
* Scalar:
* value = { 20 }
* result = true
* idx 0 1 2 3 4
* haystack = { 10, 20, 20, 30, 50 }
* needle = { 20 }
* result = true
* @endcode
*
* @param col A column object
* @param value A scalar value to search for in `col`
*
* @return bool If `value` is found in `column` true, else false.
* @param haystack The column containing search space.
* @param needle A scalar value to check for existence in the search space.
* @return true if the given `needle` value exists in the `haystack` column.
*/
bool contains(column_view const& col, scalar const& value);
bool contains(column_view const& haystack, scalar const& needle);

/**
* @brief Returns a new column of type bool identifying for each element of @p haystack column,
* if that element is contained in @p needles column.
* @brief Check if the given `needles` values exists in the `haystack` column.
*
* The new column will have the same dimension and null status as the @p haystack column. That is,
* any element that is invalid in the @p haystack column will be invalid in the returned column.
* The new column will have type BOOL and have the same size and null mask as the input `needles`
* column. That is, any null row in the `needles` column will result in a nul row in the output
* column.
*
* @throws cudf::logic_error
* If `haystack.type() != needles.type()`
* @throws cudf::logic_error If `haystack.type() != needles.type()`
*
* @code{.pseudo}
* haystack = { 10, 20, 30, 40, 50 }
* needles = { 20, 40, 60, 80 }
*
* result = { false, true, false, true, false }
* result = { true, true, false, false }
* @endcode
*
* @param haystack A column object
* @param needles A column of values to search for in `col`
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @return A column of bool elements containing true if the corresponding entry in haystack
* appears in needles and false if it does not.
* @param haystack The column containing search space.
* @param needles A column of values to check for existence in the search space.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A BOOL column indicating if each element in `needles` exists in the search space.
*/
std::unique_ptr<column> contains(
column_view const& haystack,
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/dictionary/remove_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ std::unique_ptr<column> remove_keys(
CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match");

// locate keys to remove by searching the keys column
auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr);
auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr);
auto d_matches = matches->view().data<bool>();
// call common utility method to keep the keys not matched to keys_to_remove
auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; };
Expand All @@ -181,7 +181,7 @@ std::unique_ptr<column> remove_unused_keys(
thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end());
// wrap the indices for comparison in contains()
column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data());
return cudf::detail::contains(keys_positions_view, indices_view, stream, mr);
return cudf::detail::contains(indices_view, keys_positions_view, stream, mr);
}();
auto d_matches = matches->view().data<bool>();

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/dictionary/set_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ std::unique_ptr<column> set_keys(
std::unique_ptr<column> keys_column(std::move(sorted_keys.front()));

// compute the new nulls
auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr);
auto matches = cudf::detail::contains(keys_column->view(), keys, stream, mr);
auto d_matches = matches->view().data<bool>();
auto indices_itr =
cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices());
Expand Down
Loading