Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reimplement lists::drop_list_duplicates for keys-values lists columns #9345

Merged
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1831963
Rewrite API interface and doxygen
ttnghia Sep 29, 2021
eea4a6f
Update doxygen
ttnghia Sep 29, 2021
18dbf0e
Reuse existing `duplicate_keep_option` enum
ttnghia Sep 30, 2021
c81cdb2
WIP
ttnghia Oct 18, 2021
f4e111d
Merge branch 'branch-21.12' into drop_list_duplicates_keys_values
ttnghia Oct 19, 2021
2ecf118
Fix errors
ttnghia Oct 19, 2021
c5a1d4b
Implementation compiles
ttnghia Oct 19, 2021
8ec9518
Rewrite doxygen
ttnghia Oct 19, 2021
e830a53
Fix error
ttnghia Oct 19, 2021
d720adf
Update doxygen
ttnghia Oct 20, 2021
e058053
Fix all errors, tests passed
ttnghia Oct 20, 2021
a898560
Cleanup
ttnghia Oct 21, 2021
1177da3
Separate code into a header file
ttnghia Oct 21, 2021
6478db8
Implement duplicate_keep_option
ttnghia Oct 21, 2021
8e57842
Reorder parameters
ttnghia Oct 21, 2021
a0b5684
Fix all bugs and added unit tests
ttnghia Oct 21, 2021
db15ef8
Cleanup
ttnghia Oct 21, 2021
be5e2d4
Add comments
ttnghia Oct 22, 2021
063a2a9
Fix style
ttnghia Oct 22, 2021
c074297
Merge branch 'branch-21.12' into drop_list_duplicates_keys_values
ttnghia Oct 28, 2021
58ad58b
Rewrite doxygen
ttnghia Oct 28, 2021
caca70b
Cleanup
ttnghia Oct 28, 2021
a9dbb77
Merge branch 'branch-21.12' into drop_list_duplicates_keys_values
ttnghia Nov 2, 2021
2272094
Merge branch 'branch-21.12' into drop_list_duplicates_keys_values
ttnghia Nov 8, 2021
1e47de2
Remove staled header
ttnghia Nov 9, 2021
93d4eef
Rewrite doxygen
ttnghia Nov 9, 2021
58fff1c
Rewrite `drop_list_duplicates.cu`
ttnghia Nov 9, 2021
bae68f1
Merge branch 'branch-21.12' into drop_list_duplicates_keys_values
ttnghia Nov 9, 2021
fa866e7
Rewrite doxygen
ttnghia Nov 9, 2021
ba93300
Merge branch 'branch-21.12' into drop_list_duplicates_keys_values
ttnghia Nov 10, 2021
dadf273
Fix doxygen
ttnghia Nov 10, 2021
bdf9912
Rewrite doxygen
ttnghia Nov 10, 2021
06048ff
Rewrite doxygen
ttnghia Nov 10, 2021
2446711
Add detail interface for `normalize_nans_and_zeros` that accepts stre…
ttnghia Nov 10, 2021
7a65185
Address review comments
ttnghia Nov 10, 2021
07056ce
Fix comment typos
ttnghia Nov 10, 2021
c35d6cb
Construct `gather_map` as `device_span` instead of `column_view`
ttnghia Nov 10, 2021
6a96f74
Fix `device_span` ctor input
ttnghia Nov 10, 2021
5121878
Remove `has_value()` check and use the optional object as bool
ttnghia Nov 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cpp/include/cudf/detail/replace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,5 +96,15 @@ std::unique_ptr<column> find_and_replace_all(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::normalize_nans_and_zeros
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> normalize_nans_and_zeros(
column_view const& input,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace cudf
28 changes: 24 additions & 4 deletions cpp/include/cudf/lists/detail/drop_list_duplicates.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,45 @@
*/
#pragma once

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/lists/drop_list_duplicates.hpp>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
namespace lists {
namespace detail {
/**
* @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&,
* lists_column_view const&,
* duplicate_keep_option,
* null_equality,
* nan_equality,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> drop_list_duplicates(
lists_column_view const& keys,
lists_column_view const& values,
duplicate_keep_option keep_option,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::drop_list_duplicates
*
* @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&,
* null_equality,
* nan_equality,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> drop_list_duplicates(
lists_column_view const& lists_column,
lists_column_view const& input,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace lists
} // namespace cudf
94 changes: 73 additions & 21 deletions cpp/include/cudf/lists/drop_list_duplicates.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,35 +28,87 @@ namespace lists {
*/

/**
* @brief Create a new lists column by extracting unique entries from list elements in the given
* lists column.
*
* Given an input lists column, the list elements in the column are copied to an output lists
* column such that their duplicated entries are dropped out to keep only the unique ones. The
* order of those entries within each list are not guaranteed to be preserved as in the input. In
* the current implementation, entries in the output lists are sorted by ascending order (nulls
* last), but this is not guaranteed in future implementation.
*
* @throw cudf::logic_error if the child column of the input lists column contains nested type other
* than struct.
*
* @param lists_column The input lists column to extract lists with unique entries.
* @param nulls_equal Flag to specify whether null entries should be considered equal.
* @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only
* applicable for floating point data column).
* @brief Copy the elements from the lists in `keys` and associated `values` columns according to
* the unique elements in `keys`.
*
* For each list in `keys` and associated `values`, according to the parameter `keep_option`, copy
* the unique elements from the list in `keys` and their corresponding elements in `values` to new
* lists. The order of the output elements within each list is not guaranteed to be preserved as in
* the input.
*
* Behavior is undefined if `count_elements(keys)[i] != count_elements(values)[i]` for any `i` in
* `[0, keys.size())`.
*
* @throw cudf::logic_error If the child column of the input keys column contains nested type other
* than STRUCT.
* @throw cudf::logic_error If `keys.size() != values.size()`.
*
* @param keys The input keys lists column to check for uniqueness and copy unique elements.
* @param values The values lists column in which the elements are mapped to elements in the key
* column.
* @param nulls_equal Flag to specify whether null key elements should be considered as equal.
* @param nans_equal Flag to specify whether NaN key elements should be considered as equal
* (only applicable for floating point keys elements).
* @param keep_option Flag to specify which elements will be copied from the input to the output.
* @param mr Device resource used to allocate memory.
*
* @code{.pseudo}
* input = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} }
* output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} }
* keys = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} }
* values = { {"a", "b", "c", "d"}, {"e"}, NULL, {}, {"N0", "N1", "N2", "f", "g", "h", "i", "j"} }
*
* [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_FIRST)
* out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} }
* out_values = { {"a", "c", "d"}, {"e"}, NULL, {}, {"f", "g", "N0"} }
*
* [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_LAST)
* out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} }
* out_values = { {"b", "c", "d"}, {"e"}, NULL, {}, {"j", "i", "N2"} }
*
* Note that permuting the entries of each list in this output also produces another valid output.
* [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_NONE)
* out_keys = { {2, 3}, {4}, NULL, {}, {} }
* out_values = { {"c", "d"}, {"e"}, NULL, {}, {} }
* @endcode
*
* @return A pair of lists columns storing the results from extracting unique key elements and their
* corresponding values elements from the input.
*/
std::pair<std::unique_ptr<column>, std::unique_ptr<column>> drop_list_duplicates(
lists_column_view const& keys,
lists_column_view const& values,
duplicate_keep_option keep_option = duplicate_keep_option::KEEP_FIRST,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::UNEQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new lists column by copying elements from the input lists column ignoring
* duplicate list elements.
*
* Given a lists column, an output lists column is generated by copying elements from the input
* lists column in a way such that the duplicate elements in each list are ignored, producing only
* unique list elements.
*
* The order of the output elements is not guaranteed to be preserved as in the input.
*
* @throw cudf::logic_error If the child column of the input lists column contains nested type other
* than STRUCT.
*
* @param input The input lists column to check and copy unique elements.
* @param nulls_equal Flag to specify whether null elements should be considered as equal.
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* (only applicable for floating point elements).
* @param keep_option Flag to specify which elements will be copied from the input to the output.
* @param mr Device resource used to allocate memory.
*
* @code{.pseudo}
* input = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} }
* drop_list_duplicates(input) = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} }
* @endcode
*
* @return A lists column with list elements having unique entries.
* @return A lists column storing the results from extracting unique list elements from the input.
*/
std::unique_ptr<column> drop_list_duplicates(
lists_column_view const& lists_column,
lists_column_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::UNEQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
Expand Down
Loading