From e373a68841655332bb42983511680dd222f74018 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 12 Mar 2021 07:22:49 -0700 Subject: [PATCH] Implement drop_list_duplicates (#7528) Closes #7494 and partially addresses #7414. This is the new implementation for `drop_list_duplicates`, which removes duplicated entries from lists column. The result is a new lists column in which each list row contains only unique entries. By current implementation, the output lists will have entries sorted by ascending order (null(s) last). Example with null_equality=EQUAL: ``` input: { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } output: { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } ``` Example with null_equality=UNEQUAL: ``` input: { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } output: { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL, NULL, NULL} } ``` Authors: - Nghia Truong (@ttnghia) Approvers: - AJ Schmidt (@ajschmidt8) - @nvdbaranec - David (@davidwendt) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7528 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + .../cudf/lists/drop_list_duplicates.hpp | 63 ++++ cpp/include/doxygen_groups.h | 1 + cpp/src/lists/drop_list_duplicates.cu | 294 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + .../lists/drop_list_duplicates_tests.cpp | 187 +++++++++++ 7 files changed, 548 insertions(+) create mode 100644 cpp/include/cudf/lists/drop_list_duplicates.hpp create mode 100644 cpp/src/lists/drop_list_duplicates.cu create mode 100644 cpp/tests/lists/drop_list_duplicates_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 74f4a20c066..e709824721c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -134,6 +134,7 @@ test: - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp + - test -f 
$PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/contains.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2d8e260c0ca..2e0c12d683a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -258,6 +258,7 @@ add_library(cudf src/lists/copying/segmented_gather.cu src/lists/count_elements.cu src/lists/extract.cu + src/lists/drop_list_duplicates.cu src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/segmented_sort.cu diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp new file mode 100644 index 00000000000..0939bd7956a --- /dev/null +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_drop_duplicates + * @{ + * @file + */ + +/** + * @brief Create a new lists column by removing duplicated entries from each list element in the + * given lists column + * + * @throw cudf::logic_error if any row (list element) in the input column is a nested type. 
+ * * Given an input lists_column_view, the list elements in the column are copied to an output lists + * column such that their duplicated entries are dropped out to keep only the unique ones. The + * order of those entries within each list is not guaranteed to be preserved as in the input. In + * the current implementation, entries in the output lists are sorted in ascending order (nulls + * last), but this is not guaranteed in future implementations. + * + * @param lists_column The input lists_column_view + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param mr Device resource used to allocate memory + * + * @code{.pseudo} + * lists_column = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } + * + * Note that permuting the entries of each list in this output also produces another valid + * output. + * @endcode + * + * @return A list column with list elements having unique entries + */ +std::unique_ptr drop_list_duplicates( + lists_column_view const& lists_column, + null_equality nulls_equal = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index e732a13e67c..3f3efdb7626 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -146,6 +146,7 @@ * @defgroup lists_contains Searching * @defgroup lists_gather Gathering * @defgroup lists_elements Counting + * @defgroup lists_drop_duplicates Filtering * @} * @defgroup nvtext_apis NVText * @{ diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu new file mode 100644 index 00000000000..1eb105d296d --- /dev/null +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +namespace { +using offset_type = lists_column_view::offset_type; +/** + * @brief Copy list entries and entry list offsets ignoring duplicates + * + * Given an array of all entries flattened from a list column and an array that maps each entry to + * the offset of the list containing that entry, those entries and list offsets are copied into + * new arrays such that the duplicated entries within each list will be ignored. 
+ * * @param all_lists_entries The input array containing all list entries + * @param entries_list_offsets A map from list entries to their corresponding list offsets + * @param nulls_equal Flag to specify whether null entries should be considered equal + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A pair of columns, the first one contains unique list entries and the second one + * contains their corresponding list offsets + */ +template +std::vector> get_unique_entries_and_list_offsets( + column_view const& all_lists_entries, + column_view const& entries_list_offsets, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Create an intermediate table, since the comparator only works on tables + auto const device_input_table = + cudf::table_device_view::create(table_view{{all_lists_entries}}, stream); + auto const comp = row_equality_comparator( + *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); + + auto const num_entries = all_lists_entries.size(); + // Allocate memory to store the indices of the unique entries + auto const unique_indices = cudf::make_numeric_column( + entries_list_offsets.type(), num_entries, mask_state::UNALLOCATED, stream); + auto const unique_indices_begin = unique_indices->mutable_view().begin(); + + auto const copy_end = thrust::unique_copy( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + unique_indices_begin, + [list_offsets = entries_list_offsets.begin(), comp] __device__(auto i, auto j) { + return list_offsets[i] == list_offsets[j] && comp(i, j); + }); + + // Collect unique entries and entry list offsets + auto const indices = cudf::detail::slice( + unique_indices->view(), 0, thrust::distance(unique_indices_begin, copy_end)); + return 
cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, + indices, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); +} + +/** + * @brief Generate a 0-based offset column for a lists column + * + * Given a lists_column_view, which may have a non-zero offset, generate a new column containing + * 0-based list offsets. This is done by subtracting the first offset from each of the input list + * offsets. + * + * @code{.pseudo} + * Given a list column having offsets = { 3, 7, 9, 13 }, + * then output_offsets = { 0, 4, 6, 10 } + * @endcode + * + * @param lists_column The input lists column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device resource used to allocate memory + * + * @return A column containing 0-based list offsets + */ +std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto output_offsets = make_numeric_column(data_type{type_to_id()}, + lists_column.size() + 1, + mask_state::UNALLOCATED, + stream, + mr); + thrust::transform( + rmm::exec_policy(stream), + lists_column.offsets_begin(), + lists_column.offsets_end(), + output_offsets->mutable_view().begin(), + [first = lists_column.offsets_begin()] __device__(auto offset) { return offset - *first; }); + return output_offsets; +} + +/** + * @brief Populate list offsets for all list entries + * + * Given an `offsets` column_view containing offsets of a lists column and the number of list + * entries in the column, generate an array that maps from each list entry to the offset of the list + * containing that entry. 
+ * * @code{.pseudo} + * num_entries = 10, offsets = { 0, 4, 6, 10 } + * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } + * @endcode + * + * @param num_entries The number of list entries + * @param offsets Column view to the list offsets + * @param stream CUDA stream used for device memory operations and kernel launches + * @note The output column is allocated from the current device resource (no `mr` parameter) + * + * @return A column containing entry list offsets + */ +std::unique_ptr generate_entry_list_offsets(size_type num_entries, + column_view const& offsets, + rmm::cuda_stream_view stream) +{ + auto entry_list_offsets = make_numeric_column(offsets.type(), + num_entries, + mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + thrust::upper_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + entry_list_offsets->mutable_view().begin()); + return entry_list_offsets; +} + +/** + * @brief Generate list offsets from entry offsets + * + * Generate an array of list offsets for the final result lists column. The list + * offsets of the original lists column are also taken into account to make sure the result lists + * column will have the same empty list rows (if any) as in the original lists column. 
+ * * @param[in] num_entries The number of unique entries after removing duplicates + * @param[in] entries_list_offsets The mapping from list entries to their list offsets + * @param[in,out] original_offsets The list offsets of the original lists column, which + * will also be used to store the new list offsets + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @note Temporary memory is allocated from the current device resource (no `mr` parameter) + */ +void generate_offsets(size_type num_entries, + column_view const& entries_list_offsets, + mutable_column_view const& original_offsets, + rmm::cuda_stream_view stream) +{ + // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any) + // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10, + // then new_offsets = { 0, 4, 5, 8, 10 } + auto const new_offsets = allocate_like( + original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries + 1), + new_offsets->mutable_view().begin(), + [num_entries, offsets_ptr = entries_list_offsets.begin()] __device__( + auto i) -> bool { + return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1]; + }); + + // Generate a prefix sum of the number of empty lists, storing it in place in the original list + // offsets. NOTE(review): the scan input also reads original_offsets - confirm in-place is safe + // If the original list offsets are { 0, 0, 5, 5, 6, 6 } (there are 3 empty lists), + // and new_offsets = { 0, 4, 5 }, + // then output = { 0, 1, 1, 2, 2, 3} + auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( + 0, [offsets = original_offsets.begin()] __device__(auto i) { + return (i > 0 && offsets[i] == offsets[i - 1]) ? 
1 : 0; + }); + thrust::inclusive_scan(rmm::exec_policy(stream), + iter_trans_begin, + iter_trans_begin + original_offsets.size(), + original_offsets.begin()); + + // Generate the final list offsets + // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 5 }, + // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, + // then output = { 0, 0, 4, 4, 5, 5 } + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(original_offsets.size()), + original_offsets.begin(), + [prefix_sum_empty_lists = original_offsets.begin(), + offsets = new_offsets->view().begin()] __device__(auto i) { + return offsets[i - prefix_sum_empty_lists[i]]; + }); +} +/** + * @copydoc cudf::lists::drop_list_duplicates + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); + if (cudf::is_nested(lists_column.child().type())) { + CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); + } + + // Call segmented sort on the list elements and store them in a temporary column sorted_lists + auto const sorted_lists = + detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + + // Flatten all entries (depth = 1) of the lists column + auto const all_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); + + // Generate a 0-based offset column + auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); + + // Generate a mapping from list entries to offsets of the lists containing those entries + auto const entries_list_offsets = + detail::generate_entry_list_offsets(all_lists_entries.size(), lists_offsets->view(), stream); + + // Copy 
non-duplicated entries (along with their list offsets) to new arrays + auto unique_entries_and_list_offsets = + all_lists_entries.has_nulls() + ? detail::get_unique_entries_and_list_offsets( + all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr) + : detail::get_unique_entries_and_list_offsets( + all_lists_entries, entries_list_offsets->view(), nulls_equal, stream, mr); + + // Generate offsets for the new lists column + detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), + unique_entries_and_list_offsets.back()->view(), + lists_offsets->mutable_view(), + stream); + + // Construct a new lists column without duplicated entries + return make_lists_column(lists_column.size(), + std::move(lists_offsets), + std::move(unique_entries_and_list_offsets.front()), + lists_column.null_count(), + cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); +} + +} // anonymous namespace +} // namespace detail + +/** + * @copydoc cudf::lists::drop_list_duplicates + */ +std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::drop_list_duplicates(lists_column, nulls_equal, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 492767c5d2f..40829c74957 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -422,6 +422,7 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ConfigureTest(LISTS_TEST lists/contains_tests.cpp lists/count_elements_tests.cpp + lists/drop_list_duplicates_tests.cpp lists/extract_tests.cpp lists/sort_lists_tests.cpp) diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp new file mode 100644 index 00000000000..0948ba96f62 --- /dev/null +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) 
2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +using float_type = float; +using int_type = int32_t; +using INT_LCW = cudf::test::lists_column_wrapper; +using FLT_LCW = cudf::test::lists_column_wrapper; +using STR_LCW = cudf::test::lists_column_wrapper; + +template +void test_once(cudf::column_view const& input, + LCW const& expected, + cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) +{ + auto const results = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); + if (equal_test) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, true); + } else { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected, true); + } +} + +struct DropListDuplicatesTest : public cudf::test::BaseFixture { +}; + +TEST_F(DropListDuplicatesTest, InvalidCasesTests) +{ + // Lists of nested types are not supported + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{INT_LCW{INT_LCW{{1, 2}, {3}}}}), + cudf::logic_error); + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{FLT_LCW{FLT_LCW{{1, 2}, {3}}}}), + cudf::logic_error); + EXPECT_THROW( + cudf::lists::drop_list_duplicates(cudf::lists_column_view{STR_LCW{STR_LCW{STR_LCW{"string"}}}}), + cudf::logic_error); +} + +TEST_F(DropListDuplicatesTest, FloatingPointTestsNonNull) +{ + // Trivial cases + test_once(FLT_LCW{{}}, FLT_LCW{{}}); + test_once(FLT_LCW{{0, 1, 2, 
3, 4, 5}, {}}, FLT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + + // Multiple empty lists + test_once(FLT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + FLT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + + auto constexpr p_inf = std::numeric_limits::infinity(); + auto constexpr m_inf = -std::numeric_limits::infinity(); + + // Lists contain inf + // We can't test for lists containing nan because the order of nan is + // undefined after sorting + test_once(FLT_LCW{0, 1, 2, 0, 1, 2, 0, 1, 2, p_inf, p_inf, p_inf}, + FLT_LCW{0, 1, 2, p_inf}); + test_once(FLT_LCW{p_inf, 0, m_inf, 0, p_inf, 0, m_inf, 0, p_inf, 0, m_inf}, + FLT_LCW{m_inf, 0, p_inf}); +} + +TEST_F(DropListDuplicatesTest, IntegerTestsNonNull) +{ + // Trivial cases + test_once(INT_LCW{{}}, INT_LCW{{}}); + test_once(INT_LCW{{0, 1, 2, 3, 4, 5}, {}}, INT_LCW{{0, 1, 2, 3, 4, 5}, {}}); + + // Multiple empty lists + test_once(INT_LCW{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, + INT_LCW{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + + // Adjacent lists containing the same entries + test_once( + INT_LCW{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, + INT_LCW{{1}, {1, 2}, {2, 3}}); + + // Sliced list column + auto const list0 = INT_LCW{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const list1 = cudf::slice(list0, {0, 5})[0]; + auto const list2 = cudf::slice(list0, {1, 5})[0]; + auto const list3 = cudf::slice(list0, {1, 3})[0]; + auto const list4 = cudf::slice(list0, {0, 3})[0]; + + test_once(list0, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list1, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list2, INT_LCW{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); + test_once(list3, INT_LCW{{1, 2, 3, 4}, {5}}); + test_once(list4, INT_LCW{{1, 2, 3}, {1, 2, 3, 4}, {5}}); +} + +TEST_F(DropListDuplicatesTest, IntegerTestsWithNulls) +{ + auto constexpr null = std::numeric_limits::max(); + + // null lists + test_once(INT_LCW{{{3, 2, 
1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}, + INT_LCW{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 2 && i != 3; })}); + + // null entries are equal + test_once( + INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + INT_LCW{{1, 3, 5, 7, 9, null}, + std::initializer_list{true, true, true, true, true, false}}); + + // nulls entries are not equal + test_once( + INT_LCW{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, + INT_LCW{ + {1, 3, 5, 7, 9, null, null, null, null, null}, + std::initializer_list{true, true, true, true, true, false, false, false, false, false}}, + cudf::null_equality::UNEQUAL); +} + +TEST_F(DropListDuplicatesTest, StringTestsNonNull) +{ + // Trivial cases + test_once(STR_LCW{{}}, STR_LCW{{}}); + test_once(STR_LCW{"this", "is", "a", "string"}, STR_LCW{"a", "is", "string", "this"}); + + // One list column + test_once(STR_LCW{"this", "is", "is", "is", "a", "string", "string"}, + STR_LCW{"a", "is", "string", "this"}); + + // Multiple lists column + test_once( + STR_LCW{STR_LCW{"this", "is", "a", "no duplicate", "string"}, + STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}, + STR_LCW{"this", "is", "is", "is", "a", "two duplicates", "string"}, + STR_LCW{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, + STR_LCW{STR_LCW{"a", "is", "no duplicate", "string", "this"}, + STR_LCW{"a", "is", "one duplicate", "string", "this"}, + STR_LCW{"a", "is", "string", "this", "two duplicates"}, + STR_LCW{"a", "is", "string", "this", "three duplicates"}}); +} + +TEST_F(DropListDuplicatesTest, StringTestsWithNulls) +{ + auto const null = std::string(""); + + // One list column with null entries + test_once( + STR_LCW{{"this", 
null, "is", "is", "is", "a", null, "string", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, + STR_LCW{{"a", "is", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + + // Multiple lists column with null lists and null entries + test_once( + STR_LCW{{STR_LCW{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0; })}, + STR_LCW{}, + STR_LCW{"this", "is", "is", "a", "one duplicate", "string"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, + STR_LCW{ + {STR_LCW{{"a", "is", "no duplicate", "string", "this", null}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i <= 4; })}, + STR_LCW{}, + STR_LCW{"a", "is", "one duplicate", "string", "this"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); +}