From 2f33e049408b6dd89076f37aac61ebdee210d70a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 12 Jan 2022 12:24:27 -0500 Subject: [PATCH 01/50] Rename existing compaction APIs --- .../drop_duplicates_benchmark.cpp | 2 +- cpp/include/cudf/detail/stream_compaction.hpp | 22 ++-- cpp/include/cudf/stream_compaction.hpp | 16 +-- cpp/src/dictionary/add_keys.cu | 19 ++-- cpp/src/dictionary/detail/concatenate.cu | 14 +-- cpp/src/dictionary/set_keys.cu | 16 +-- cpp/src/reductions/reductions.cpp | 2 +- cpp/src/stream_compaction/distinct_count.cu | 28 ++--- cpp/src/stream_compaction/drop_duplicates.cu | 28 ++--- cpp/src/transform/encode.cu | 14 +-- .../drop_duplicates_tests.cpp | 100 ++++++++++-------- java/src/main/native/src/TableJni.cpp | 2 +- 12 files changed, 137 insertions(+), 126 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 8039d7d065f..3ae5f723a56 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -45,7 +45,7 @@ void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::drop_duplicates(input_table, {0}, keep); + auto result = cudf::unordered_drop_duplicates(input_table, {0}, keep); } } diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 87823d71c6f..c7a3457e324 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -62,11 +62,11 @@ std::unique_ptr apply_boolean_mask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::drop_duplicates + * @copydoc cudf::unordered_drop_duplicates * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
drop_duplicates( +std::unique_ptr
unordered_drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, @@ -76,23 +76,23 @@ std::unique_ptr
drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) + * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @copydoc cudf::distinct_count(table_view const&, null_equality) + * @copydoc cudf::unordered_distinct_count(table_view const&, null_equality) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 7551511d281..59d031db02a 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -214,7 +214,7 @@ enum class duplicate_keep_option { }; /** - * @brief Create a new table without duplicate rows + * @brief Create a new table without duplicate rows. 
* * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: @@ -235,7 +235,7 @@ enum class duplicate_keep_option { * * @return Table with unique rows as per specified `keep`. */ -std::unique_ptr
drop_duplicates( +std::unique_ptr
unordered_drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, @@ -244,7 +244,7 @@ std::unique_ptr
drop_duplicates( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Count the unique elements in the column_view + * @brief Count the unique elements in the column_view. * * Given an input column_view, number of unique elements in this column_view is returned * @@ -259,9 +259,9 @@ std::unique_ptr
drop_duplicates( * * @return number of unique elements */ -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling); +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); /** * @brief Count the unique rows in a table. @@ -273,8 +273,8 @@ cudf::size_type distinct_count(column_view const& input, * * @return number of unique rows in the table */ -cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL); +cudf::size_type unordered_distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); /** @} */ } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index e3d1ea88ece..65739dac9f5 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -58,15 +58,16 @@ std::unique_ptr add_keys( auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); // sort and remove any duplicates from the combined keys - // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, - std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); + // unordered_drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] + auto table_keys = + cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, + std::vector{0}, // only one key column + duplicate_keep_option::KEEP_FIRST, + null_equality::EQUAL, + null_order::BEFORE, + stream, + mr) + ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 
fd86d8ec7d4..54ff8768455 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -216,13 +216,13 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) + auto table_keys = cudf::detail::unordered_drop_duplicates(table_view{{all_keys->view()}}, + std::vector{0}, + duplicate_keep_option::KEEP_FIRST, + null_equality::EQUAL, + null_order::BEFORE, + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 72f6e034479..76058db4ec5 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -120,14 +120,14 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) + // copy the keys -- use unordered_drop_duplicates to make sure they are sorted and unique + auto table_keys = cudf::detail::unordered_drop_duplicates(table_view{{new_keys}}, + std::vector{0}, + duplicate_keep_option::KEEP_FIRST, + null_equality::EQUAL, + null_order::BEFORE, + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 6f9149a47e2..73c2eb9a716 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -93,7 +93,7 @@ struct 
reduce_dispatch_functor { case aggregation::NUNIQUE: { auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( - detail::distinct_count( + detail::unordered_distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 5c695f8a16f..fc0f04ee8c8 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -35,9 +35,9 @@ namespace cudf { namespace detail { -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) +cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) { // sort only indices auto sorted_indices = sorted_order(keys, @@ -137,10 +137,10 @@ struct has_nans { } }; -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream) +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) { if (0 == input.size() || input.null_count() == input.size()) { return 0; } @@ -156,7 +156,7 @@ cudf::size_type distinct_count(column_view const& input, has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } - auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; @@ -169,18 +169,18 @@ cudf::size_type distinct_count(column_view const& input, } // namespace detail -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy 
nan_handling) +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling, nan_handling); + return detail::unordered_distinct_count(input, null_handling, nan_handling); } -cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) +cudf::size_type unordered_distinct_count(table_view const& input, null_equality nulls_equal) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, nulls_equal); + return detail::unordered_distinct_count(input, nulls_equal); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index abc34663aee..ba5b0d127f1 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -98,13 +98,13 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, } } // namespace -std::unique_ptr
drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { return empty_like(input); @@ -132,15 +132,15 @@ std::unique_ptr
drop_duplicates(table_view const& input, } // namespace detail -std::unique_ptr
drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option const keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
unordered_drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option const keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_duplicates( + return detail::unordered_drop_duplicates( input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index dadeaf7d1e0..65233b4f2e3 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -44,13 +44,13 @@ std::pair, std::unique_ptr> encode( // side effects of this function we are now dependent on: // - resulting column elements are sorted ascending // - nulls are sorted to the beginning - auto keys_table = cudf::detail::drop_duplicates(input_table, - drop_keys, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::AFTER, - stream, - mr); + auto keys_table = cudf::detail::unordered_drop_duplicates(input_table, + drop_keys, + duplicate_keep_option::KEEP_FIRST, + null_equality::EQUAL, + null_order::AFTER, + stream, + mr); auto indices_column = cudf::detail::lower_bound(keys_table->view(), diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 916d2a33b97..defe0421508 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -47,8 +47,9 @@ TYPED_TEST(DistinctCountCommon, NoNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TYPED_TEST(DistinctCountCommon, TableNoNull) @@ -73,7 +74,7 @@ 
TYPED_TEST(DistinctCountCommon, TableNoNull) cudf::table_view input_table(cols); cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, cudf::distinct_count(input_table, null_equality::EQUAL)); + EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); } struct DistinctCount : public cudf::test::BaseFixture { @@ -91,8 +92,9 @@ TEST_F(DistinctCount, WithNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, IgnoringNull) @@ -108,8 +110,9 @@ TEST_F(DistinctCount, IgnoringNull) cudf::size_type expected = std::set(input.begin(), input.end()).size(); // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ(expected - 2, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansAndNull) @@ -124,8 +127,9 @@ TEST_F(DistinctCount, WithNansAndNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansOnly) @@ -138,8 +142,9 @@ TEST_F(DistinctCount, WithNansOnly) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; cudf::size_type expected = 5; - 
EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, NansAsNullWithNoNull) @@ -152,8 +157,9 @@ TEST_F(DistinctCount, NansAsNullWithNoNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; cudf::size_type expected = 5; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, NansAsNullWithNull) @@ -166,8 +172,9 @@ TEST_F(DistinctCount, NansAsNullWithNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; cudf::size_type expected = 4; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, NansAsNullWithIgnoreNull) @@ -180,8 +187,9 @@ TEST_F(DistinctCount, NansAsNullWithIgnoreNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; cudf::size_type expected = 3; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, EmptyColumn) @@ -191,8 +199,9 @@ TEST_F(DistinctCount, EmptyColumn) cudf::test::fixed_width_column_wrapper input_col{}; cudf::size_type expected = 0; - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } 
TEST_F(DistinctCount, StringColumnWithNull) @@ -203,8 +212,9 @@ TEST_F(DistinctCount, StringColumnWithNull) cudf::size_type expected = (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ(expected, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, TableWithNull) @@ -215,8 +225,8 @@ TEST_F(DistinctCount, TableWithNull) {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; cudf::table_view input{{col1, col2}}; - EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); } TEST_F(DistinctCount, EmptyColumnedTable) @@ -225,10 +235,10 @@ TEST_F(DistinctCount, EmptyColumnedTable) cudf::table_view input(cols); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); } TEST_F(DistinctCount, TableMixedTypes) @@ -241,8 +251,8 @@ TEST_F(DistinctCount, TableMixedTypes) {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; cudf::table_view input{{col1, col2, col3}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + 
EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); } TEST_F(DistinctCount, TableWithStringColumnWithNull) @@ -254,8 +264,8 @@ TEST_F(DistinctCount, TableWithStringColumnWithNull) {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); } struct DropDuplicate : public cudf::test::BaseFixture { @@ -280,7 +290,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view expected_first{ {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; - auto got_first = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); + auto got_first = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); @@ -292,7 +302,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view expected_last{ {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto got_last = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); @@ -304,7 +314,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view expected_unique{ {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + auto got_unique = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } @@ -320,8 +330,8 @@ 
TEST_F(DropDuplicate, WithNull) cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got_first = unordered_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); @@ -329,7 +339,7 @@ TEST_F(DropDuplicate, WithNull) cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto got_last = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); @@ -337,7 +347,7 @@ TEST_F(DropDuplicate, WithNull) cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + auto got_unique = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } @@ -354,7 +364,7 @@ TEST_F(DropDuplicate, StringKeyColumn) {0, 1, 1, 1, 1}}; cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto got_last = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); } @@ -365,8 +375,8 @@ TEST_F(DropDuplicate, EmptyInputTable) cudf::table_view input{{col}}; std::vector keys{1, 2}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unordered_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } @@ -376,8 +386,8 @@ TEST_F(DropDuplicate, NoColumnInputTable) cudf::table_view input{std::vector()}; std::vector keys{1, 2}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unordered_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } @@ -389,8 +399,8 @@ TEST_F(DropDuplicate, EmptyKeys) cudf::table_view input{{col}}; std::vector keys{}; - auto got = - drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unordered_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 828d163fe07..ae1189e74e9 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2833,7 +2833,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = std::vector(native_keys_indices.begin(), native_keys_indices.end()); - auto result = cudf::drop_duplicates( + auto result = cudf::unordered_drop_duplicates( *input, keys_indices, keep_first ? 
cudf::duplicate_keep_option::KEEP_FIRST : cudf::duplicate_keep_option::KEEP_LAST, From 7ce954925efa15af794f3686c14c0da899420dc5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 12 Jan 2022 12:44:55 -0500 Subject: [PATCH 02/50] Update cython code to accommodate renaming --- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 18 ++++++++++-------- python/cudf/cudf/_lib/stream_compaction.pyx | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5b81d369ef5..5c606ee97cb 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -33,11 +33,13 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - - cdef size_type distinct_count(column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] unordered_drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + + + cdef size_type unordered_distinct_count( + column_view source_table, + null_policy null_handling, + nan_policy nan_handling) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 4330c565982..17359414850 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -11,8 +11,8 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - distinct_count as cpp_distinct_count, - drop_duplicates as cpp_drop_duplicates, + unordered_distinct_count as 
cpp_distinct_count, + unordered_drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, ) From 5c5a41555ae70e8252c13b1eff8a5cb0851d3d01 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 12 Jan 2022 13:57:35 -0500 Subject: [PATCH 03/50] Update copyrights --- cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp | 2 +- cpp/include/cudf/detail/stream_compaction.hpp | 2 +- cpp/include/cudf/stream_compaction.hpp | 2 +- cpp/src/dictionary/add_keys.cu | 2 +- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/reductions/reductions.cpp | 2 +- cpp/src/stream_compaction/distinct_count.cu | 2 +- cpp/src/stream_compaction/drop_duplicates.cu | 2 +- cpp/src/transform/encode.cu | 2 +- cpp/tests/stream_compaction/drop_duplicates_tests.cpp | 2 +- python/cudf/cudf/_lib/cpp/stream_compaction.pxd | 2 +- python/cudf/cudf/_lib/stream_compaction.pyx | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 3ae5f723a56..155931e771b 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index c7a3457e324..5cae1d958fa 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 59d031db02a..33ddd89b2a7 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 65739dac9f5..f402d7d807f 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 54ff8768455..9923e42589e 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 76058db4ec5..e33713d31a4 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 73c2eb9a716..234eaf51f96 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index fc0f04ee8c8..239ecbe61e9 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index ba5b0d127f1..8400facbe5f 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 65233b4f2e3..d78188cf8b4 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index defe0421508..188c0c0bcbc 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 5c606ee97cb..21264ac60fd 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 17359414850..e2109e5f0a9 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import pandas as pd From 0a62adecee2cd64a2c4b8604bd013f7d26718285 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 12 Jan 2022 18:02:33 -0500 Subject: [PATCH 04/50] Refactor unordered_distinct_count with hash-based algorithms --- cpp/src/stream_compaction/distinct_count.cu | 44 +++++++------- .../stream_compaction_common.cuh | 58 +++++++++++++++++++ .../stream_compaction_common.hpp | 47 +++++++++++++++ 3 files changed, 128 insertions(+), 21 deletions(-) create mode 100644 cpp/src/stream_compaction/stream_compaction_common.cuh create mode 100644 cpp/src/stream_compaction/stream_compaction_common.hpp diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 239ecbe61e9..0dfee3d3f57 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -14,14 +14,18 @@ * limitations under the License. */ +#include +#include + #include #include #include +#include +#include #include #include #include #include -#include #include #include @@ -39,26 +43,24 @@ cudf::size_type unordered_distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) { - // sort only indices - auto sorted_indices = sorted_order(keys, - std::vector{}, - std::vector{}, - stream, - rmm::mr::get_current_device_resource()); - - // count unique elements - auto sorted_row_index = sorted_indices->view().data(); - auto device_input_table = cudf::table_device_view::create(keys, stream); - - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [sorted_row_index, comp] __device__(cudf::size_type i) { - return (i == 0 || not comp(sorted_row_index[i], sorted_row_index[i - 1])); - }); + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const 
num_rows{table_ptr->num_rows()}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr}; + row_equality_comparator row_equal( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + + return key_map.get_size(); } /** diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh new file mode 100644 index 00000000000..2540c5dc316 --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Device callable to hash a given row. 
+ */ +template +class compaction_hash { + public: + compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {} + + __device__ __forceinline__ auto operator()(size_type i) const noexcept + { + auto hash = _hash(i); + return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; + } + + private: + row_hash _hash; +}; + +/** + * @brief Device functor to determine if a row is valid. + */ +class row_is_valid { + public: + row_is_valid(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ __inline__ bool operator()(const size_type& i) const noexcept + { + return cudf::bit_is_set(_row_bitmask, i); + } + + private: + bitmask_type const* _row_bitmask; +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp new file mode 100644 index 00000000000..1d743eccdbe --- /dev/null +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +constexpr auto COMPACTION_EMPTY_KEY_SENTINEL = std::numeric_limits::max(); +constexpr auto COMPACTION_EMPTY_VALUE_SENTINEL = std::numeric_limits::min(); + +using hash_type = cuco::detail::MurmurHash3_32; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +using hash_map_type = + cuco::static_map; + +using row_hash = cudf::row_hasher; + +} // namespace detail +} // namespace cudf From fba851cb9465972947556c10e94427fa2fcc2ff0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 12 Jan 2022 22:37:33 -0500 Subject: [PATCH 05/50] Refactor unordered_drop_duplicates with hash-based algorithms --- cpp/include/cudf/detail/stream_compaction.hpp | 2 - cpp/include/cudf/stream_compaction.hpp | 14 +-- cpp/src/dictionary/detail/concatenate.cu | 12 +- cpp/src/dictionary/set_keys.cu | 12 +- cpp/src/stream_compaction/drop_duplicates.cu | 108 ++++++------------ 5 files changed, 49 insertions(+), 99 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 5cae1d958fa..afbab14f580 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -69,9 +69,7 @@ std::unique_ptr
apply_boolean_mask( std::unique_ptr
unordered_drop_duplicates( table_view const& input, std::vector const& keys, - duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - null_order null_precedence = null_order::BEFORE, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 33ddd89b2a7..9bded32d8d3 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -217,30 +217,26 @@ enum class duplicate_keep_option { * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to output table if the corresponding - * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: - * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied - * - KEEP_LAST: only the last of a sequence of duplicate rows is copied - * - KEEP_NONE: no duplicate rows are copied + * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which + * row is copied. + * + * Elements in the output table are in a random order. * * @throws cudf::logic_error if The `input` row size mismatches with `keys`. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first entry, last entry, or no entries if duplicates found * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL - * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device * memory * - * @return Table with unique rows as per specified `keep`. + * @return Table with unique rows in an unspecified order. */ std::unique_ptr
unordered_drop_duplicates( table_view const& input, std::vector const& keys, - duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - null_order null_precedence = null_order::BEFORE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 9923e42589e..c9e05877ee8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -216,14 +216,10 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = cudf::detail::unordered_drop_duplicates(table_view{{all_keys->view()}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); + auto table_keys = + cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr) + ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); // next, concatenate the indices diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index e33713d31a4..351eaf2028d 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -121,14 +121,10 @@ std::unique_ptr set_keys( CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); // copy the keys -- use unordered_drop_duplicates to make sure they are sorted and unique - auto table_keys = cudf::detail::unordered_drop_duplicates(table_view{{new_keys}}, - std::vector{0}, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::BEFORE, - stream, - mr) - ->release(); + auto table_keys = + cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr) + ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); // 
compute the new nulls diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 8400facbe5f..e6a59edce3b 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -14,13 +14,15 @@ * limitations under the License. */ -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -41,68 +43,9 @@ namespace cudf { namespace detail { -namespace { -/** - * @brief Create a column_view of index values which represent the row values - * without duplicates as per @p `keep` - * - * Given a `keys` table_view, each row index is copied to output `unique_indices`, if the - * corresponding row of `keys` table_view is unique, where the definition of unique depends on the - * value of @p keep: - * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied - * - KEEP_LAST: only the last of a sequence of duplicate rows is copied - * - KEEP_NONE: only unique rows are kept - * - * @param[in] keys table_view to identify duplicate rows - * @param[out] unique_indices Column to store the index with unique rows - * @param[in] keep keep first entry, last entry, or no entries if duplicates found - * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, - * @param[in] null_precedence flag to denote nulls should appear before or after non-null items, - * nulls are not equal if null_equality::UNEQUAL - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return column_view column_view of unique row index as per specified `keep`, this is actually - * slice of `unique_indices`. - */ -column_view get_unique_ordered_indices(cudf::table_view const& keys, - cudf::mutable_column_view& unique_indices, - duplicate_keep_option keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::cuda_stream_view stream) -{ - // Sort only the indices. 
- // Note that stable sort must be used to maintain the order of duplicate elements. - auto sorted_indices = stable_sorted_order( - keys, - std::vector{}, - std::vector{static_cast(keys.num_columns()), null_precedence}, - stream, - rmm::mr::get_current_device_resource()); - - // extract unique indices - auto device_input_table = cudf::table_device_view::create(keys, stream); - - auto comp = row_equality_comparator( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); - - return cudf::detail::slice(column_view(unique_indices), - 0, - thrust::distance(unique_indices.begin(), result_end)); -} -} // namespace - std::unique_ptr
unordered_drop_duplicates(table_view const& input, std::vector const& keys, - duplicate_keep_option keep, null_equality nulls_equal, - null_order null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -112,14 +55,38 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, auto keys_view = input.select(keys); - // The values will be filled into this column - auto unique_indices = cudf::make_numeric_column( - data_type{type_id::INT32}, keys_view.num_rows(), mask_state::UNALLOCATED, stream); - auto mutable_unique_indices_view = unique_indices->mutable_view(); - // This is just slice of `unique_indices` but with different size as per the - // keys_view has been processed in `get_unique_ordered_indices` - auto unique_indices_view = detail::get_unique_ordered_indices( - keys_view, mutable_unique_indices_view, keep, nulls_equal, null_precedence, stream); + auto table_ptr = cudf::table_device_view::create(keys_view, stream); + auto const num_rows{table_ptr->num_rows()}; + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys_view)}, *table_ptr}; + row_equality_comparator row_equal( + nullate::DYNAMIC{cudf::has_nulls(keys_view)}, *table_ptr, *table_ptr, nulls_equal); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + + auto counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector existences(num_rows, stream, mr); + key_map.contains(counting_iter, counting_iter + num_rows, existences.begin(), hash_key); + + auto const output_size{key_map.get_size()}; + + rmm::device_uvector unique_indices(output_size, stream, mr); + thrust::copy_if(rmm::exec_policy(stream), + counting_iter, + counting_iter + num_rows, + existences.begin(), + unique_indices.begin(), + [] __device__(bool const b) { return b; }); + + column_view unique_indices_view(data_type{type_id::INT32}, output_size, 
unique_indices.data()); // run gather operation to establish new order return detail::gather(input, @@ -134,14 +101,11 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, std::unique_ptr
unordered_drop_duplicates(table_view const& input, std::vector const& keys, - duplicate_keep_option const keep, null_equality nulls_equal, - null_order null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unordered_drop_duplicates( - input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); + return detail::unordered_drop_duplicates(input, keys, nulls_equal, rmm::cuda_stream_default, mr); } } // namespace cudf From 05ee85f2b420cd105f1a99dedf618829edaed2db Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 12 Jan 2022 22:43:56 -0500 Subject: [PATCH 06/50] Update cython code --- python/cudf/cudf/_lib/cpp/stream_compaction.pxd | 1 - python/cudf/cudf/_lib/stream_compaction.pyx | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 21264ac60fd..02a53c5c4f3 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -36,7 +36,6 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ cdef unique_ptr[table] unordered_drop_duplicates( table_view source_table, vector[size_type] keys, - duplicate_keep_option keep, null_equality nulls_equal) except + cdef size_type unordered_distinct_count( diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index e2109e5f0a9..5cc10bd0fcc 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - unordered_distinct_count as cpp_distinct_count, - unordered_drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + unordered_distinct_count as 
cpp_distinct_count, + unordered_drop_duplicates as cpp_drop_duplicates, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -152,7 +152,6 @@ def drop_duplicates(columns: list, cpp_drop_duplicates( source_table_view, cpp_keys, - cpp_keep_option, cpp_nulls_equal ) ) From 8ab22a455df12b139ef2e47d3c824c1536554e7b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 11:47:32 -0500 Subject: [PATCH 07/50] Optimize distinct count: insert valid rows only if nulls are equal --- cpp/src/stream_compaction/distinct_count.cu | 25 ++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 0dfee3d3f57..2bf7f25a8f1 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -45,6 +45,7 @@ cudf::size_type unordered_distinct_count(table_view const& keys, { auto table_ptr = cudf::table_device_view::create(keys, stream); auto const num_rows{table_ptr->num_rows()}; + auto const has_null = cudf::has_nulls(keys); hash_map_type key_map{compute_hash_table_size(num_rows), COMPACTION_EMPTY_KEY_SENTINEL, @@ -54,13 +55,31 @@ cudf::size_type unordered_distinct_count(table_view const& keys, compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr}; row_equality_comparator row_equal( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + nullate::DYNAMIC{has_null}, *table_ptr, *table_ptr, nulls_equal); auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); - key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - return key_map.get_size(); + auto const count = [&]() { + std::size_t c = 0; + // when nulls are equal and input has nulls, only non-null rows are inserted. 
Thus the + // total distinct count equals the number of valid rows plus one (number of null rows) + if ((compare_nulls == null_equality::EQUAL) and has_null) { + thrust::counting_iterator stencil(0); + auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows only + hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.valu()); + c = key_map.get_size() + 1; + } else { + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + c = key_map.get_size(); + } + return c; + }(); + + return count; } /** From bba7b57faf8e9ec3d6f8fe7be6cba9b617294557 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 11:48:30 -0500 Subject: [PATCH 08/50] Fill column via mutable view + update comments --- cpp/src/stream_compaction/drop_duplicates.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index e6a59edce3b..f1b14b84f3f 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -70,27 +70,30 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); + // insert unique indices into the map. key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); auto counting_iter = thrust::make_counting_iterator(0); rmm::device_uvector existences(num_rows, stream, mr); + // enumerate all indices to check if they are present in the map. key_map.contains(counting_iter, counting_iter + num_rows, existences.begin(), hash_key); auto const output_size{key_map.get_size()}; - rmm::device_uvector unique_indices(output_size, stream, mr); + // write unique indices to a numeric column + auto unique_indices = make_numeric_column( + data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); thrust::copy_if(rmm::exec_policy(stream), counting_iter, counting_iter + num_rows, existences.begin(), - unique_indices.begin(), + mutable_view->begin(), [] __device__(bool const b) { return b; }); - column_view unique_indices_view(data_type{type_id::INT32}, output_size, unique_indices.data()); - // run gather operation to establish new order return detail::gather(input, - unique_indices_view, + unique_indices->view(), out_of_bounds_policy::DONT_CHECK, detail::negative_index_policy::NOT_ALLOWED, stream, From 6746f28d4d6a6bbb061a1638476f5f93fd599a8b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 11:54:06 -0500 Subject: [PATCH 09/50] Minor corrections --- cpp/src/stream_compaction/distinct_count.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 2bf7f25a8f1..31721131e6d 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -64,13 +64,13 @@ 
cudf::size_type unordered_distinct_count(table_view const& keys, std::size_t c = 0; // when nulls are equal and input has nulls, only non-null rows are inserted. Thus the // total distinct count equals the number of valid rows plus one (number of null rows) - if ((compare_nulls == null_equality::EQUAL) and has_null) { + if ((nulls_equal == null_equality::EQUAL) and has_null) { thrust::counting_iterator stencil(0); auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows only - hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.valu()); + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); c = key_map.get_size() + 1; } else { key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); From 70292bc46105ab712d5f4c243269afb6f57bc234 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 16:37:27 -0500 Subject: [PATCH 10/50] Update benchmarks and unit tests --- .../drop_duplicates_benchmark.cpp | 42 +++---- .../drop_duplicates_tests.cpp | 116 +++++++----------- 2 files changed, 61 insertions(+), 97 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 155931e771b..05f24f3f0f9 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -29,7 +29,7 @@ class Compaction : public cudf::benchmark { }; template -void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) +void BM_compaction(benchmark::State& state) { auto const n_rows = static_cast(state.range(0)); @@ -45,34 +45,26 @@ void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::unordered_drop_duplicates(input_table, {0}, 
keep); + auto result = cudf::unordered_drop_duplicates(input_table, {0}); } } -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define RBM_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ +// TYPE +#define RBM_BENCHMARK_DEFINE(name, type) \ + BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) { BM_compaction(state); } \ + BENCHMARK_REGISTER_F(Compaction, name) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ ->Arg(10000000) /* 10M */ -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - RBM_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) +#define COMPACTION_BENCHMARK_DEFINE(type) RBM_BENCHMARK_DEFINE(type, type) -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); +COMPACTION_BENCHMARK_DEFINE(bool); +COMPACTION_BENCHMARK_DEFINE(int8_t); +COMPACTION_BENCHMARK_DEFINE(int32_t); +COMPACTION_BENCHMARK_DEFINE(int64_t); using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); +COMPACTION_BENCHMARK_DEFINE(timestamp_ms); +COMPACTION_BENCHMARK_DEFINE(float); diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 188c0c0bcbc..71d24df7cac 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -273,8 +274,8 @@ struct DropDuplicate : 
public cudf::test::BaseFixture { TEST_F(DropDuplicate, NonNullTable) { - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; @@ -283,90 +284,64 @@ TEST_F(DropDuplicate, NonNullTable) // Keep first of duplicate // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; - cudf::table_view expected_first{ - {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; - auto got_first = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - - // keep last of duplicate - cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; - 
cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; - cudf::table_view expected_last{ - {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; - - auto got_last = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique - cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; - cudf::table_view expected_unique{ - {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; - - auto got_unique = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } TEST_F(DropDuplicate, WithNull) { - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = unordered_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - - // Keep last of duplicate - 
cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep unique of duplicate - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); + // Nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); + + // Nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } TEST_F(DropDuplicate, StringKeyColumn) { - cudf::test::fixed_width_column_wrapper 
col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, {1, 1, 1, 0, 1, 1}}; cudf::table_view input{{col, key_col}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col_last{{5, 3, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col_last{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; + cudf::test::fixed_width_column_wrapper exp_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected{{exp_col, exp_key_col}}; - auto got_last = unordered_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } TEST_F(DropDuplicate, EmptyInputTable) @@ -375,8 +350,7 @@ TEST_F(DropDuplicate, EmptyInputTable) cudf::table_view input{{col}}; std::vector keys{1, 2}; - auto got = unordered_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } @@ -386,8 +360,7 @@ TEST_F(DropDuplicate, NoColumnInputTable) cudf::table_view input{std::vector()}; std::vector keys{1, 2}; - auto got = unordered_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, 
got->view()); } @@ -399,8 +372,7 @@ TEST_F(DropDuplicate, EmptyKeys) cudf::table_view input{{col}}; std::vector keys{}; - auto got = unordered_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); } From 46d83b967a03dfcf7c70795e87070847bc7d0a84 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 16:38:26 -0500 Subject: [PATCH 11/50] Add reminder for further optimization in distinct count --- cpp/src/stream_compaction/distinct_count.cu | 40 +++++++++++---------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 31721131e6d..c2f5fb564ef 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -60,24 +60,28 @@ cudf::size_type unordered_distinct_count(table_view const& keys, auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); - auto const count = [&]() { - std::size_t c = 0; - // when nulls are equal and input has nulls, only non-null rows are inserted. 
Thus the - // total distinct count equals the number of valid rows plus one (number of null rows) - if ((nulls_equal == null_equality::EQUAL) and has_null) { - thrust::counting_iterator stencil(0); - auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows only - key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); - c = key_map.get_size() + 1; - } else { - key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - c = key_map.get_size(); - } - return c; - }(); + // TODO: debug the code below to improve efficiency: when nulls are equal, only non-null row + // indices are inserted into the hash map. + // auto const count = [&]() { + // std::size_t c = 0; + // // when nulls are equal and input has nulls, only non-null rows are inserted. Thus the + // // total distinct count equals the number of valid rows plus one (number of null rows) + // if ((nulls_equal == null_equality::EQUAL) and has_null) { + // thrust::counting_iterator stencil(0); + // auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; + // row_is_valid pred{static_cast(row_bitmask.data())}; + // // insert valid rows only + // key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, + // stream.value()); c = key_map.get_size() + 1; + // } else { + // key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + // c = key_map.get_size(); + // } + // return c; + // }(); + // + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + auto count = key_map.get_size(); return count; } From f07d3d09b26ef0616e6e858f928421d4b3d6a53c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 16:42:24 -0500 Subject: [PATCH 12/50] Fix transform test failure --- cpp/src/transform/encode.cu | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff 
--git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index d78188cf8b4..a2b40cf3dee 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -38,29 +38,22 @@ namespace detail { std::pair, std::unique_ptr> encode( table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - std::vector drop_keys(input_table.num_columns()); + auto const num_cols = input_table.num_columns(); + + std::vector drop_keys(num_cols); std::iota(drop_keys.begin(), drop_keys.end(), 0); - // side effects of this function we are now dependent on: - // - resulting column elements are sorted ascending - // - nulls are sorted to the beginning - auto keys_table = cudf::detail::unordered_drop_duplicates(input_table, - drop_keys, - duplicate_keep_option::KEEP_FIRST, - null_equality::EQUAL, - null_order::AFTER, - stream, - mr); + auto unique_keys = cudf::detail::unordered_drop_duplicates( + input_table, drop_keys, null_equality::EQUAL, stream, mr); + + std::vector column_order(num_cols, order::ASCENDING); + std::vector null_precedence(num_cols, null_order::AFTER); + auto sorted_unique_keys = sort(unique_keys->view(), column_order, null_precedence, stream, mr); - auto indices_column = - cudf::detail::lower_bound(keys_table->view(), - input_table, - std::vector(input_table.num_columns(), order::ASCENDING), - std::vector(input_table.num_columns(), null_order::AFTER), - stream, - mr); + auto indices_column = cudf::detail::lower_bound( + sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(keys_table), std::move(indices_column)); + return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail From 8fcfbaeb03188f81bc9e3ef015d374f6a6ce7697 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 17:08:06 -0500 Subject: [PATCH 13/50] Fix dictionary test failures --- cpp/src/dictionary/add_keys.cu | 27 
+++++++++++++++------------ cpp/src/dictionary/set_keys.cu | 15 ++++++++++----- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index f402d7d807f..ee3a05285cf 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -62,22 +63,24 @@ std::unique_ptr add_keys( auto table_keys = cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, std::vector{0}, // only one key column - duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - null_order::BEFORE, stream, - mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + mr); + + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::AFTER}; // should be no nulls here + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + + std::unique_ptr keys_column(std::move(sorted_keys.front())); // create a map for the indices // lower_bound([a,b,c,d,e,f],[a,b,c,d,f]) = [0,1,2,3,5] - auto map_indices = cudf::detail::lower_bound( - table_view{{keys_column->view()}}, - table_view{{old_keys}}, - std::vector{order::ASCENDING}, - std::vector{null_order::AFTER}, // should be no nulls here - stream, - mr); + auto map_indices = cudf::detail::lower_bound(table_view{{keys_column->view()}}, + table_view{{old_keys}}, + column_order, + null_precedence, + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 351eaf2028d..9d82fdc9de2 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include 
#include @@ -121,11 +122,15 @@ std::unique_ptr set_keys( CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); // copy the keys -- use unordered_drop_duplicates to make sure they are sorted and unique - auto table_keys = - cudf::detail::unordered_drop_duplicates( - table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto unique_keys = cudf::detail::unordered_drop_duplicates( + table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); + auto sorted_keys = cudf::detail::sort(unique_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); From f2ac25d97e4bb332c4e83a9d167a9daa7defdac9 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 19:20:13 -0500 Subject: [PATCH 14/50] Add sort-based implementations back to the repo --- .../drop_duplicates_benchmark.cpp | 76 +++-- cpp/include/cudf/detail/stream_compaction.hpp | 14 + cpp/include/cudf/stream_compaction.hpp | 32 ++ cpp/src/stream_compaction/drop_duplicates.cu | 101 +++++++ cpp/tests/CMakeLists.txt | 1 + .../distinct_count_tests.cpp | 270 +++++++++++++++++ .../drop_duplicates_tests.cpp | 286 +++++------------- java/src/main/native/src/TableJni.cpp | 2 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 8 +- python/cudf/cudf/_lib/stream_compaction.pyx | 3 +- 10 files changed, 563 insertions(+), 230 deletions(-) create mode 100644 cpp/tests/stream_compaction/distinct_count_tests.cpp diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 05f24f3f0f9..1f1bd0505d6 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ 
b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -27,9 +27,14 @@ class Compaction : public cudf::benchmark { }; +class HashCompaction : public cudf::benchmark { +}; + +enum class algorithm { SORT_BASED, HASH_BASED }; -template -void BM_compaction(benchmark::State& state) +template +void BM_compaction(benchmark::State& state, + cudf::duplicate_keep_option keep = cudf::duplicate_keep_option::KEEP_FIRST) { auto const n_rows = static_cast(state.range(0)); @@ -45,26 +50,63 @@ void BM_compaction(benchmark::State& state) for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::unordered_drop_duplicates(input_table, {0}); + auto const result = [&]() { + if constexpr (Algo == algorithm::HASH_BASED) { + return cudf::unordered_drop_duplicates(input_table, {0}); + } else { + return cudf::drop_duplicates(input_table, {0}, keep); + } + }(); } } +#define concat(a, b, c) a##b##c +#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op + +// TYPE, OP +#define SORT_BENCHMARK_DEFINE(name, type, keep) \ + BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ + { \ + BM_compaction(state, get_keep(keep)); \ + } \ + BENCHMARK_REGISTER_F(Compaction, name) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ + +#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ + SORT_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) + // TYPE -#define RBM_BENCHMARK_DEFINE(name, type) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) { BM_compaction(state); } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ +#define HASH_BENCHMARK_DEFINE(type) \ + BENCHMARK_DEFINE_F(HashCompaction, type)(::benchmark::State & state) \ + { \ + BM_compaction(state); \ + } \ + BENCHMARK_REGISTER_F(HashCompaction, type) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k 
*/ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ ->Arg(10000000) /* 10M */ -#define COMPACTION_BENCHMARK_DEFINE(type) RBM_BENCHMARK_DEFINE(type, type) +#define HASH_COMPACTION_BENCHMARK_DEFINE(type) HASH_BENCHMARK_DEFINE(type) -COMPACTION_BENCHMARK_DEFINE(bool); -COMPACTION_BENCHMARK_DEFINE(int8_t); -COMPACTION_BENCHMARK_DEFINE(int32_t); -COMPACTION_BENCHMARK_DEFINE(int64_t); using cudf::timestamp_ms; -COMPACTION_BENCHMARK_DEFINE(timestamp_ms); -COMPACTION_BENCHMARK_DEFINE(float); + +COMPACTION_BENCHMARK_DEFINE(bool, NONE); +COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); +COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); +COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); +COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); +COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); +COMPACTION_BENCHMARK_DEFINE(float, NONE); + +HASH_COMPACTION_BENCHMARK_DEFINE(bool); +HASH_COMPACTION_BENCHMARK_DEFINE(int8_t); +HASH_COMPACTION_BENCHMARK_DEFINE(int32_t); +HASH_COMPACTION_BENCHMARK_DEFINE(int64_t); +HASH_COMPACTION_BENCHMARK_DEFINE(timestamp_ms); +HASH_COMPACTION_BENCHMARK_DEFINE(float); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index afbab14f580..ef0bdfd0142 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -61,6 +61,20 @@ std::unique_ptr
apply_boolean_mask( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::drop_duplicates + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
drop_duplicates( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal = null_equality::EQUAL, + null_order null_precedence = null_order::BEFORE, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::unordered_drop_duplicates * diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 9bded32d8d3..59e81a17256 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -216,6 +216,38 @@ enum class duplicate_keep_option { /** * @brief Create a new table without duplicate rows. * + * The output table is sorted according to the lexicographic ordering of the `keys` rows. + * + * Given an `input` table_view, each row is copied to output table if the corresponding + * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: + * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied + * - KEEP_LAST: only the last of a sequence of duplicate rows is copied + * - KEEP_NONE: no duplicate rows are copied + * + * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * + * @param[in] input input table_view to copy only unique rows + * @param[in] keys vector of indices representing key columns from `input` + * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not + * equal if null_equality::UNEQUAL + * @param[in] null_precedence flag to denote nulls should appear before or after non-null items + * @param[in] mr Device memory resource used to allocate the returned table's device + * memory + * + * @return Table with sorted unique rows as per specified `keep`. + */ +std::unique_ptr
drop_duplicates( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal = null_equality::EQUAL, + null_order null_precedence = null_order::BEFORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a new table without duplicate rows with hash-based algorithms. + * * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which * row is copied. diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index f1b14b84f3f..97669705b5b 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -43,6 +44,94 @@ namespace cudf { namespace detail { +namespace { +/** + * @brief Create a column_view of index values which represent the row values + * without duplicates as per @p `keep` + * + * Given a `keys` table_view, each row index is copied to output `unique_indices`, if the + * corresponding row of `keys` table_view is unique, where the definition of unique depends on the + * value of @p keep: + * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied + * - KEEP_LAST: only the last of a sequence of duplicate rows is copied + * - KEEP_NONE: only unique rows are kept + * + * @param[in] keys table_view to identify duplicate rows + * @param[out] unique_indices Column to store the index with unique rows + * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, + * @param[in] null_precedence flag to denote nulls should appear before or after non-null items, + * nulls are not equal if null_equality::UNEQUAL + * @param[in] stream CUDA stream used for device 
memory operations and kernel launches. + * + * @return column_view column_view of unique row index as per specified `keep`, this is actually + * slice of `unique_indices`. + */ +column_view get_unique_ordered_indices(cudf::table_view const& keys, + cudf::mutable_column_view& unique_indices, + duplicate_keep_option keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::cuda_stream_view stream) +{ + // Sort only the indices. + // Note that stable sort must be used to maintain the order of duplicate elements. + auto sorted_indices = stable_sorted_order( + keys, + std::vector{}, + std::vector{static_cast(keys.num_columns()), null_precedence}, + stream, + rmm::mr::get_current_device_resource()); + + // extract unique indices + auto device_input_table = cudf::table_device_view::create(keys, stream); + + auto comp = row_equality_comparator( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); + auto result_end = unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); + + return cudf::detail::slice(column_view(unique_indices), + 0, + thrust::distance(unique_indices.begin(), result_end)); +} +} // namespace + +std::unique_ptr
drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + return empty_like(input); + } + + auto keys_view = input.select(keys); + + // The values will be filled into this column + auto unique_indices = cudf::make_numeric_column( + data_type{type_id::INT32}, keys_view.num_rows(), mask_state::UNALLOCATED, stream); + auto mutable_unique_indices_view = unique_indices->mutable_view(); + // This is just slice of `unique_indices` but with different size as per the + // keys_view has been processed in `get_unique_ordered_indices` + auto unique_indices_view = detail::get_unique_ordered_indices( + keys_view, mutable_unique_indices_view, keep, nulls_equal, null_precedence, stream); + + // run gather operation to establish new order + return detail::gather(input, + unique_indices_view, + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} std::unique_ptr
unordered_drop_duplicates(table_view const& input, std::vector const& keys, null_equality nulls_equal, @@ -102,6 +191,18 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, } // namespace detail +std::unique_ptr
drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option const keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::drop_duplicates( + input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); +} + std::unique_ptr
unordered_drop_duplicates(table_view const& input, std::vector const& keys, null_equality nulls_equal, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d90260400a0..5dee195ea44 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -292,6 +292,7 @@ ConfigureTest( ConfigureTest( STREAM_COMPACTION_TEST stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/distinct_count_tests.cpp stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp stream_compaction/drop_duplicates_tests.cpp diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp new file mode 100644 index 00000000000..e5bd3e857dc --- /dev/null +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; +template +struct DistinctCountCommon : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); + +TYPED_TEST(DistinctCountCommon, NoNull) +{ + using T = TypeParam; + + auto const input = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + + cudf::size_type expected = std::set(input.begin(), input.end()).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TYPED_TEST(DistinctCountCommon, TableNoNull) +{ + using T = TypeParam; + + auto const input1 = cudf::test::make_type_param_vector( + {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + auto const input2 = cudf::test::make_type_param_vector( + {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + + std::vector> pair_input; + std::transform( + input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { + return std::make_pair(a, b); + }); + + cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); + cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + + std::vector cols{input_col1, input_col2}; + cudf::table_view input_table(cols); + + cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); + EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); +} + +struct DistinctCount : public cudf::test::BaseFixture { +}; + +TEST_F(DistinctCount, WithNull) +{ + using T = int32_t; + + // Considering 70 as null + std::vector input = {1, 3, 3, 70, 31, 
1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + cudf::size_type expected = std::set(input.begin(), input.end()).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, IgnoringNull) +{ + using T = int32_t; + + // Considering 70 and 3 as null + std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + + cudf::size_type expected = std::set(input.begin(), input.end()).size(); + // Removing 2 from expected to remove count for 70 and 3 + EXPECT_EQ( + expected - 2, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansAndNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, + 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; + std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + cudf::size_type expected = std::set(input.begin(), input.end()).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, WithNansOnly) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + cudf::size_type expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, 
null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); +} + +TEST_F(DistinctCount, NansAsNullWithNoNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 1, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + cudf::size_type expected = 5; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + cudf::size_type expected = 4; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +{ + using T = float; + + std::vector input = {1, 3, NAN, 70, 31}; + std::vector valid = {1, 1, 1, 0, 1}; + + cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + + cudf::size_type expected = 3; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, EmptyColumn) +{ + using T = float; + + cudf::test::fixed_width_column_wrapper input_col{}; + + cudf::size_type expected = 0; + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); +} + +TEST_F(DistinctCount, StringColumnWithNull) +{ + cudf::test::strings_column_wrapper input_col{ + {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; + + cudf::size_type expected = + (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); + EXPECT_EQ( + expected, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); +} + 
+TEST_F(DistinctCount, TableWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::table_view input{{col1, col2}}; + + EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, EmptyColumnedTable) +{ + std::vector cols{}; + + cudf::table_view input(cols); + + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::EQUAL)); + EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableMixedTypes) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, + {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; + cudf::table_view input{{col1, col2, col3}}; + + EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, TableWithStringColumnWithNull) +{ + cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + cudf::test::strings_column_wrapper col2{ + {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; + + cudf::table_view input{{col1, col2}}; + EXPECT_EQ(9, cudf::unordered_distinct_count(input, 
null_equality::EQUAL)); + EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 71d24df7cac..89acdc63c34 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -32,247 +32,119 @@ using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -template -struct DistinctCountCommon : public cudf::test::BaseFixture { -}; - -TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); - -TYPED_TEST(DistinctCountCommon, NoNull) -{ - using T = TypeParam; - - auto const input = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TYPED_TEST(DistinctCountCommon, TableNoNull) -{ - using T = TypeParam; - - auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); - auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); - - std::vector> pair_input; - std::transform( - input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); - }); - - cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); - cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); - - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); - EXPECT_EQ(expected, 
cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); -} -struct DistinctCount : public cudf::test::BaseFixture { +struct DropDuplicate : public cudf::test::BaseFixture { }; -TEST_F(DistinctCount, WithNull) -{ - using T = int32_t; - - // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} - -TEST_F(DistinctCount, IgnoringNull) +TEST_F(DropDuplicate, NonNullTable) { - using T = int32_t; - - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; - - cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 - EXPECT_EQ( - expected - 2, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; -TEST_F(DistinctCount, WithNansAndNull) -{ - using T = float; + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; - std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; + // Keep first of duplicate + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; + cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); - cudf::size_type expected = std::set(input.begin(), input.end()).size(); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DistinctCount, WithNansOnly) +TEST_F(DropDuplicate, WithNull) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; - - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - - cudf::size_type expected = 5; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; -TEST_F(DistinctCount, NansAsNullWithNoNull) -{ - using T = float; + // Nulls are equal + cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 
1, 1, 1}}; + cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; + auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + auto equal_keys = res_equal->select(keys.begin(), keys.end()); + auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 1, 1}; + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + // Nulls are unequal + cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, + {0, 1, 0, 1, 1}}; + cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; + auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); + auto sorted_unequal = cudf::sort(res_unequal->view()); - cudf::size_type expected = 5; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } -TEST_F(DistinctCount, NansAsNullWithNull) +TEST_F(DropDuplicate, StringKeyColumn) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; + cudf::test::fixed_width_column_wrapper exp_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected{{exp_col, exp_key_col}}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), 
valid.begin()}; + auto result = unordered_drop_duplicates(input, keys); + auto key_view = result->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(result->view(), key_view); - cudf::size_type expected = 4; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DistinctCount, NansAsNullWithIgnoreNull) +TEST_F(DropDuplicate, EmptyInputTable) { - using T = float; - - std::vector input = {1, 3, NAN, 70, 31}; - std::vector valid = {1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; - cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; + auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - cudf::size_type expected = 3; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } -TEST_F(DistinctCount, EmptyColumn) +TEST_F(DropDuplicate, NoColumnInputTable) { - using T = float; - - cudf::test::fixed_width_column_wrapper input_col{}; - - cudf::size_type expected = 0; - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); -} + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; -TEST_F(DistinctCount, StringColumnWithNull) -{ - cudf::test::strings_column_wrapper input_col{ - {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - - cudf::size_type expected = - (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); - EXPECT_EQ( - expected, - cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); -} + auto got = unordered_drop_duplicates(input, keys, 
null_equality::EQUAL); -TEST_F(DistinctCount, TableWithNull) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::table_view input{{col1, col2}}; - - EXPECT_EQ(8, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } -TEST_F(DistinctCount, EmptyColumnedTable) +TEST_F(DropDuplicate, EmptyKeys) { - std::vector cols{}; - - cudf::table_view input(cols); - - EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); -} + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; -TEST_F(DistinctCount, TableMixedTypes) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}}; - cudf::test::fixed_width_column_wrapper col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1}, - {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}}; - cudf::table_view input{{col1, col2, col3}}; - - EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); -} + auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); -TEST_F(DistinctCount, TableWithStringColumnWithNull) -{ - 
cudf::test::fixed_width_column_wrapper col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - cudf::test::strings_column_wrapper col2{ - {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}}; - - cudf::table_view input{{col1, col2}}; - EXPECT_EQ(9, cudf::unordered_distinct_count(input, null_equality::EQUAL)); - EXPECT_EQ(10, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct UnorderedDropDuplicate : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(UnorderedDropDuplicate, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; @@ -297,7 +169,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(UnorderedDropDuplicate, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; @@ -325,7 +197,7 @@ TEST_F(DropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } -TEST_F(DropDuplicate, StringKeyColumn) +TEST_F(UnorderedDropDuplicate, StringKeyColumn) { cudf::test::fixed_width_column_wrapper col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, @@ -344,7 +216,7 @@ TEST_F(DropDuplicate, StringKeyColumn) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(DropDuplicate, EmptyInputTable) +TEST_F(UnorderedDropDuplicate, EmptyInputTable) { cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); cudf::table_view input{{col}}; @@ -355,7 +227,7 @@ 
TEST_F(DropDuplicate, EmptyInputTable) CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } -TEST_F(DropDuplicate, NoColumnInputTable) +TEST_F(UnorderedDropDuplicate, NoColumnInputTable) { cudf::table_view input{std::vector()}; std::vector keys{1, 2}; @@ -365,7 +237,7 @@ TEST_F(DropDuplicate, NoColumnInputTable) CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); } -TEST_F(DropDuplicate, EmptyKeys) +TEST_F(UnorderedDropDuplicate, EmptyKeys) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper empty_col{}; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ae1189e74e9..828d163fe07 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2833,7 +2833,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = std::vector(native_keys_indices.begin(), native_keys_indices.end()); - auto result = cudf::unordered_drop_duplicates( + auto result = cudf::drop_duplicates( *input, keys_indices, keep_first ? 
cudf::duplicate_keep_option::KEEP_FIRST : cudf::duplicate_keep_option::KEEP_LAST, diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 02a53c5c4f3..c73368633bf 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -33,10 +33,10 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] unordered_drop_duplicates( - table_view source_table, - vector[size_type] keys, - null_equality nulls_equal) except + + cdef unique_ptr[table] drop_duplicates(table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + cdef size_type unordered_distinct_count( column_view source_table, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 5cc10bd0fcc..3a627a8ea8d 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -11,10 +11,10 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, + drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, unordered_distinct_count as cpp_distinct_count, - unordered_drop_duplicates as cpp_drop_duplicates, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -152,6 +152,7 @@ def drop_duplicates(columns: list, cpp_drop_duplicates( source_table_view, cpp_keys, + cpp_keep_option, cpp_nulls_equal ) ) From 0ed571253c2b538ff659d2e4a599db3575ab53f7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jan 2022 19:25:37 -0500 Subject: [PATCH 15/50] Update copyright --- cpp/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 5dee195ea44..9ad18763055 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at From e37214454a160d6cc5363b71ced793a422a0e045 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 14 Jan 2022 11:59:33 -0500 Subject: [PATCH 16/50] Add consecutive distinct_count --- cpp/include/cudf/detail/stream_compaction.hpp | 19 ++++++ cpp/include/cudf/stream_compaction.hpp | 38 ++++++++++-- cpp/src/stream_compaction/distinct_count.cu | 61 +++++++++++++++++-- 3 files changed, 109 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index ef0bdfd0142..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -87,6 +87,25 @@ std::unique_ptr
unordered_drop_duplicates( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::distinct_count(table_view const&, null_equality) + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + /** * @copydoc cudf::unordered_distinct_count(column_view const&, null_policy, nan_policy) * diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 59e81a17256..4424f24d1a2 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -271,19 +271,49 @@ std::unique_ptr
unordered_drop_duplicates( null_equality nulls_equal = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Count the number of consecutive groups of equivalent elements in a column. + * + * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both + * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and + * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count. + * + * @param[in] input View of the input column + * @param[in] null_handling flag to include or ignore `null` while counting + * @param[in] nan_handling flag to consider `NaN==null` or not + * + * @return number of consecutive groups in the column + */ +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); + +/** + * @brief Count the number of consecutive groups of equivalent elements in a table. + * + * + * @param[in] input Table whose number of consecutive groups will be counted + * @param[in] nulls_equal flag to denote if null elements should be considered equal + * nulls are not equal if null_equality::UNEQUAL + * + * @return number of consecutive groups in the table + */ +cudf::size_type distinct_count(table_view const& input, + null_equality nulls_equal = null_equality::EQUAL); + /** * @brief Count the unique elements in the column_view. * - * Given an input column_view, number of unique elements in this column_view is returned + * Given an input column_view, number of unique elements in this column_view is returned. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique * count. 
* - * @param[in] input The column_view whose unique elements will be counted. + * @param[in] input The column_view whose unique elements will be counted * @param[in] null_handling flag to include or ignore `null` while counting - * @param[in] nan_handling flag to consider `NaN==null` or not. + * @param[in] nan_handling flag to consider `NaN==null` or not * * @return number of unique elements */ @@ -295,7 +325,7 @@ cudf::size_type unordered_distinct_count(column_view const& input, * @brief Count the unique rows in a table. * * - * @param[in] input Table whose unique rows will be counted. + * @param[in] input Table whose unique rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal * nulls are not equal if null_equality::UNEQUAL * diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index c2f5fb564ef..156814f40d4 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -39,6 +39,20 @@ namespace cudf { namespace detail { +cudf::size_type distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 || not comp(i, i - 1)); }); +} + cudf::size_type unordered_distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) @@ -162,10 +176,11 @@ struct has_nans { } }; -cudf::size_type unordered_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream) +template +cudf::size_type col_distinct_count(column_view const& 
input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) { if (0 == input.size() || input.null_count() == input.size()) { return 0; } @@ -181,7 +196,12 @@ cudf::size_type unordered_distinct_count(column_view const& input, has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } - auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + auto count = [&]() { + if constexpr (unordered) { + return detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + } + return detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); + }(); // if nan is considered null and there are already null values if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; @@ -192,8 +212,39 @@ cudf::size_type unordered_distinct_count(column_view const& input, return count; } +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + auto constexpr unordered = false; + return col_distinct_count(input, null_handling, nan_handling, stream); +} + +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + auto constexpr unordered = true; + return col_distinct_count(input, null_handling, nan_handling, stream); +} } // namespace detail +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) +{ + CUDF_FUNC_RANGE(); + return detail::distinct_count(input, null_handling, nan_handling); +} + +cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) +{ + CUDF_FUNC_RANGE(); + return detail::distinct_count(input, nulls_equal); +} + cudf::size_type unordered_distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling) From 
fc57b29bd8e009f5ce1af72f6079814ed1415d12 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 15 Jan 2022 16:38:01 -0500 Subject: [PATCH 17/50] Remove nan control in distinct_count --- cpp/include/cudf/detail/stream_compaction.hpp | 1 - cpp/include/cudf/stream_compaction.hpp | 5 +- cpp/src/stream_compaction/distinct_count.cu | 195 +++++++++--------- .../distinct_count_tests.cpp | 39 +++- .../drop_duplicates_tests.cpp | 8 +- 5 files changed, 133 insertions(+), 115 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 3d065556827..dd52438f9ee 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -94,7 +94,6 @@ std::unique_ptr
unordered_drop_duplicates( */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, - nan_policy nan_handling, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 4424f24d1a2..91f5395421a 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -280,13 +280,10 @@ std::unique_ptr
unordered_drop_duplicates( * * @param[in] input View of the input column * @param[in] null_handling flag to include or ignore `null` while counting - * @param[in] nan_handling flag to consider `NaN==null` or not * * @return number of consecutive groups in the column */ -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling); +cudf::size_type distinct_count(column_view const& input, null_policy null_handling); /** * @brief Count the number of consecutive groups of equivalent elements in a table. diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 156814f40d4..e0bc9405d9c 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -38,68 +38,7 @@ namespace cudf { namespace detail { - -cudf::size_type distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - auto table_ptr = cudf::table_device_view::create(keys, stream); - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [comp] __device__(cudf::size_type i) { return (i == 0 || not comp(i, i - 1)); }); -} - -cudf::size_type unordered_distinct_count(table_view const& keys, - null_equality nulls_equal, - rmm::cuda_stream_view stream) -{ - auto table_ptr = cudf::table_device_view::create(keys, stream); - auto const num_rows{table_ptr->num_rows()}; - auto const has_null = cudf::has_nulls(keys); - - hash_map_type key_map{compute_hash_table_size(num_rows), - COMPACTION_EMPTY_KEY_SENTINEL, - COMPACTION_EMPTY_VALUE_SENTINEL, - detail::hash_table_allocator_type{default_allocator{}, stream}, - stream.value()}; - - compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr}; - row_equality_comparator row_equal( 
- nullate::DYNAMIC{has_null}, *table_ptr, *table_ptr, nulls_equal); - - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); - - // TODO: debug the code below to improve efficiency: when nulls are equal, only non-null row - // indices are inserted into the hash map. - // auto const count = [&]() { - // std::size_t c = 0; - // // when nulls are equal and input has nulls, only non-null rows are inserted. Thus the - // // total distinct count equals the number of valid rows plus one (number of null rows) - // if ((nulls_equal == null_equality::EQUAL) and has_null) { - // thrust::counting_iterator stencil(0); - // auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; - // row_is_valid pred{static_cast(row_bitmask.data())}; - // // insert valid rows only - // key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, - // stream.value()); c = key_map.get_size() + 1; - // } else { - // key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - // c = key_map.get_size(); - // } - // return c; - // }(); - // - key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - auto count = key_map.get_size(); - - return count; -} - +namespace { /** * @brief Functor to check for `NAN` at an index in a `column_device_view`. 
* @@ -121,12 +60,11 @@ struct check_for_nan { * * @returns bool true if value at `index` is `NAN` and not null, else false */ - __device__ bool operator()(size_type index) + __device__ bool operator()(size_type index) const noexcept { return std::isnan(_input.data()[index]) and _input.is_valid(index); } - protected: cudf::column_device_view _input; }; @@ -175,16 +113,100 @@ struct has_nans { return false; } }; +} // namespace -template -cudf::size_type col_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream) +cudf::size_type distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) { - if (0 == input.size() || input.null_count() == input.size()) { return 0; } + auto table_ptr = cudf::table_device_view::create(keys, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 || not comp(i, i - 1)); }); +} - cudf::size_type nrows = input.size(); +cudf::size_type unordered_distinct_count(table_view const& keys, + null_equality nulls_equal, + rmm::cuda_stream_view stream) +{ + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows{table_ptr->num_rows()}; + auto const has_null = cudf::has_nulls(keys); + + hash_map_type key_map{compute_hash_table_size(num_rows), + COMPACTION_EMPTY_KEY_SENTINEL, + COMPACTION_EMPTY_VALUE_SENTINEL, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + + compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr}; + row_equality_comparator row_equal( + nullate::DYNAMIC{has_null}, *table_ptr, *table_ptr, nulls_equal); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] 
__device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); + + // TODO: debug the code below to improve efficiency: when nulls are equal, only non-null row + // indices are inserted into the hash map. + // auto const count = [&]() { + // std::size_t c = 0; + // // when nulls are equal and input has nulls, only non-null rows are inserted. Thus the + // // total distinct count equals the number of valid rows plus one (number of null rows) + // if ((nulls_equal == null_equality::EQUAL) and has_null) { + // thrust::counting_iterator stencil(0); + // auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; + // row_is_valid pred{static_cast(row_bitmask.data())}; + // // insert valid rows only + // key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, + // stream.value()); c = key_map.get_size() + 1; + // } else { + // key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + // c = key_map.get_size(); + // } + // return c; + // }(); + // + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + auto count = key_map.get_size(); + + return count; +} + +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + rmm::cuda_stream_view stream) +{ + auto const num_rows = input.size(); + + if (0 == num_rows || input.null_count() == num_rows) { return 0; } + + auto input_device_view = cudf::column_device_view::create(input, stream); + auto device_view = *input_device_view; + auto count_nulls = null_handling == null_policy::INCLUDE; + auto t_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(t_view, stream); + row_equality_comparator comp( + nullate::DYNAMIC{cudf::has_nulls(t_view)}, *table_ptr, *table_ptr, null_equality::EQUAL); + + return thrust::count_if(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [count_nulls, device_view, comp] __device__(cudf::size_type i) { + if 
((not count_nulls) and device_view.is_null(i)) { return false; } + return (i == 0 || not comp(i, i - 1)); + }); +} + +cudf::size_type unordered_distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling, + rmm::cuda_stream_view stream) +{ + if (0 == input.size() || input.null_count() == input.size()) { return 0; } bool has_nan = false; // Check for Nans @@ -196,12 +218,7 @@ cudf::size_type col_distinct_count(column_view const& input, has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } - auto count = [&]() { - if constexpr (unordered) { - return detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); - } - return detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); - }(); + auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; @@ -211,32 +228,12 @@ cudf::size_type col_distinct_count(column_view const& input, else return count; } - -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream) -{ - auto constexpr unordered = false; - return col_distinct_count(input, null_handling, nan_handling, stream); -} - -cudf::size_type unordered_distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling, - rmm::cuda_stream_view stream) -{ - auto constexpr unordered = true; - return col_distinct_count(input, null_handling, nan_handling, stream); -} } // namespace detail -cudf::size_type distinct_count(column_view const& input, - null_policy null_handling, - nan_policy nan_handling) +cudf::size_type distinct_count(column_view const& input, null_policy null_handling) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling, nan_handling); + 
return detail::distinct_count(input, null_handling); } cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index e5bd3e857dc..f7b784be91e 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -14,24 +14,27 @@ * limitations under the License. */ -#include -#include -#include #include #include #include #include #include #include + #include #include #include #include #include +#include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; + template struct DistinctCountCommon : public cudf::test::BaseFixture { }; @@ -47,10 +50,18 @@ TYPED_TEST(DistinctCountCommon, NoNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - cudf::size_type expected = std::set(input.begin(), input.end()).size(); + cudf::size_type expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + /* + std::vector input_data; + std::copy(input.begin(), input.end(), std::back_inserter(input_data)); + auto const new_end = std::unique(input_data.begin(), input_data.end()); + auto const gold_ordered = new_end - input_data.begin(); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::INCLUDE)); + */ } TYPED_TEST(DistinctCountCommon, TableNoNull) @@ -58,9 +69,9 @@ TYPED_TEST(DistinctCountCommon, TableNoNull) using T = TypeParam; auto const input1 = cudf::test::make_type_param_vector( - {1, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); + {1, 3, 3, 3, 4, 31, 1, 8, 2, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4}); auto const input2 = cudf::test::make_type_param_vector( - {3, 3, 4, 31, 1, 8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); + {3, 3, 3, 4, 31, 1, 
8, 5, 0, 4, 1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1}); std::vector> pair_input; std::transform( @@ -76,6 +87,12 @@ TYPED_TEST(DistinctCountCommon, TableNoNull) cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); + + /* + auto const new_end = std::unique(pair_input.begin(), pair_input.end()); + auto const gold_ordered = new_end - pair_input.begin(); + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); + */ } struct DistinctCount : public cudf::test::BaseFixture { @@ -86,8 +103,8 @@ TEST_F(DistinctCount, WithNull) using T = int32_t; // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; - std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, + std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 70, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); @@ -114,6 +131,12 @@ TEST_F(DistinctCount, IgnoringNull) EXPECT_EQ( expected - 2, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + + /* + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = new_end - input.begin() - 5; + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::EXCLUDE)); + */ } TEST_F(DistinctCount, WithNansAndNull) diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 89acdc63c34..5ae0a690fb9 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -14,21 +14,23 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include #include #include #include #include + #include #include #include #include #include +#include +#include +#include + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; From d810e0ba335ff8bcd0a189c8c3ebed804bd0b2fe Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sat, 15 Jan 2022 17:50:39 -0500 Subject: [PATCH 18/50] Update unit tests --- cpp/src/stream_compaction/distinct_count.cu | 4 ++++ .../stream_compaction/distinct_count_tests.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index e0bc9405d9c..e9cc0843228 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -68,6 +68,8 @@ struct check_for_nan { cudf::column_device_view _input; }; +#define UNUSED(x) (void)(x) + /** * @brief A structure to be used along with type_dispatcher to check if a * `column_view` has `NAN`. 
@@ -110,6 +112,8 @@ struct has_nans { template ::value>* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { + UNUSED(input); + UNUSED(stream); return false; } }; diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index f7b784be91e..91e76f5ed2d 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -50,18 +50,16 @@ TYPED_TEST(DistinctCountCommon, NoNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - cudf::size_type expected = std::set(input.begin(), input.end()).size(); + cudf::size_type expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); - /* - std::vector input_data; + std::vector input_data; std::copy(input.begin(), input.end(), std::back_inserter(input_data)); auto const new_end = std::unique(input_data.begin(), input_data.end()); auto const gold_ordered = new_end - input_data.begin(); EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::INCLUDE)); - */ } TYPED_TEST(DistinctCountCommon, TableNoNull) @@ -88,11 +86,9 @@ TYPED_TEST(DistinctCountCommon, TableNoNull) cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); - /* auto const new_end = std::unique(pair_input.begin(), pair_input.end()); auto const gold_ordered = new_end - pair_input.begin(); EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); - */ } struct DistinctCount : public cudf::test::BaseFixture { @@ -113,6 +109,10 @@ TEST_F(DistinctCount, WithNull) EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), 
input.end()); + auto const gold_ordered = new_end - input.begin() - 3; + EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::EXCLUDE)); } TEST_F(DistinctCount, IgnoringNull) @@ -132,11 +132,9 @@ TEST_F(DistinctCount, IgnoringNull) expected - 2, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); - /* auto const new_end = std::unique(input.begin(), input.end()); auto const gold_ordered = new_end - input.begin() - 5; EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::EXCLUDE)); - */ } TEST_F(DistinctCount, WithNansAndNull) @@ -169,6 +167,7 @@ TEST_F(DistinctCount, WithNansOnly) EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE)); } TEST_F(DistinctCount, NansAsNullWithNoNull) @@ -226,6 +225,7 @@ TEST_F(DistinctCount, EmptyColumn) EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::EXCLUDE)); } TEST_F(DistinctCount, StringColumnWithNull) From 40cc410709a46a86f94bf9307f0c41ecb05c7fab Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Sun, 16 Jan 2022 23:21:01 -0500 Subject: [PATCH 19/50] Add nan handling to distinct_count + update unit tests --- cpp/include/cudf/detail/stream_compaction.hpp | 1 + cpp/include/cudf/stream_compaction.hpp | 9 +- cpp/src/stream_compaction/distinct_count.cu | 114 ++++++++++++------ .../distinct_count_tests.cpp | 53 +++++--- 4 files changed, 118 insertions(+), 59 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index dd52438f9ee..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -94,6 +94,7 @@ std::unique_ptr
unordered_drop_duplicates( */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, + nan_policy nan_handling, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 91f5395421a..885f96d9ee4 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -278,12 +278,17 @@ std::unique_ptr
unordered_drop_duplicates( * `NaN` and `null` values are ignored. If `null_handling` is null_policy::EXCLUDE and * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in count. * + * `null`s are handled as equal. + * * @param[in] input View of the input column * @param[in] null_handling flag to include or ignore `null` while counting + * @param[in] nan_handling flag to consider `NaN==null` or not * * @return number of consecutive groups in the column */ -cudf::size_type distinct_count(column_view const& input, null_policy null_handling); +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling); /** * @brief Count the number of consecutive groups of equivalent elements in a table. @@ -308,6 +313,8 @@ cudf::size_type distinct_count(table_view const& input, * `nan_handling` is nan_policy::NAN_IS_VALID, only `null` is ignored, `NaN` is considered in unique * count. * + * `null`s are handled as equal. + * * @param[in] input The column_view whose unique elements will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index e9cc0843228..be30ace49e1 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -117,6 +117,47 @@ struct has_nans { return false; } }; + +/** + * @brief A structure to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NAN`. + */ +struct check_nan { + /** + * @brief Checks if the row `index` of `input` is `NAN`. + * + * @note This will be applicable only for floating point type columns. 
+ * + * @param[in] input The `column_device_view` which will be checked for `NAN` + * @param[in] index The index at which the `NAN` needs to be checked in `input` + * + * @returns bool true if value at `index` is `NAN`, else false + */ + template ::value>* = nullptr> + __device__ bool operator()(column_device_view const& input, size_type index) + { + return std::isnan(input.data()[index]); + } + /** + * @brief Checks if the row `index` of `input` is `NAN`. + * + * @note This will be applicable for non-floating point type columns. And + * non-floating point columns can never have `NAN`, so it will always return + * false. + * + * @param[in] input The `column_device_view` which will be checked for `NAN` + * @param[in] index The index at which the `NAN` needs to be checked in `input` + * + * @returns bool true if value at `index` is `NAN`, else false + */ + template ::value>* = nullptr> + __device__ bool operator()(column_device_view const& input, size_type index) + { + UNUSED(input); + UNUSED(index); + return false; + } +}; } // namespace cudf::size_type distinct_count(table_view const& keys, @@ -139,7 +180,7 @@ cudf::size_type unordered_distinct_count(table_view const& keys, { auto table_ptr = cudf::table_device_view::create(keys, stream); auto const num_rows{table_ptr->num_rows()}; - auto const has_null = cudf::has_nulls(keys); + auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; hash_map_type key_map{compute_hash_table_size(num_rows), COMPACTION_EMPTY_KEY_SENTINEL, @@ -147,62 +188,51 @@ cudf::size_type unordered_distinct_count(table_view const& keys, detail::hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; - compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr}; - row_equality_comparator row_equal( - nullate::DYNAMIC{has_null}, *table_ptr, *table_ptr, nulls_equal); + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); auto 
iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); - // TODO: debug the code below to improve efficiency: when nulls are equal, only non-null row - // indices are inserted into the hash map. - // auto const count = [&]() { - // std::size_t c = 0; - // // when nulls are equal and input has nulls, only non-null rows are inserted. Thus the - // // total distinct count equals the number of valid rows plus one (number of null rows) - // if ((nulls_equal == null_equality::EQUAL) and has_null) { - // thrust::counting_iterator stencil(0); - // auto const row_bitmask = cudf::detail::bitmask_and(keys, stream).first; - // row_is_valid pred{static_cast(row_bitmask.data())}; - // // insert valid rows only - // key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, - // stream.value()); c = key_map.get_size() + 1; - // } else { - // key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - // c = key_map.get_size(); - // } - // return c; - // }(); - // key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - auto count = key_map.get_size(); - - return count; + return key_map.get_size(); } cudf::size_type distinct_count(column_view const& input, null_policy null_handling, + nan_policy nan_handling, rmm::cuda_stream_view stream) { auto const num_rows = input.size(); if (0 == num_rows || input.null_count() == num_rows) { return 0; } + auto const count_nulls = null_handling == null_policy::INCLUDE; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count_nulls = null_handling == null_policy::INCLUDE; auto t_view = table_view{{input}}; auto table_ptr = cudf::table_device_view::create(t_view, stream); row_equality_comparator comp( nullate::DYNAMIC{cudf::has_nulls(t_view)}, *table_ptr, *table_ptr, 
null_equality::EQUAL); - return thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_rows), - [count_nulls, device_view, comp] __device__(cudf::size_type i) { - if ((not count_nulls) and device_view.is_null(i)) { return false; } - return (i == 0 || not comp(i, i - 1)); - }); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + [count_nulls, nan_is_null, device_view, comp] __device__(cudf::size_type i) { + bool is_nan = cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + if (count_nulls) { + if (nan_is_null and (is_nan or device_view.is_null(i))) { + bool prev_is_nan = + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + return (i == 0 or not(device_view.is_null(i - 1) or prev_is_nan)); + } + } else { + if (device_view.is_null(i) or (nan_is_null and is_nan)) { return false; } + } + return (i == 0 or not comp(i, i - 1)); + }); } cudf::size_type unordered_distinct_count(column_view const& input, @@ -212,32 +242,36 @@ cudf::size_type unordered_distinct_count(column_view const& input, { if (0 == input.size() || input.null_count() == input.size()) { return 0; } + auto const has_null = input.has_nulls(); + bool has_nan = false; // Check for Nans // Checking for nulls in input and flag nan_handling, as the count will // only get affected if these two conditions are true. NAN will only be // be an extra if nan_handling was NAN_IS_NULL and input also had null, which // will increase the count by 1. 
- if (input.has_nulls() and nan_handling == nan_policy::NAN_IS_NULL) { + if (has_null and nan_handling == nan_policy::NAN_IS_NULL) { has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and input.has_nulls()) --count; + if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and has_null) --count; - if (null_handling == null_policy::EXCLUDE and input.has_nulls()) + if (null_handling == null_policy::EXCLUDE and has_null) return --count; else return count; } } // namespace detail -cudf::size_type distinct_count(column_view const& input, null_policy null_handling) +cudf::size_type distinct_count(column_view const& input, + null_policy null_handling, + nan_policy nan_handling) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling); + return detail::distinct_count(input, null_handling, nan_handling); } cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 91e76f5ed2d..8c7042363c5 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -50,7 +50,7 @@ TYPED_TEST(DistinctCountCommon, NoNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); - cudf::size_type expected = std::set(input.begin(), input.end()).size(); + cudf::size_type const expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); @@ -59,7 +59,8 @@ TYPED_TEST(DistinctCountCommon, NoNull) std::copy(input.begin(), input.end(), std::back_inserter(input_data)); auto const new_end = 
std::unique(input_data.begin(), input_data.end()); auto const gold_ordered = new_end - input_data.begin(); - EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::INCLUDE)); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TYPED_TEST(DistinctCountCommon, TableNoNull) @@ -83,7 +84,8 @@ TYPED_TEST(DistinctCountCommon, TableNoNull) std::vector cols{input_col1, input_col2}; cudf::table_view input_table(cols); - cudf::size_type expected = std::set>(pair_input.begin(), pair_input.end()).size(); + cudf::size_type const expected = + std::set>(pair_input.begin(), pair_input.end()).size(); EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); auto const new_end = std::unique(pair_input.begin(), pair_input.end()); @@ -105,14 +107,15 @@ TEST_F(DistinctCount, WithNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - cudf::size_type expected = std::set(input.begin(), input.end()).size(); + cudf::size_type const expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); auto const new_end = std::unique(input.begin(), input.end()); auto const gold_ordered = new_end - input.begin() - 3; - EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::EXCLUDE)); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, IgnoringNull) @@ -126,7 +129,7 @@ TEST_F(DistinctCount, IgnoringNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - cudf::size_type expected = std::set(input.begin(), input.end()).size(); + cudf::size_type const expected = std::set(input.begin(), input.end()).size(); // Removing 2 from expected to remove count for 70 and 3 EXPECT_EQ( expected - 2, @@ -134,7 +137,8 @@ 
TEST_F(DistinctCount, IgnoringNull) auto const new_end = std::unique(input.begin(), input.end()); auto const gold_ordered = new_end - input.begin() - 5; - EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::EXCLUDE)); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansAndNull) @@ -143,15 +147,20 @@ TEST_F(DistinctCount, WithNansAndNull) std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; - std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - cudf::size_type expected = std::set(input.begin(), input.end()).size(); + cudf::size_type const expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( - expected, + expected + 1, // +1 since `NAN` is not in std::set cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + auto const new_end = std::unique(input.begin(), input.end()); + auto const gold_ordered = new_end - input.begin(); + EXPECT_EQ(gold_ordered, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansOnly) @@ -163,11 +172,12 @@ TEST_F(DistinctCount, WithNansOnly) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - cudf::size_type expected = 5; + constexpr auto expected = 5; EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); - EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, NansAsNullWithNoNull) @@ -179,10 +189,12 @@ TEST_F(DistinctCount, NansAsNullWithNoNull) 
cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - cudf::size_type expected = 5; + constexpr auto expected = 5; EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, NansAsNullWithNull) @@ -194,10 +206,12 @@ TEST_F(DistinctCount, NansAsNullWithNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - cudf::size_type expected = 4; + constexpr auto expected = 4; EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, NansAsNullWithIgnoreNull) @@ -209,10 +223,12 @@ TEST_F(DistinctCount, NansAsNullWithIgnoreNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - cudf::size_type expected = 3; + constexpr auto expected = 3; EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, EmptyColumn) @@ -221,11 +237,12 @@ TEST_F(DistinctCount, EmptyColumn) cudf::test::fixed_width_column_wrapper input_col{}; - cudf::size_type expected = 0; + constexpr auto expected = 0; EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); - EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::EXCLUDE)); + EXPECT_EQ(expected, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, StringColumnWithNull) @@ -234,7 +251,7 @@ TEST_F(DistinctCount, StringColumnWithNull) {"", "this", "is", "this", "This", 
"a", "column", "of", "the", "strings"}, {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}}; - cudf::size_type expected = + cudf::size_type const expected = (std::vector{"", "this", "is", "This", "a", "column", "of", "strings"}).size(); EXPECT_EQ( expected, From dd91e64f9943aab18896f4bae3eb7b1851dfdb57 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 17 Jan 2022 10:33:48 -0500 Subject: [PATCH 20/50] Rename drop_duplicates as sort_and_drop_duplicates --- .../drop_duplicates_benchmark.cpp | 2 +- cpp/include/cudf/detail/stream_compaction.hpp | 4 +- cpp/include/cudf/stream_compaction.hpp | 2 +- cpp/src/stream_compaction/drop_duplicates.cu | 36 +++--- .../drop_duplicates_tests.cpp | 111 ++++++++++++++++-- java/src/main/native/src/TableJni.cpp | 2 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 9 +- python/cudf/cudf/_lib/stream_compaction.pyx | 2 +- 8 files changed, 131 insertions(+), 37 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 1f1bd0505d6..8a6f0e09585 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -54,7 +54,7 @@ void BM_compaction(benchmark::State& state, if constexpr (Algo == algorithm::HASH_BASED) { return cudf::unordered_drop_duplicates(input_table, {0}); } else { - return cudf::drop_duplicates(input_table, {0}, keep); + return cudf::sort_and_drop_duplicates(input_table, {0}, keep); } }(); } diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 3d065556827..716ecfede01 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -62,11 +62,11 @@ std::unique_ptr
apply_boolean_mask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::drop_duplicates + * @copydoc cudf::sort_and_drop_duplicates * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
drop_duplicates( +std::unique_ptr
sort_and_drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 885f96d9ee4..41762febd0e 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -237,7 +237,7 @@ enum class duplicate_keep_option { * * @return Table with sorted unique rows as per specified `keep`. */ -std::unique_ptr
drop_duplicates( +std::unique_ptr
sort_and_drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 97669705b5b..b54f812d7c3 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -101,13 +101,13 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, } } // namespace -std::unique_ptr
drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
sort_and_drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { return empty_like(input); @@ -132,6 +132,7 @@ std::unique_ptr
drop_duplicates(table_view const& input, stream, mr); } + std::unique_ptr
unordered_drop_duplicates(table_view const& input, std::vector const& keys, null_equality nulls_equal, @@ -143,8 +144,8 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, } auto keys_view = input.select(keys); - auto table_ptr = cudf::table_device_view::create(keys_view, stream); + auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)}; auto const num_rows{table_ptr->num_rows()}; hash_map_type key_map{compute_hash_table_size(num_rows), @@ -153,9 +154,8 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, detail::hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; - compaction_hash hash_key{nullate::DYNAMIC{cudf::has_nulls(keys_view)}, *table_ptr}; - row_equality_comparator row_equal( - nullate::DYNAMIC{cudf::has_nulls(keys_view)}, *table_ptr, *table_ptr, nulls_equal); + compaction_hash hash_key{has_null, *table_ptr}; + row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); @@ -191,15 +191,15 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, } // namespace detail -std::unique_ptr
drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option const keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::mr::device_memory_resource* mr) +std::unique_ptr
sort_and_drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option const keep, + null_equality nulls_equal, + null_order null_precedence, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_duplicates( + return detail::sort_and_drop_duplicates( input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); } diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 5ae0a690fb9..1c788c62528 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -63,6 +63,54 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } +TEST_F(DropDuplicate, KeepNonNullTable) +{ + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; + + cudf::table_view input{{col1, col2, col1_key, col2_key}}; + std::vector keys{2, 3}; + + // Keep first of duplicate + // The expected table would be sorted in ascending order with respect to keys + cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; + cudf::table_view expected_first{ + {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; + + auto got_first = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + + // keep last of duplicate + cudf::test::fixed_width_column_wrapper 
exp_col1_last{{5, 5, 4, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; + cudf::table_view expected_last{ + {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; + + auto got_last = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + + // Keep unique + cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; + cudf::table_view expected_unique{ + {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; + + auto got_unique = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); +} + TEST_F(DropDuplicate, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; @@ -91,6 +139,39 @@ TEST_F(DropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } +TEST_F(DropDuplicate, KeepWithNull) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; + + // Keep first of duplicate + cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; + auto got_first = 
sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + + // Keep last of duplicate + cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; + cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; + auto got_last = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + + // Keep unique of duplicate + cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; + cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; + auto got_unique = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); +} + TEST_F(DropDuplicate, StringKeyColumn) { cudf::test::fixed_width_column_wrapper col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; @@ -211,9 +292,12 @@ TEST_F(UnorderedDropDuplicate, StringKeyColumn) {0, 1, 1, 1, 1}}; cudf::table_view expected{{exp_col, exp_key_col}}; - auto result = unordered_drop_duplicates(input, keys); - auto key_view = result->select(keys.begin(), keys.end()); - auto sorted_result = cudf::sort_by_key(result->view(), key_view); + auto got_last = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_last->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } @@ -224,9 +308,12 @@ TEST_F(UnorderedDropDuplicate, EmptyInputTable) 
cudf::table_view input{{col}}; std::vector keys{1, 2}; - auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - + auto got = sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } TEST_F(UnorderedDropDuplicate, NoColumnInputTable) @@ -234,9 +321,12 @@ TEST_F(UnorderedDropDuplicate, NoColumnInputTable) cudf::table_view input{std::vector()}; std::vector keys{1, 2}; - auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - + auto got = sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } TEST_F(UnorderedDropDuplicate, EmptyKeys) @@ -246,7 +336,10 @@ TEST_F(UnorderedDropDuplicate, EmptyKeys) cudf::table_view input{{col}}; std::vector keys{}; - auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - + auto got = sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 828d163fe07..64475a5afcb 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2833,7 +2833,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = 
std::vector(native_keys_indices.begin(), native_keys_indices.end()); - auto result = cudf::drop_duplicates( + auto result = cudf::sort_and_drop_duplicates( *input, keys_indices, keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : cudf::duplicate_keep_option::KEEP_LAST, diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index c73368633bf..1a7ff14f4a1 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -33,10 +33,11 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] drop_duplicates(table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + + cdef unique_ptr[table] sort_and_drop_duplicates( + table_view source_table, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal) except + cdef size_type unordered_distinct_count( column_view source_table, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 3a627a8ea8d..760f52faf45 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -11,9 +11,9 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, - drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, + sort_and_drop_duplicates as cpp_drop_duplicates, unordered_distinct_count as cpp_distinct_count, ) from cudf._lib.cpp.table.table cimport table From d80911c63671b776fde8e142b6271e8ce85cbab1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 17 Jan 2022 13:53:08 -0500 Subject: [PATCH 21/50] Add consecutive drop_duplicates --- cpp/include/cudf/detail/stream_compaction.hpp | 13 + 
cpp/include/cudf/stream_compaction.hpp | 29 ++ cpp/src/stream_compaction/drop_duplicates.cu | 51 +++ .../drop_duplicates_tests.cpp | 367 ++++++++++-------- 4 files changed, 299 insertions(+), 161 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 716ecfede01..8a42b2b54f3 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -61,6 +61,19 @@ std::unique_ptr
apply_boolean_mask( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::drop_duplicates + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
drop_duplicates( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::sort_and_drop_duplicates * diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 41762febd0e..5a28ab52ee7 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -213,6 +213,35 @@ enum class duplicate_keep_option { KEEP_NONE ///< Keeps only unique elements }; +/** + * @brief Eliminates all except the row specified by `keep` from every consecutive group of + * equivalent rows. + * + * Given an `input` table_view, one row from a group of equivalent elements is copied to + * output table depending on the value of @p keep: + * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied + * - KEEP_LAST: only the last of a sequence of duplicate rows is copied + * - KEEP_NONE: no duplicate rows are copied + * + * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * + * @param[in] input input table_view to copy only unique rows + * @param[in] keys vector of indices representing key columns from `input` + * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not + * equal if null_equality::UNEQUAL + * @param[in] mr Device memory resource used to allocate the returned table's device + * memory + * + * @return Table with unique rows from each sequence of equivalent rows as per specified `keep`. + */ +std::unique_ptr
drop_duplicates( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create a new table without duplicate rows. * diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index b54f812d7c3..00fdadd0085 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -101,6 +101,47 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, } } // namespace +std::unique_ptr
drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.num_rows(); + if (0 == num_rows || 0 == input.num_columns() || 0 == keys.size()) { return empty_like(input); } + + auto unique_indices = + make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); + auto keys_view = input.select(keys); + auto device_keys_view = cudf::table_device_view::create(keys_view, stream); + auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)}, + *device_keys_view, + *device_keys_view, + nulls_equal); + + // get indices of unique rows + auto result_end = unique_copy(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + mutable_view->begin(), + row_equal, + keep, + stream); + auto indices_view = + cudf::detail::slice(column_view(*unique_indices), + 0, + thrust::distance(mutable_view->begin(), result_end)); + + // gather unique rows and return + return detail::gather(input, + indices_view, + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + std::unique_ptr
sort_and_drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option keep, @@ -191,6 +232,16 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, } // namespace detail +std::unique_ptr
drop_duplicates(table_view const& input, + std::vector const& keys, + duplicate_keep_option const keep, + null_equality nulls_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); +} + std::unique_ptr
sort_and_drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option const keep, diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 1c788c62528..ee598831ddb 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -35,13 +35,100 @@ using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; +struct DropDuplicateCommon : public cudf::test::BaseFixture { +}; + +TEST_F(DropDuplicateCommon, StringKeyColumn) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 1, 0, 1, 1}}; + cudf::table_view input{{col, key_col}}; + std::vector keys{1}; + cudf::test::fixed_width_column_wrapper exp_col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::strings_column_wrapper exp_key_col{{"all", "new", "all", "new", "the", "strings"}, + {1, 1, 1, 0, 1, 1}}; + cudf::table_view expected{{exp_col, exp_key_col}}; + + auto got = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view()); + + cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; + cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, + {0, 1, 1, 1, 1}}; + cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; + + auto got_sort = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys); + auto key_view = got_unordered->select(keys.begin(), keys.end()); + auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, 
sorted_result->view()); +} + +TEST_F(DropDuplicateCommon, EmptyInputTable) +{ + cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector keys{1, 2}; + + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + + auto got_sort = sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_sort->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); +} + +TEST_F(DropDuplicateCommon, NoColumnInputTable) +{ + cudf::table_view input{std::vector()}; + std::vector keys{1, 2}; + + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); + + auto got_sort = sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_sort->view()); + + auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); +} + +TEST_F(DropDuplicateCommon, EmptyKeys) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper empty_col{}; + cudf::table_view input{{col}}; + std::vector keys{}; + + auto got = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); + + auto got_sort = sort_and_drop_duplicates( + input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_sort->view()); + + auto got_unordered = 
unordered_drop_duplicates(input, keys, null_equality::EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); +} + struct DropDuplicate : public cudf::test::BaseFixture { }; TEST_F(DropDuplicate, NonNullTable) { - cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; @@ -50,20 +137,122 @@ TEST_F(DropDuplicate, NonNullTable) // Keep first of duplicate // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key{{21, 20, 19, 20, 9}}; - cudf::table_view expected{{exp_col1, exp_col2, exp_col1_key, exp_col2_key}}; + cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_first{{20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_first{{19, 20, 20, 9, 21}}; + cudf::table_view expected_first{ + {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; - auto result = unordered_drop_duplicates(input, keys); - auto key_view = result->select(keys.begin(), keys.end()); - auto sorted_result = cudf::sort_by_key(result->view(), key_view); + auto got_first = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); + 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); + + // keep last of duplicate + cudf::test::fixed_width_column_wrapper exp_col1_last{{4, 3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper exp_col2_last{{5, 3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_last{{20, 20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_last{{19, 20, 20, 9, 21}}; + cudf::table_view expected_last{ + {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; + + auto got_last = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); + + // Keep unique + cudf::test::fixed_width_column_wrapper exp_col1_unique{{3, 5, 8, 5}}; + cudf::test::fixed_width_column_wrapper exp_col2_unique{{3, 4, 9, 4}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{20, 19, 21, 9}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{20, 20, 9, 21}}; + cudf::table_view expected_unique{ + {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; + + auto got_unique = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, KeepNonNullTable) +TEST_F(DropDuplicate, WithNull) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; + + // Keep first of duplicate + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{5, 3, 5, 8, 1}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 20, 19, 21, 19}, + {1, 0, 1, 1, 1}}; + cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; + auto got_first_equal = + drop_duplicates(input, 
keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{5, 3, 2, 5, 8, 1}, + {1, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{ + {20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; + auto got_first_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); + + // Keep last of duplicate + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{4, 2, 5, 8, 1}, + {0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 20, 19, 21, 19}, + {1, 0, 1, 1, 1}}; + cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; + auto got_last_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{4, 3, 2, 5, 8, 1}, + {0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 20, 19, 21, 19}, + {1, 0, 0, 1, 1, 1}}; + cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; + auto got_last_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); + + // Keep unique + // nulls are equal + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 
1}}; + cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; + auto got_unique_equal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); + + // nulls are unequal + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 5, 8, 1}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 19, 21, 19}, + {0, 0, 1, 1, 1}}; + cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; + auto got_unique_unequal = + drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); +} + +struct SortedDropDuplicate : public cudf::test::BaseFixture { +}; + +TEST_F(SortedDropDuplicate, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -111,35 +300,7 @@ TEST_F(DropDuplicate, KeepNonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; - cudf::table_view input{{col, key}}; - std::vector keys{1}; - - // Nulls are equal - cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; - auto res_equal = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - auto equal_keys = res_equal->select(keys.begin(), keys.end()); - auto sorted_equal = cudf::sort_by_key(res_equal->view(), equal_keys); - - 
CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); - - // Nulls are unequal - cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, - {0, 1, 0, 1, 1}}; - cudf::table_view expected_unequal{{exp_unequal_col, exp_unequal_key_col}}; - auto res_unequal = unordered_drop_duplicates(input, keys, null_equality::UNEQUAL); - auto sorted_unequal = cudf::sort(res_unequal->view()); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); -} - -TEST_F(DropDuplicate, KeepWithNull) +TEST_F(SortedDropDuplicate, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; @@ -147,6 +308,7 @@ TEST_F(DropDuplicate, KeepWithNull) std::vector keys{1}; // Keep first of duplicate + // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; @@ -163,7 +325,7 @@ TEST_F(DropDuplicate, KeepWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique of duplicate + // Keep unique cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; @@ -172,58 +334,6 @@ TEST_F(DropDuplicate, KeepWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, StringKeyColumn) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, 
key_col}}; - std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected{{exp_col, exp_key_col}}; - - auto result = unordered_drop_duplicates(input, keys); - auto key_view = result->select(keys.begin(), keys.end()); - auto sorted_result = cudf::sort_by_key(result->view(), key_view); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); -} - -TEST_F(DropDuplicate, EmptyInputTable) -{ - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - cudf::table_view input{{col}}; - std::vector keys{1, 2}; - - auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -} - -TEST_F(DropDuplicate, NoColumnInputTable) -{ - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; - - auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); -} - -TEST_F(DropDuplicate, EmptyKeys) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); -} - struct UnorderedDropDuplicate : public cudf::test::BaseFixture { }; @@ -237,7 +347,6 @@ TEST_F(UnorderedDropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1{{5, 5, 6, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2{{4, 4, 6, 3, 9}}; @@ -259,7 +368,7 @@ TEST_F(UnorderedDropDuplicate, 
WithNull) cudf::table_view input{{col, key}}; std::vector keys{1}; - // Nulls are equal + // nulls are equal cudf::test::fixed_width_column_wrapper exp_equal_col{{4, 1, 5, 8}, {0, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_equal_key_col{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_equal{{exp_equal_col, exp_equal_key_col}}; @@ -269,7 +378,7 @@ TEST_F(UnorderedDropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_equal, sorted_equal->view()); - // Nulls are unequal + // nulls are unequal cudf::test::fixed_width_column_wrapper exp_unequal_col{{4, 1, 4, 5, 8}, {0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_unequal_key_col{{20, 19, 20, 20, 21}, {0, 1, 0, 1, 1}}; @@ -279,67 +388,3 @@ TEST_F(UnorderedDropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unequal, sorted_unequal->view()); } - -TEST_F(UnorderedDropDuplicate, StringKeyColumn) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view input{{col, key_col}}; - std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col{{"new", "all", "new", "strings", "the"}, - {0, 1, 1, 1, 1}}; - cudf::table_view expected{{exp_col, exp_key_col}}; - - auto got_last = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_last->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys); - auto key_view = got_unordered->select(keys.begin(), keys.end()); - auto sorted_result = cudf::sort_by_key(got_unordered->view(), key_view); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); -} - -TEST_F(UnorderedDropDuplicate, EmptyInputTable) -{ - cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); - 
cudf::table_view input{{col}}; - std::vector keys{1, 2}; - - auto got = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); -} - -TEST_F(UnorderedDropDuplicate, NoColumnInputTable) -{ - cudf::table_view input{std::vector()}; - std::vector keys{1, 2}; - - auto got = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); -} - -TEST_F(UnorderedDropDuplicate, EmptyKeys) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper empty_col{}; - cudf::table_view input{{col}}; - std::vector keys{}; - - auto got = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); - - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); -} From 7ced99532b30475a4b90034476f9412d784bf58a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 17 Jan 2022 15:13:52 -0500 Subject: [PATCH 22/50] Optimize unordered_distinct_count: insert non-null rows only to improve efficiency --- cpp/src/stream_compaction/distinct_count.cu | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index be30ace49e1..e12a62442f2 100644 --- a/cpp/src/stream_compaction/distinct_count.cu 
+++ b/cpp/src/stream_compaction/distinct_count.cu @@ -178,8 +178,8 @@ cudf::size_type unordered_distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) { - auto table_ptr = cudf::table_device_view::create(keys, stream); - auto const num_rows{table_ptr->num_rows()}; + auto table_ptr = cudf::table_device_view::create(keys, stream); + auto const num_rows = table_ptr->num_rows(); auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; hash_map_type key_map{compute_hash_table_size(num_rows), @@ -190,12 +190,23 @@ cudf::size_type unordered_distinct_count(table_view const& keys, compaction_hash hash_key{has_null, *table_ptr}; row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); - auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); - key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - return key_map.get_size(); + auto const count = [&]() { + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_is_valid pred{static_cast(row_bitmask.data())}; + + // when nulls are equal, insert non-null rows only + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast((null_count > 0) ? 
1 : 0); + } + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); + }(); + return static_cast(count); } cudf::size_type distinct_count(column_view const& input, From 2ea5d8ed304ec9cd0161f08cdfe443362c3fe1bd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 17 Jan 2022 16:08:01 -0500 Subject: [PATCH 23/50] Update cuco git tag --- cpp/cmake/thirdparty/get_cucollections.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 16e7a58b020..c1bd9b1a093 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 193de1aa74f5721717f991ca757dc610c852bb17 + GIT_TAG 922a87856aac17742fb964eeaf1b9bbc5d7a916e OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) From 4bb7b162f02b3c740da5bd3adf68bdffafaafedd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 18 Jan 2022 10:48:59 -0500 Subject: [PATCH 24/50] Silence unused argument warning via function prototyping --- cpp/src/stream_compaction/distinct_count.cu | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index e12a62442f2..175083b1840 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -68,8 +68,6 @@ struct check_for_nan { cudf::column_device_view _input; }; -#define UNUSED(x) (void)(x) - /** * @brief A structure to be used along with type_dispatcher to check if a * `column_view` has `NAN`. 
@@ -110,10 +108,8 @@ struct has_nans { * @returns bool Always false as non-floating point columns can't have `NAN` */ template ::value>* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) + bool operator()(column_view const&, rmm::cuda_stream_view) { - UNUSED(input); - UNUSED(stream); return false; } }; @@ -151,10 +147,8 @@ struct check_nan { * @returns bool true if value at `index` is `NAN`, else false */ template ::value>* = nullptr> - __device__ bool operator()(column_device_view const& input, size_type index) + __device__ bool operator()(column_device_view const&, size_type) { - UNUSED(input); - UNUSED(index); return false; } }; From 012ca8b610a351caac7b1dc8a40db272f74c7d67 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 18 Jan 2022 11:53:46 -0500 Subject: [PATCH 25/50] Refactor compaction benchmark with nvbench --- cpp/benchmarks/CMakeLists.txt | 2 +- .../drop_duplicates_benchmark.cpp | 112 --------------- .../drop_duplicates_nvbench.cpp | 135 ++++++++++++++++++ 3 files changed, 136 insertions(+), 113 deletions(-) delete mode 100644 cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp create mode 100644 cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 370f84fc14a..e047bd2ec28 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_ben # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) +ConfigureNVBench(STREAM_COMPACTION_NVBENCH stream_compaction/drop_duplicates_nvbench.cpp) # 
################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp deleted file mode 100644 index 8a6f0e09585..00000000000 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -class Compaction : public cudf::benchmark { -}; -class HashCompaction : public cudf::benchmark { -}; - -enum class algorithm { SORT_BASED, HASH_BASED }; - -template -void BM_compaction(benchmark::State& state, - cudf::duplicate_keep_option keep = cudf::duplicate_keep_option::KEEP_FIRST) -{ - auto const n_rows = static_cast(state.range(0)); - - cudf::test::UniformRandomGenerator rand_gen(0, 100); - auto elements = cudf::detail::make_counting_transform_iterator( - 0, [&rand_gen](auto row) { return rand_gen.generate(); }); - auto valids = cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); - cudf::test::fixed_width_column_wrapper values(elements, elements + n_rows, valids); - - auto input_column = cudf::column_view(values); - auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); - - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto const result = [&]() { - if constexpr (Algo == algorithm::HASH_BASED) { - return cudf::unordered_drop_duplicates(input_table, {0}); - } else { - return cudf::sort_and_drop_duplicates(input_table, {0}, keep); - } - }(); - } -} - -#define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op - -// TYPE, OP -#define SORT_BENCHMARK_DEFINE(name, type, keep) \ - BENCHMARK_DEFINE_F(Compaction, name)(::benchmark::State & state) \ - { \ - BM_compaction(state, get_keep(keep)); \ - } \ - BENCHMARK_REGISTER_F(Compaction, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ - -#define COMPACTION_BENCHMARK_DEFINE(type, keep) \ - SORT_BENCHMARK_DEFINE(concat(type, _, keep), type, keep) - -// TYPE -#define HASH_BENCHMARK_DEFINE(type) \ - BENCHMARK_DEFINE_F(HashCompaction, type)(::benchmark::State & state) \ - { \ - BM_compaction(state); \ - } \ - BENCHMARK_REGISTER_F(HashCompaction, type) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ - -#define HASH_COMPACTION_BENCHMARK_DEFINE(type) HASH_BENCHMARK_DEFINE(type) - -using cudf::timestamp_ms; - -COMPACTION_BENCHMARK_DEFINE(bool, NONE); -COMPACTION_BENCHMARK_DEFINE(int8_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, NONE); -COMPACTION_BENCHMARK_DEFINE(int32_t, FIRST); -COMPACTION_BENCHMARK_DEFINE(int32_t, LAST); -COMPACTION_BENCHMARK_DEFINE(timestamp_ms, NONE); -COMPACTION_BENCHMARK_DEFINE(float, NONE); - -HASH_COMPACTION_BENCHMARK_DEFINE(bool); -HASH_COMPACTION_BENCHMARK_DEFINE(int8_t); 
-HASH_COMPACTION_BENCHMARK_DEFINE(int32_t); -HASH_COMPACTION_BENCHMARK_DEFINE(int64_t); -HASH_COMPACTION_BENCHMARK_DEFINE(timestamp_ms); -HASH_COMPACTION_BENCHMARK_DEFINE(float); diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp new file mode 100644 index 00000000000..782be30b60f --- /dev/null +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +enum class algorithm { SORT_BASED, HASH_BASED }; + +// mandatory for enum types +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + algorithm, + // Callable to generate input strings: + // Short identifier used for tables, command-line args, etc. + // Used when context is available to figure out the enum type. + [](algorithm algo) { + switch (algo) { + case algorithm::SORT_BASED: return "SORT_BASED"; + case algorithm::HASH_BASED: return "HASH_BASED"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + // If non-empty, these are used in `--list` to describe values. + // Used when context may not be available to figure out the type from the + // input string. + // Just use `[](auto) { return std::string{}; }` if you don't want these. 
+ [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + // Enum type: + cudf::duplicate_keep_option, + // Callable to generate input strings: + // Short identifier used for tables, command-line args, etc. + // Used when context is available to figure out the enum type. + [](cudf::duplicate_keep_option option) { + switch (option) { + case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; + case cudf::duplicate_keep_option::KEEP_LAST: return "KEEP_LAST"; + case cudf::duplicate_keep_option::KEEP_NONE: return "KEEP_NONE"; + default: return "ERROR"; + } + }, + // Callable to generate descriptions: + // If non-empty, these are used in `--list` to describe values. + // Used when context may not be available to figure out the type from the + // input string. + // Just use `[](auto) { return std::string{}; }` if you don't want these. + [](auto) { return std::string{}; }) + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_compaction( + nvbench::state& state, + nvbench::type_list, nvbench::enum_type>) +{ + if constexpr ((not std::is_same_v and + Keep != cudf::duplicate_keep_option::KEEP_FIRST and + Algo == algorithm::SORT_BASED) or + (Algo == algorithm::HASH_BASED and + Keep != cudf::duplicate_keep_option::KEEP_FIRST)) { + state.skip("Skip unwanted benchmarks."); + } + + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto const result = [&]() { + if constexpr (Algo == algorithm::HASH_BASED) { + return cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); + } else { + return cudf::detail::sort_and_drop_duplicates(input_table, + {0}, + Keep, + cudf::null_equality::EQUAL, + cudf::null_order::BEFORE, + stream_view); + } + }(); + }); +} + +using data_type = nvbench::type_list; +using algo = nvbench::enum_type_list; +using keep_option = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_compaction, NVBENCH_TYPE_AXES(data_type, algo, keep_option)) + .set_name("drop_duplicates") + .set_type_axes_names({"Type", "Algorithm", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); From d489e2ec006f3259c0df3aaecf8894ad37686820 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 18 Jan 2022 12:27:24 -0500 Subject: [PATCH 26/50] Update copyright --- cpp/benchmarks/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e047bd2ec28..0e36b3001f2 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at From 3e47ffd4b4ac6e6aa45ca5d295f1d57a8229cd52 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 19 Jan 2022 10:46:27 -0500 Subject: [PATCH 27/50] Get rid of nvbench primitive types --- .../stream_compaction/drop_duplicates_nvbench.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp index 782be30b60f..8f2ec2bfb2f 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp @@ -118,12 +118,7 @@ void nvbench_compaction( }); } -using data_type = nvbench::type_list; +using data_type = nvbench::type_list; using algo = nvbench::enum_type_list; using keep_option = nvbench::enum_type_list Date: Wed, 19 Jan 2022 13:50:08 -0500 Subject: [PATCH 28/50] Update docs & comments --- cpp/include/cudf/stream_compaction.hpp | 39 +++++++------- cpp/src/dictionary/set_keys.cu | 3 +- cpp/src/stream_compaction/distinct_count.cu | 56 ++++++++++----------- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 5a28ab52ee7..3fb4a5f4df4 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -189,7 +189,7 @@ std::unique_ptr
drop_nans( * @note if @p input.num_rows() is zero, there is no error, and an empty table * is returned. * - * @throws cudf::logic_error if The `input` size and `boolean_mask` size mismatches. + * @throws cudf::logic_error if `input.num_rows() != boolean_mask.size()`. * @throws cudf::logic_error if `boolean_mask` is not `type_id::BOOL8` type. * * @param[in] input The input table_view to filter @@ -223,17 +223,17 @@ enum class duplicate_keep_option { * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * @throws cudf::logic_error if `input.num_rows() != keys.size()`. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] keep keep first row, last row, or no rows of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] mr Device memory resource used to allocate the returned table's device - * memory + * memory * - * @return Table with unique rows from each sequence of equivalent rows as per specified `keep`. + * @return Table with unique rows from each sequence of equivalent rows as specified by `keep`. */ std::unique_ptr
drop_duplicates( table_view const& input, @@ -245,7 +245,8 @@ std::unique_ptr
drop_duplicates( /** * @brief Create a new table without duplicate rows. * - * The output table is sorted according to the lexicographic ordering of the `keys` rows. + * The output table is sorted according to the lexicographic ordering of the data in the columns + * indexed by `keys`. * * Given an `input` table_view, each row is copied to output table if the corresponding * row of `keys` columns is unique, where the definition of unique depends on the value of @p keep: @@ -253,18 +254,18 @@ std::unique_ptr
drop_duplicates( * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * @throws cudf::logic_error if `input.num_rows() != keys.size()`. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first entry, last entry, or no entries if duplicates found + * @param[in] keep keep first row, last row, or no rows of the found duplicates * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device - * memory + * memory * - * @return Table with sorted unique rows as per specified `keep`. + * @return Table with sorted unique rows as specified by `keep`. */ std::unique_ptr
sort_and_drop_duplicates( table_view const& input, @@ -281,16 +282,14 @@ std::unique_ptr
sort_and_drop_duplicates( * row of `keys` columns is unique. If duplicate rows are present, it is unspecified which * row is copied. * - * Elements in the output table are in a random order. - * - * @throws cudf::logic_error if The `input` row size mismatches with `keys`. + * The order of elements in the output table is not specified. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not * equal if null_equality::UNEQUAL * @param[in] mr Device memory resource used to allocate the returned table's device - * memory + * memory * * @return Table with unique rows in an unspecified order. */ @@ -309,11 +308,11 @@ std::unique_ptr
unordered_drop_duplicates( * * `null`s are handled as equal. * - * @param[in] input View of the input column + * @param[in] input The column_view whose number of distinct consecutive groups will be counted * @param[in] null_handling flag to include or ignore `null` while counting * @param[in] nan_handling flag to consider `NaN==null` or not * - * @return number of consecutive groups in the column + * @return number of distinct consecutive groups in the column */ cudf::size_type distinct_count(column_view const& input, null_policy null_handling, @@ -322,12 +321,11 @@ cudf::size_type distinct_count(column_view const& input, /** * @brief Count the number of consecutive groups of equivalent elements in a table. * - * - * @param[in] input Table whose number of consecutive groups will be counted + * @param[in] input Table whose number of distinct consecutive groups will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal * nulls are not equal if null_equality::UNEQUAL * - * @return number of consecutive groups in the table + * @return number of distinct consecutive groups in the table */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); @@ -335,6 +333,8 @@ cudf::size_type distinct_count(table_view const& input, /** * @brief Count the unique elements in the column_view. * + * If `nulls_equal == null_equality::UNEQUAL`, all `null`s are unique. + * * Given an input column_view, number of unique elements in this column_view is returned. * * If `null_handling` is null_policy::EXCLUDE and `nan_handling` is nan_policy::NAN_IS_NULL, both @@ -357,7 +357,6 @@ cudf::size_type unordered_distinct_count(column_view const& input, /** * @brief Count the unique rows in a table. 
* - * * @param[in] input Table whose unique rows will be counted * @param[in] nulls_equal flag to denote if null elements should be considered equal * nulls are not equal if null_equality::UNEQUAL diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 9d82fdc9de2..c1fb1fa2180 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -121,7 +121,8 @@ std::unique_ptr set_keys( auto keys = dictionary_column.keys(); CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match"); - // copy the keys -- use unordered_drop_duplicates to make sure they are sorted and unique + // copy the keys -- use unordered_drop_duplicates to make sure they are unique, then + // sort the results. auto unique_keys = cudf::detail::unordered_drop_duplicates( table_view{{new_keys}}, std::vector{0}, null_equality::EQUAL, stream, mr); auto sorted_keys = cudf::detail::sort(unique_keys->view(), diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 175083b1840..8eccf4163f5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -40,7 +40,7 @@ namespace cudf { namespace detail { namespace { /** - * @brief Functor to check for `NAN` at an index in a `column_device_view`. + * @brief Functor to check for `NaN` at an index in a `column_device_view`. 
* * @tparam T The type of `column_device_view` */ @@ -54,11 +54,11 @@ struct check_for_nan { check_for_nan(cudf::column_device_view input) : _input{input} {} /** - * @brief Operator to be called to check for `NAN` at `index` in `_input` + * @brief Operator to be called to check for `NaN` at `index` in `_input` * - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN` and not null, else false + * @returns bool true if value at `index` is `NaN` and not null, else false */ __device__ bool operator()(size_type index) const noexcept { @@ -70,18 +70,18 @@ struct check_for_nan { /** * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has `NAN`. + * `column_view` has `NaN`. */ struct has_nans { /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool true if `input` has `NAN` else false + * @returns bool true if `input` has `NaN` else false */ template ::value>* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) @@ -96,16 +96,16 @@ struct has_nans { } /** - * @brief Checks if `input` has `NAN` + * @brief Checks if `input` has `NaN` * * @note This will be applicable only for non-floating point type columns. 
And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false * - * @param[in] input The `column_view` which will be checked for `NAN` + * @param[in] input The `column_view` which will be checked for `NaN` * @param[in] stream CUDA stream used for device memory operations and kernel launches. * - * @returns bool Always false as non-floating point columns can't have `NAN` + * @returns bool Always false as non-floating point columns can't have `NaN` */ template ::value>* = nullptr> bool operator()(column_view const&, rmm::cuda_stream_view) @@ -115,19 +115,19 @@ struct has_nans { }; /** - * @brief A structure to be used along with device type_dispatcher to check if - * the row `index` of `column_device_view` is `NAN`. + * @brief A functor to be used along with device type_dispatcher to check if + * the row `index` of `column_device_view` is `NaN`. */ struct check_nan { /** - * @brief Checks if the row `index` of `input` is `NAN`. + * @brief Checks if the row `index` of `input` is `NaN`. * * @note This will be applicable only for floating point type columns. * - * @param[in] input The `column_device_view` which will be checked for `NAN` - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] input The `column_device_view` which will be checked for `NaN` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN`, else false + * @returns bool true if value at `index` is `NaN`, else false */ template ::value>* = nullptr> __device__ bool operator()(column_device_view const& input, size_type index) @@ -135,16 +135,16 @@ struct check_nan { return std::isnan(input.data()[index]); } /** - * @brief Checks if the row `index` of `input` is `NAN`. + * @brief Checks if the row `index` of `input` is `NaN`. 
* * @note This will be applicable for non-floating point type columns. And - * non-floating point columns can never have `NAN`, so it will always return + * non-floating point columns can never have `NaN`, so it will always return * false. * - * @param[in] input The `column_device_view` which will be checked for `NAN` - * @param[in] index The index at which the `NAN` needs to be checked in `input` + * @param[in] input The `column_device_view` which will be checked for `NaN` + * @param[in] index The index at which the `NaN` needs to be checked in `input` * - * @returns bool true if value at `index` is `NAN`, else false + * @returns bool true if value at `index` is `NaN`, else false */ template ::value>* = nullptr> __device__ bool operator()(column_device_view const&, size_type) @@ -213,7 +213,7 @@ cudf::size_type distinct_count(column_view const& input, if (0 == num_rows || input.null_count() == num_rows) { return 0; } auto const count_nulls = null_handling == null_policy::INCLUDE; - auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; + auto const nan_is_null = nan_handling == nan_policy::NaN_IS_NULL; auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; auto t_view = table_view{{input}}; @@ -252,17 +252,17 @@ cudf::size_type unordered_distinct_count(column_view const& input, bool has_nan = false; // Check for Nans // Checking for nulls in input and flag nan_handling, as the count will - // only get affected if these two conditions are true. NAN will only be - // be an extra if nan_handling was NAN_IS_NULL and input also had null, which + // only get affected if these two conditions are true. NaN will only be + // be an extra if nan_handling was NaN_IS_NULL and input also had null, which // will increase the count by 1. 
- if (has_null and nan_handling == nan_policy::NAN_IS_NULL) { + if (has_null and nan_handling == nan_policy::NaN_IS_NULL) { has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and has_null) --count; + if (nan_handling == nan_policy::NaN_IS_NULL and has_nan and has_null) --count; if (null_handling == null_policy::EXCLUDE and has_null) return --count; From 5fb92c76a40a9cc1a5ae4aefae87877adc37dee0 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 19 Jan 2022 15:06:57 -0500 Subject: [PATCH 29/50] Address review comments --- cpp/src/stream_compaction/distinct_count.cu | 26 ++++----- cpp/src/stream_compaction/drop_duplicates.cu | 18 +++---- .../stream_compaction_common.cuh | 4 +- cpp/src/transform/encode.cu | 3 +- .../distinct_count_tests.cpp | 1 - .../drop_duplicates_tests.cpp | 53 +++++++++---------- python/cudf/cudf/_lib/stream_compaction.pyx | 8 +-- 7 files changed, 57 insertions(+), 56 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 8eccf4163f5..77efc7429f6 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -165,7 +165,7 @@ cudf::size_type distinct_count(table_view const& keys, rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(keys.num_rows()), - [comp] __device__(cudf::size_type i) { return (i == 0 || not comp(i, i - 1)); }); + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); } cudf::size_type unordered_distinct_count(table_view const& keys, @@ -191,7 +191,7 @@ cudf::size_type unordered_distinct_count(table_view const& keys, if (nulls_equal == null_equality::EQUAL and has_null) { thrust::counting_iterator stencil(0); auto const 
[row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); - row_is_valid pred{static_cast(row_bitmask.data())}; + row_validity pred{static_cast(row_bitmask.data())}; // when nulls are equal, insert non-null rows only key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); @@ -210,16 +210,18 @@ cudf::size_type distinct_count(column_view const& input, { auto const num_rows = input.size(); - if (0 == num_rows || input.null_count() == num_rows) { return 0; } + if (num_rows == 0 or num_rows == input.null_count()) { return 0; } auto const count_nulls = null_handling == null_policy::INCLUDE; - auto const nan_is_null = nan_handling == nan_policy::NaN_IS_NULL; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto t_view = table_view{{input}}; - auto table_ptr = cudf::table_device_view::create(t_view, stream); - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(t_view)}, *table_ptr, *table_ptr, null_equality::EQUAL); + auto input_table_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(input_table_view, stream); + row_equality_comparator comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, + *table_ptr, + *table_ptr, + null_equality::EQUAL); return thrust::count_if( rmm::exec_policy(stream), @@ -245,7 +247,7 @@ cudf::size_type unordered_distinct_count(column_view const& input, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() || input.null_count() == input.size()) { return 0; } + if (0 == input.size() or input.null_count() == input.size()) { return 0; } auto const has_null = input.has_nulls(); @@ -253,16 +255,16 @@ cudf::size_type unordered_distinct_count(column_view const& input, // Check for Nans // Checking for nulls in input and flag nan_handling, as the count will // only get affected if these two conditions are 
true. NaN will only be - // be an extra if nan_handling was NaN_IS_NULL and input also had null, which + // be an extra if nan_handling was NAN_IS_NULL and input also had null, which // will increase the count by 1. - if (has_null and nan_handling == nan_policy::NaN_IS_NULL) { + if (has_null and nan_handling == nan_policy::NAN_IS_NULL) { has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NaN_IS_NULL and has_nan and has_null) --count; + if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and has_null) --count; if (null_handling == null_policy::EXCLUDE and has_null) return --count; diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 00fdadd0085..6c714e41418 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -109,16 +109,16 @@ std::unique_ptr
drop_duplicates(table_view const& input, rmm::mr::device_memory_resource* mr) { auto const num_rows = input.num_rows(); - if (0 == num_rows || 0 == input.num_columns() || 0 == keys.size()) { return empty_like(input); } + if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } auto unique_indices = make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream, mr); auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); auto keys_view = input.select(keys); - auto device_keys_view = cudf::table_device_view::create(keys_view, stream); + auto keys_device_view = cudf::table_device_view::create(keys_view, stream); auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)}, - *device_keys_view, - *device_keys_view, + *keys_device_view, + *keys_device_view, nulls_equal); // get indices of unique rows @@ -150,7 +150,7 @@ std::unique_ptr
sort_and_drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } @@ -204,9 +204,9 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); auto counting_iter = thrust::make_counting_iterator(0); - rmm::device_uvector existences(num_rows, stream, mr); + rmm::device_uvector index_exists_in_map(num_rows, stream, mr); // enumerate all indices to check if they are present in the map. - key_map.contains(counting_iter, counting_iter + num_rows, existences.begin(), hash_key); + key_map.contains(counting_iter, counting_iter + num_rows, index_exists_in_map.begin(), hash_key); auto const output_size{key_map.get_size()}; @@ -217,9 +217,9 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, thrust::copy_if(rmm::exec_policy(stream), counting_iter, counting_iter + num_rows, - existences.begin(), + index_exists_in_map.begin(), mutable_view->begin(), - [] __device__(bool const b) { return b; }); + thrust::identity{}); // run gather operation to establish new order return detail::gather(input, diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 2540c5dc316..81ed7dc03f8 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -41,9 +41,9 @@ class compaction_hash { /** * @brief Device functor to determine if a row is valid. */ -class row_is_valid { +class row_validity { public: - row_is_valid(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} __device__ __inline__ bool operator()(const size_type& i) const noexcept { diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index a2b40cf3dee..400d1b26757 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -48,7 +48,8 @@ std::pair, std::unique_ptr> encode( std::vector column_order(num_cols, order::ASCENDING); std::vector null_precedence(num_cols, null_order::AFTER); - auto sorted_unique_keys = sort(unique_keys->view(), column_order, null_precedence, stream, mr); + auto sorted_unique_keys = + cudf::detail::sort(unique_keys->view(), column_order, null_precedence, stream, mr); auto indices_column = cudf::detail::lower_bound( sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 8c7042363c5..f92c711c920 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ 
-29,7 +29,6 @@ #include #include -#include using cudf::nan_policy; using cudf::null_equality; diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index ee598831ddb..c063acf57ce 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -29,16 +29,15 @@ #include #include -#include using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; -struct DropDuplicateCommon : public cudf::test::BaseFixture { +struct DropDuplicatesCommon : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicateCommon, StringKeyColumn) +TEST_F(DropDuplicatesCommon, StringKeyColumn) { cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}}; cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"}, @@ -68,7 +67,7 @@ TEST_F(DropDuplicateCommon, StringKeyColumn) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, sorted_result->view()); } -TEST_F(DropDuplicateCommon, EmptyInputTable) +TEST_F(DropDuplicatesCommon, EmptyInputTable) { cudf::test::fixed_width_column_wrapper col(std::initializer_list{}); cudf::table_view input{{col}}; @@ -86,7 +85,7 @@ TEST_F(DropDuplicateCommon, EmptyInputTable) CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DropDuplicateCommon, NoColumnInputTable) +TEST_F(DropDuplicatesCommon, NoColumnInputTable) { cudf::table_view input{std::vector()}; std::vector keys{1, 2}; @@ -103,7 +102,7 @@ TEST_F(DropDuplicateCommon, NoColumnInputTable) CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } -TEST_F(DropDuplicateCommon, EmptyKeys) +TEST_F(DropDuplicatesCommon, EmptyKeys) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper empty_col{}; @@ -122,10 +121,10 @@ TEST_F(DropDuplicateCommon, EmptyKeys) 
CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } -struct DropDuplicate : public cudf::test::BaseFixture { +struct DropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(DropDuplicate, NonNullTable) +TEST_F(DropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -135,7 +134,7 @@ TEST_F(DropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first of duplicate // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 3, 4, 9, 4}}; @@ -148,7 +147,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last of duplicate + // Keep the last of duplicate cudf::test::fixed_width_column_wrapper exp_col1_last{{4, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{5, 3, 4, 9, 4}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{20, 20, 19, 21, 9}}; @@ -160,7 +159,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{3, 4, 9, 4}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{20, 19, 21, 9}}; @@ -173,7 +172,7 @@ TEST_F(DropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicate, WithNull) +TEST_F(DropDuplicates, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, @@ -181,7 +180,7 @@ 
TEST_F(DropDuplicate, WithNull) cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate + // Keep the first of duplicate // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_first_equal{{5, 3, 5, 8, 1}, {1, 1, 1, 1, 1}}; @@ -204,7 +203,7 @@ TEST_F(DropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); - // Keep last of duplicate + // Keep the last of duplicate // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_last_equal{{4, 2, 5, 8, 1}, {0, 1, 1, 1, 1}}; @@ -227,7 +226,7 @@ TEST_F(DropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, got_last_unequal->view()); - // Keep unique + // Keep no duplicate rows // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}}; @@ -249,10 +248,10 @@ TEST_F(DropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -struct SortedDropDuplicate : public cudf::test::BaseFixture { +struct SortedDropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(SortedDropDuplicate, NonNullTable) +TEST_F(SortedDropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; @@ -262,7 +261,7 @@ TEST_F(SortedDropDuplicate, NonNullTable) cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep first of duplicate + // Keep the first of duplicate // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -275,7 +274,7 @@ TEST_F(SortedDropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // keep last 
of duplicate + // Keep the last of duplicate cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; @@ -287,7 +286,7 @@ TEST_F(SortedDropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; @@ -300,14 +299,14 @@ TEST_F(SortedDropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(SortedDropDuplicate, WithNull) +TEST_F(SortedDropDuplicates, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep first of duplicate + // Keep the first of duplicate // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; @@ -317,7 +316,7 @@ TEST_F(SortedDropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // Keep last of duplicate + // Keep the last of duplicate cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; @@ -325,7 +324,7 @@ TEST_F(SortedDropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - // Keep unique + // Keep no duplicate rows cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 
8}, {1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; @@ -334,10 +333,10 @@ TEST_F(SortedDropDuplicate, WithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -struct UnorderedDropDuplicate : public cudf::test::BaseFixture { +struct UnorderedDropDuplicates : public cudf::test::BaseFixture { }; -TEST_F(UnorderedDropDuplicate, NonNullTable) +TEST_F(UnorderedDropDuplicates, NonNullTable) { cudf::test::fixed_width_column_wrapper col1{{6, 6, 3, 5, 8, 5}}; cudf::test::fixed_width_column_wrapper col2{{6, 6, 3, 4, 9, 4}}; @@ -361,7 +360,7 @@ TEST_F(UnorderedDropDuplicate, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, sorted_result->view()); } -TEST_F(UnorderedDropDuplicate, WithNull) +TEST_F(UnorderedDropDuplicates, WithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 4, 1, 8, 1}, {1, 0, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 760f52faf45..7b6037a925e 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -13,8 +13,8 @@ from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - sort_and_drop_duplicates as cpp_drop_duplicates, - unordered_distinct_count as cpp_distinct_count, + sort_and_drop_duplicates as cpp_sort_and_drop_duplicates, + unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -149,7 +149,7 @@ def drop_duplicates(columns: list, with nogil: c_result = move( - cpp_drop_duplicates( + cpp_sort_and_drop_duplicates( source_table_view, cpp_keys, cpp_keep_option, @@ -190,7 +190,7 @@ def 
distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): cdef column_view source_column_view = source_column.view() with nogil: - count = cpp_distinct_count( + count = cpp_unordered_distinct_count( source_column_view, cpp_null_handling, cpp_nan_handling From 3af4fd0bd253e578e0a73577de6d48fed3880199 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 19 Jan 2022 17:37:44 -0500 Subject: [PATCH 30/50] Address more review comments --- cpp/include/cudf/stream_compaction.hpp | 12 ++-- cpp/src/stream_compaction/distinct_count.cu | 56 +++++++------------ cpp/src/stream_compaction/drop_duplicates.cu | 8 ++- .../distinct_count_tests.cpp | 5 +- 4 files changed, 32 insertions(+), 49 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 3fb4a5f4df4..a1384e59bc5 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -223,7 +223,7 @@ enum class duplicate_keep_option { * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if `input.num_rows() != keys.size()`. + * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` @@ -254,7 +254,7 @@ std::unique_ptr
drop_duplicates( * - KEEP_LAST: only the last of a sequence of duplicate rows is copied * - KEEP_NONE: no duplicate rows are copied * - * @throws cudf::logic_error if `input.num_rows() != keys.size()`. + * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. * * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` @@ -322,8 +322,8 @@ cudf::size_type distinct_count(column_view const& input, * @brief Count the number of consecutive groups of equivalent elements in a table. * * @param[in] input Table whose number of distinct consecutive groups will be counted - * @param[in] nulls_equal flag to denote if null elements should be considered equal - * nulls are not equal if null_equality::UNEQUAL + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL. * * @return number of distinct consecutive groups in the table */ @@ -358,8 +358,8 @@ cudf::size_type unordered_distinct_count(column_view const& input, * @brief Count the unique rows in a table. * * @param[in] input Table whose unique rows will be counted - * @param[in] nulls_equal flag to denote if null elements should be considered equal - * nulls are not equal if null_equality::UNEQUAL + * @param[in] nulls_equal flag to denote if null elements should be considered equal. + * nulls are not equal if null_equality::UNEQUAL. * * @return number of unique rows in the table */ diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 77efc7429f6..7514732933d 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -119,33 +119,13 @@ struct has_nans { * the row `index` of `column_device_view` is `NaN`. */ struct check_nan { - /** - * @brief Checks if the row `index` of `input` is `NaN`. 
- * - * @note This will be applicable only for floating point type columns. - * - * @param[in] input The `column_device_view` which will be checked for `NaN` - * @param[in] index The index at which the `NaN` needs to be checked in `input` - * - * @returns bool true if value at `index` is `NaN`, else false - */ + // Check if it's `NaN` for floating point type columns template ::value>* = nullptr> __device__ bool operator()(column_device_view const& input, size_type index) { return std::isnan(input.data()[index]); } - /** - * @brief Checks if the row `index` of `input` is `NaN`. - * - * @note This will be applicable for non-floating point type columns. And - * non-floating point columns can never have `NaN`, so it will always return - * false. - * - * @param[in] input The `column_device_view` which will be checked for `NaN` - * @param[in] index The index at which the `NaN` needs to be checked in `input` - * - * @returns bool true if value at `index` is `NaN`, else false - */ + // Non-floating point type columns can never have `NaN`, so it will always return false. template ::value>* = nullptr> __device__ bool operator()(column_device_view const&, size_type) { @@ -184,23 +164,25 @@ cudf::size_type unordered_distinct_count(table_view const& keys, compaction_hash hash_key{has_null, *table_ptr}; row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); + auto iter = cudf::detail::make_counting_transform_iterator(0, [] __device__(size_type i) { + // TODO: cuco::make_pair currently requires rvalue references. We + // create a copy to avoid double-move invoking undefined behavior. 
+ auto ii = i; + return cuco::make_pair(std::move(i), std::move(ii)); + }); - auto const count = [&]() { - if (nulls_equal == null_equality::EQUAL and has_null) { - thrust::counting_iterator stencil(0); - auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); - row_validity pred{static_cast(row_bitmask.data())}; + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_null) { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast(row_bitmask.data())}; - // when nulls are equal, insert non-null rows only - key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); - return key_map.get_size() + static_cast((null_count > 0) ? 1 : 0); - } - key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - return key_map.get_size(); - }(); - return static_cast(count); + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast((null_count > 0) ? 1 : 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); } cudf::size_type distinct_count(column_view const& input, diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 6c714e41418..d740140b928 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -198,8 +198,12 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, compaction_hash hash_key{has_null, *table_ptr}; row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [] __device__(size_type i) { return cuco::make_pair(std::move(i), std::move(i)); }); + auto iter = cudf::detail::make_counting_transform_iterator(0, [] __device__(size_type i) { + // TODO: cuco::make_pair currently requires rvalue references. We + // create a copy to avoid double-move invoking undefined behavior. + auto ii = i; + return cuco::make_pair(std::move(i), std::move(ii)); + }); // insert unique indices into the map. key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index f92c711c920..fe9cc6021fb 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -54,8 +54,7 @@ TYPED_TEST(DistinctCountCommon, NoNull) expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); - std::vector input_data; - std::copy(input.begin(), input.end(), std::back_inserter(input_data)); + std::vector input_data(input.begin(), input.end()); auto const new_end = std::unique(input_data.begin(), input_data.end()); auto const gold_ordered = new_end - input_data.begin(); EXPECT_EQ(gold_ordered, @@ -277,8 +276,6 @@ TEST_F(DistinctCount, EmptyColumnedTable) EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::EQUAL)); EXPECT_EQ(0, cudf::unordered_distinct_count(input, null_equality::UNEQUAL)); - EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::EQUAL)); - EXPECT_EQ(0, cudf::unordered_distinct_count(cudf::table_view{}, null_equality::UNEQUAL)); } TEST_F(DistinctCount, TableMixedTypes) From a58751122857bc8724fc7680f40d9dba3b9f6b6c Mon Sep 17 00:00:00 
2001 From: Yunsong Wang Date: Wed, 19 Jan 2022 17:58:50 -0500 Subject: [PATCH 31/50] Split tests --- .../drop_duplicates_tests.cpp | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index c063acf57ce..fd2e17009b4 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -172,7 +172,7 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); } -TEST_F(DropDuplicates, WithNull) +TEST_F(DropDuplicates, KeepFirstWithNull) { cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, @@ -180,7 +180,6 @@ TEST_F(DropDuplicates, WithNull) cudf::table_view input{{col, key}}; std::vector keys{1}; - // Keep the first of duplicate // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_first_equal{{5, 3, 5, 8, 1}, {1, 1, 1, 1, 1}}; @@ -202,8 +201,16 @@ TEST_F(DropDuplicates, WithNull) drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_unequal, got_first_unequal->view()); +} + +TEST_F(DropDuplicates, KeepLastWithNull) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - // Keep the last of duplicate // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_last_equal{{4, 2, 5, 8, 1}, {0, 1, 1, 1, 1}}; @@ -225,8 +232,16 @@ TEST_F(DropDuplicates, WithNull) drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_unequal, 
got_last_unequal->view()); +} + +TEST_F(DropDuplicates, KeepNoneWithNull) +{ + cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 20, 19, 21, 19}, + {1, 1, 0, 0, 1, 1, 1}}; + cudf::table_view input{{col, key}}; + std::vector keys{1}; - // Keep no duplicate rows // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}}; @@ -307,7 +322,6 @@ TEST_F(SortedDropDuplicates, WithNull) std::vector keys{1}; // Keep the first of duplicate - // nulls are equal cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; From b062eb5c8ec0c031715618619163e76afa36721f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 19 Jan 2022 18:12:40 -0500 Subject: [PATCH 32/50] Use null masks in tests --- .../distinct_count_tests.cpp | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index fe9cc6021fb..f8542598067 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -34,6 +34,9 @@ using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; +constexpr int32_t XXX{70}; // Mark for null elements +constexpr int32_t YYY{3}; // Mark for null elements + template struct DistinctCountCommon : public cudf::test::BaseFixture { }; @@ -49,11 +52,13 @@ TYPED_TEST(DistinctCountCommon, NoNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); + // explicit instantiation to one particular type (`double`) to reduce build time cudf::size_type 
const expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + // explicit instantiation to one particular type (`double`) to reduce build time std::vector input_data(input.begin(), input.end()); auto const new_end = std::unique(input_data.begin(), input_data.end()); auto const gold_ordered = new_end - input_data.begin(); @@ -98,13 +103,14 @@ TEST_F(DistinctCount, WithNull) { using T = int32_t; - // Considering 70 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 70, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; + std::vector input = {1, 3, 3, XXX, 31, 1, 8, 2, 0, XXX, XXX, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); + // explicit instantiation to one particular type (`double`) to reduce build time cudf::size_type const expected = std::set(input.begin(), input.end()).size(); EXPECT_EQ( expected, @@ -120,15 +126,15 @@ TEST_F(DistinctCount, IgnoringNull) { using T = int32_t; - // Considering 70 and 3 as null - std::vector input = {1, 3, 3, 70, 31, 1, 8, 2, 0, 70, 1, 70, 10, 40, 31, 42, 0, 42, 8, 5, 70}; + std::vector input = {1, YYY, YYY, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, 42, 0, 42, 8, 5, XXX}; std::vector valid = {1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); cudf::size_type const expected = std::set(input.begin(), input.end()).size(); - // Removing 2 from expected to remove count for 70 and 3 + // Removing 2 from expected to remove count for `XXX` and `YYY` EXPECT_EQ( expected - 2, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); @@ -143,8 +149,8 @@ TEST_F(DistinctCount, WithNansAndNull) { using T = float; - 
std::vector input = {1, 3, NAN, 70, 31, 1, 8, 2, 0, 70, 1, - 70, 10, 40, 31, NAN, 0, NAN, 8, 5, 70}; + std::vector input = {1, 3, NAN, XXX, 31, 1, 8, 2, 0, XXX, 1, + XXX, 10, 40, 31, NAN, 0, NAN, 8, 5, XXX}; std::vector valid = {1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}; @@ -199,7 +205,7 @@ TEST_F(DistinctCount, NansAsNullWithNull) { using T = float; - std::vector input = {1, 3, NAN, 70, 31}; + std::vector input = {1, 3, NAN, XXX, 31}; std::vector valid = {1, 1, 1, 0, 1}; cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; @@ -216,7 +222,7 @@ TEST_F(DistinctCount, NansAsNullWithIgnoreNull) { using T = float; - std::vector input = {1, 3, NAN, 70, 31}; + std::vector input = {1, 3, NAN, XXX, 31}; std::vector valid = {1, 1, 1, 0, 1}; cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; From a5f881ff4ae4e40efc9ba2f03cdc9e2f9e4a1304 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 19 Jan 2022 18:29:49 -0500 Subject: [PATCH 33/50] Split benchmarks --- .../drop_duplicates_nvbench.cpp | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp index 8f2ec2bfb2f..01bd83c78ab 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp @@ -73,16 +73,12 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); -template -void nvbench_compaction( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +void nvbench_sort_and_drop_duplicates(nvbench::state& state, + nvbench::type_list>) { - if constexpr ((not std::is_same_v and - Keep != cudf::duplicate_keep_option::KEEP_FIRST and - Algo == algorithm::SORT_BASED) or - (Algo == algorithm::HASH_BASED and 
- Keep != cudf::duplicate_keep_option::KEEP_FIRST)) { + if constexpr (not std::is_same_v and + Keep != cudf::duplicate_keep_option::KEEP_FIRST) { state.skip("Skip unwanted benchmarks."); } @@ -102,29 +98,46 @@ void nvbench_compaction( state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto const result = [&]() { - if constexpr (Algo == algorithm::HASH_BASED) { - return cudf::detail::unordered_drop_duplicates( - input_table, {0}, cudf::null_equality::EQUAL, stream_view); - } else { - return cudf::detail::sort_and_drop_duplicates(input_table, - {0}, - Keep, - cudf::null_equality::EQUAL, - cudf::null_order::BEFORE, - stream_view); - } - }(); + auto result = cudf::detail::sort_and_drop_duplicates( + input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); + }); +} + +template +void nvbench_unordered_drop_duplicates(nvbench::state& state, nvbench::type_list) +{ + cudf::rmm_pool_raii pool_raii; + + auto const num_rows = state.get_int64("NumRows"); + + cudf::test::UniformRandomGenerator rand_gen(0, 100); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&rand_gen](auto row) { return rand_gen.generate(); }); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? 
false : true; }); + cudf::test::fixed_width_column_wrapper values(elements, elements + num_rows, valids); + + auto input_column = cudf::column_view(values); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::unordered_drop_duplicates( + input_table, {0}, cudf::null_equality::EQUAL, stream_view); }); } using data_type = nvbench::type_list; -using algo = nvbench::enum_type_list; using keep_option = nvbench::enum_type_list; -NVBENCH_BENCH_TYPES(nvbench_compaction, NVBENCH_TYPE_AXES(data_type, algo, keep_option)) - .set_name("drop_duplicates") - .set_type_axes_names({"Type", "Algorithm", "KeepOption"}) +NVBENCH_BENCH_TYPES(nvbench_sort_and_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("sort_and_drop_duplicates") + .set_type_axes_names({"Type", "KeepOption"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_unordered_drop_duplicates, NVBENCH_TYPE_AXES(data_type)) + .set_name("unordered_drop_duplicates") + .set_type_axes_names({"Type"}) .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); From df36e7739458ad0a42782bce4bc00f5c6f9e0aee Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 20 Jan 2022 19:02:40 -0500 Subject: [PATCH 34/50] Fix a bug + update tests --- cpp/src/stream_compaction/distinct_count.cu | 23 +++++--- .../distinct_count_tests.cpp | 55 +++++++++++++++++++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 7514732933d..2599b2c5e7d 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -34,6 +34,7 @@ #include #include +#include #include namespace cudf { @@ -88,11 +89,10 @@ struct has_nans { { auto 
input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - check_for_nan(device_view)); - return count > 0; + return thrust::any_of(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + check_for_nan(device_view)); } /** @@ -239,19 +239,24 @@ cudf::size_type unordered_distinct_count(column_view const& input, // only get affected if these two conditions are true. NaN will only be // be an extra if nan_handling was NAN_IS_NULL and input also had null, which // will increase the count by 1. - if (has_null and nan_handling == nan_policy::NAN_IS_NULL) { + if (nan_handling == nan_policy::NAN_IS_NULL) { has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); } auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and has_null) --count; + if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and has_null) { --count; } + if (not has_null and has_nan and null_handling == null_policy::EXCLUDE and + nan_handling == nan_policy::NAN_IS_NULL) { + --count; + } - if (null_handling == null_policy::EXCLUDE and has_null) + if (null_handling == null_policy::EXCLUDE and has_null) { return --count; - else + } else { return count; + } } } // namespace detail diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index f8542598067..f1ae4da68e8 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -165,6 +165,17 @@ TEST_F(DistinctCount, WithNansAndNull) auto const gold_ordered = new_end - input.begin(); EXPECT_EQ(gold_ordered, 
cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 2; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansOnly) @@ -182,6 +193,17 @@ TEST_F(DistinctCount, WithNansOnly) cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, NansAsNullWithNoNull) @@ -199,6 +221,17 @@ TEST_F(DistinctCount, NansAsNullWithNoNull) cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 1; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, 
NansAsNullWithNull) @@ -216,6 +249,17 @@ TEST_F(DistinctCount, NansAsNullWithNull) cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, XXX}; + valid = {1, 1, 0}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_null = 1; + EXPECT_EQ( + expected_all_null, + cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_null, + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, NansAsNullWithIgnoreNull) @@ -233,6 +277,17 @@ TEST_F(DistinctCount, NansAsNullWithIgnoreNull) cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); EXPECT_EQ(expected, cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + + input = {NAN, NAN, NAN}; + valid = {1, 1, 1}; + input_col = cudf::test::fixed_width_column_wrapper{input.begin(), input.end(), valid.begin()}; + + constexpr auto expected_all_nan = 0; + EXPECT_EQ( + expected_all_nan, + cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); + EXPECT_EQ(expected_all_nan, + cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } TEST_F(DistinctCount, EmptyColumn) From 20ed6eacdb99a72dc08284af852c48093766a20a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 20 Jan 2022 19:07:16 -0500 Subject: [PATCH 35/50] Update docs --- cpp/include/cudf/stream_compaction.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index a1384e59bc5..4bd40f77899 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -214,10 +214,9 @@ enum class 
duplicate_keep_option { }; /** - * @brief Eliminates all except the row specified by `keep` from every consecutive group of - * equivalent rows. + * @brief Eliminates all except one specific row from every consecutive group of equivalent rows. * - * Given an `input` table_view, one row from a group of equivalent elements is copied to + * Given an `input` table_view, one specific row from a group of equivalent elements is copied to * output table depending on the value of @p keep: * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied * - KEEP_LAST: only the last of a sequence of duplicate rows is copied From ecc1d7e29966537d3ce5690c4963b149bf114436 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 20 Jan 2022 21:02:17 -0500 Subject: [PATCH 36/50] Add should_check_nan predicate to avoid unnecessary type-dispatching --- cpp/src/stream_compaction/distinct_count.cu | 62 +++++++++++++++------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 2599b2c5e7d..1ce914ad21d 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -84,7 +84,7 @@ struct has_nans { * * @returns bool true if `input` has `NaN` else false */ - template ::value>* = nullptr> + template >* = nullptr> bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); @@ -107,27 +107,46 @@ struct has_nans { * * @returns bool Always false as non-floating point columns can't have `NaN` */ - template ::value>* = nullptr> + template >* = nullptr> bool operator()(column_view const&, rmm::cuda_stream_view) { return false; } }; +/** + * @brief A functor to be used along with type_dispatcher to determine if the device + * `check_nan` functor should be used. 
+ */ +struct check_nan_predicate { + // Check `NaN` for floating point type columns + template >* = nullptr> + bool operator()() + { + return true; + } + // Non-floating point type columns can never have `NaN`, so it will always return false. + template >* = nullptr> + bool operator()() + { + return false; + } +}; + /** * @brief A functor to be used along with device type_dispatcher to check if * the row `index` of `column_device_view` is `NaN`. */ struct check_nan { // Check if it's `NaN` for floating point type columns - template ::value>* = nullptr> - __device__ bool operator()(column_device_view const& input, size_type index) + template >* = nullptr> + __device__ __forceinline__ bool operator()(column_device_view const& input, size_type index) { return std::isnan(input.data()[index]); } // Non-floating point type columns can never have `NaN`, so it will always return false. - template ::value>* = nullptr> - __device__ bool operator()(column_device_view const&, size_type) + template >* = nullptr> + __device__ __forceinline__ bool operator()(column_device_view const&, size_type) { return false; } @@ -194,12 +213,13 @@ cudf::size_type distinct_count(column_view const& input, if (num_rows == 0 or num_rows == input.null_count()) { return 0; } - auto const count_nulls = null_handling == null_policy::INCLUDE; - auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; - auto input_device_view = cudf::column_device_view::create(input, stream); - auto device_view = *input_device_view; - auto input_table_view = table_view{{input}}; - auto table_ptr = cudf::table_device_view::create(input_table_view, stream); + auto const count_nulls = null_handling == null_policy::INCLUDE; + auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; + auto const should_check_nan = cudf::type_dispatcher(input.type(), check_nan_predicate{}); + auto input_device_view = cudf::column_device_view::create(input, stream); + auto device_view = *input_device_view; + auto 
input_table_view = table_view{{input}}; + auto table_ptr = cudf::table_device_view::create(input_table_view, stream); row_equality_comparator comp(nullate::DYNAMIC{cudf::has_nulls(input_table_view)}, *table_ptr, *table_ptr, @@ -209,16 +229,22 @@ cudf::size_type distinct_count(column_view const& input, rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(num_rows), - [count_nulls, nan_is_null, device_view, comp] __device__(cudf::size_type i) { - bool is_nan = cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { + bool is_nan = false; + bool is_null = device_view.is_null(i); + if (nan_is_null and should_check_nan) { + is_nan = cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); + } if (count_nulls) { - if (nan_is_null and (is_nan or device_view.is_null(i))) { - bool prev_is_nan = - cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + if (nan_is_null and (is_nan or is_null)) { + auto prev_is_nan = + should_check_nan + ? 
cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) + : false; return (i == 0 or not(device_view.is_null(i - 1) or prev_is_nan)); } } else { - if (device_view.is_null(i) or (nan_is_null and is_nan)) { return false; } + if (is_null or (nan_is_null and is_nan)) { return false; } } return (i == 0 or not comp(i, i - 1)); }); From a1514436c74ff9cb34e852fb8aa4816c87fdc451 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 21 Jan 2022 13:52:01 -0500 Subject: [PATCH 37/50] Rename benchmark according to benchmarking guide --- cpp/benchmarks/CMakeLists.txt | 2 +- ...rop_duplicates_nvbench.cpp => drop_duplicates_benchmark.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/benchmarks/stream_compaction/{drop_duplicates_nvbench.cpp => drop_duplicates_benchmark.cpp} (100%) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 0e36b3001f2..684420e83de 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -123,7 +123,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_ben # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- -ConfigureNVBench(STREAM_COMPACTION_NVBENCH stream_compaction/drop_duplicates_nvbench.cpp) +ConfigureNVBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) # ################################################################################################## # * join benchmark -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp similarity index 100% rename from cpp/benchmarks/stream_compaction/drop_duplicates_nvbench.cpp rename to cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp From 
024d7e026b292d9a119ece71b12e21edf34ecd98 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 24 Jan 2022 11:51:12 -0500 Subject: [PATCH 38/50] Remove std::unique-like drop_duplicates --- .../drop_duplicates_benchmark.cpp | 10 +- cpp/include/cudf/detail/stream_compaction.hpp | 13 -- cpp/include/cudf/stream_compaction.hpp | 30 +--- cpp/src/stream_compaction/drop_duplicates.cu | 57 +----- .../drop_duplicates_tests.cpp | 170 ++++-------------- java/src/main/native/src/TableJni.cpp | 2 +- .../cudf/cudf/_lib/cpp/stream_compaction.pxd | 2 +- python/cudf/cudf/_lib/stream_compaction.pyx | 4 +- 8 files changed, 45 insertions(+), 243 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 01bd83c78ab..511c465d99d 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -74,8 +74,8 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); template -void nvbench_sort_and_drop_duplicates(nvbench::state& state, - nvbench::type_list>) +void nvbench_drop_duplicates(nvbench::state& state, + nvbench::type_list>) { if constexpr (not std::is_same_v and Keep != cudf::duplicate_keep_option::KEEP_FIRST) { @@ -98,7 +98,7 @@ void nvbench_sort_and_drop_duplicates(nvbench::state& state, state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::sort_and_drop_duplicates( + auto result = cudf::detail::drop_duplicates( input_table, {0}, Keep, cudf::null_equality::EQUAL, cudf::null_order::BEFORE, stream_view); }); } @@ -132,8 +132,8 @@ using keep_option = nvbench::enum_type_list; -NVBENCH_BENCH_TYPES(nvbench_sort_and_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) - .set_name("sort_and_drop_duplicates") 
+NVBENCH_BENCH_TYPES(nvbench_drop_duplicates, NVBENCH_TYPE_AXES(data_type, keep_option)) + .set_name("drop_duplicates") .set_type_axes_names({"Type", "KeepOption"}) .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 8a42b2b54f3..3d065556827 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -67,19 +67,6 @@ std::unique_ptr
apply_boolean_mask( * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
drop_duplicates( - table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @copydoc cudf::sort_and_drop_duplicates - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr
sort_and_drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 4bd40f77899..94039d81f31 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -213,34 +213,6 @@ enum class duplicate_keep_option { KEEP_NONE ///< Keeps only unique elements }; -/** - * @brief Eliminates all except one specific row from every consecutive group of equivalent rows. - * - * Given an `input` table_view, one specific row from a group of equivalent elements is copied to - * output table depending on the value of @p keep: - * - KEEP_FIRST: only the first of a sequence of duplicate rows is copied - * - KEEP_LAST: only the last of a sequence of duplicate rows is copied - * - KEEP_NONE: no duplicate rows are copied - * - * @throws cudf::logic_error if the `keys` column indices are out of bounds in the `input` table. - * - * @param[in] input input table_view to copy only unique rows - * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep first row, last row, or no rows of the found duplicates - * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not - * equal if null_equality::UNEQUAL - * @param[in] mr Device memory resource used to allocate the returned table's device - * memory - * - * @return Table with unique rows from each sequence of equivalent rows as specified by `keep`. - */ -std::unique_ptr
drop_duplicates( - table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Create a new table without duplicate rows. * @@ -266,7 +238,7 @@ std::unique_ptr
drop_duplicates( * * @return Table with sorted unique rows as specified by `keep`. */ -std::unique_ptr
sort_and_drop_duplicates( +std::unique_ptr
drop_duplicates( table_view const& input, std::vector const& keys, duplicate_keep_option keep, diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index d740140b928..14b25a20d08 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -105,50 +105,9 @@ std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal, + null_order null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) -{ - auto const num_rows = input.num_rows(); - if (num_rows == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } - - auto unique_indices = - make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream, mr); - auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); - auto keys_view = input.select(keys); - auto keys_device_view = cudf::table_device_view::create(keys_view, stream); - auto row_equal = row_equality_comparator(nullate::DYNAMIC{cudf::has_nulls(keys_view)}, - *keys_device_view, - *keys_device_view, - nulls_equal); - - // get indices of unique rows - auto result_end = unique_copy(thrust::counting_iterator(0), - thrust::counting_iterator(num_rows), - mutable_view->begin(), - row_equal, - keep, - stream); - auto indices_view = - cudf::detail::slice(column_view(*unique_indices), - 0, - thrust::distance(mutable_view->begin(), result_end)); - - // gather unique rows and return - return detail::gather(input, - indices_view, - out_of_bounds_policy::DONT_CHECK, - detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); -} - -std::unique_ptr
sort_and_drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); @@ -240,21 +199,11 @@ std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option const keep, null_equality nulls_equal, + null_order null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); -} - -std::unique_ptr
sort_and_drop_duplicates(table_view const& input, - std::vector const& keys, - duplicate_keep_option const keep, - null_equality nulls_equal, - null_order null_precedence, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::sort_and_drop_duplicates( + return detail::drop_duplicates( input, keys, keep, nulls_equal, null_precedence, rmm::cuda_stream_default, mr); } diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index fd2e17009b4..052e79222ff 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -44,20 +44,13 @@ TEST_F(DropDuplicatesCommon, StringKeyColumn) {1, 1, 1, 1, 0, 1, 1}}; cudf::table_view input{{col, key_col}}; std::vector keys{1}; - cudf::test::fixed_width_column_wrapper exp_col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::strings_column_wrapper exp_key_col{{"all", "new", "all", "new", "the", "strings"}, - {1, 1, 1, 0, 1, 1}}; - cudf::table_view expected{{exp_col, exp_key_col}}; - - auto got = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got->view()); cudf::test::fixed_width_column_wrapper exp_sort_col{{5, 5, 4, 1, 8}, {1, 1, 0, 1, 1}}; cudf::test::strings_column_wrapper exp_sort_key_col{{"new", "all", "new", "strings", "the"}, {0, 1, 1, 1, 1}}; cudf::table_view expected_sort{{exp_sort_col, exp_sort_key_col}}; - auto got_sort = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); + auto got_sort = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, got_sort->view()); auto got_unordered = unordered_drop_duplicates(input, keys); @@ -77,10 +70,6 @@ TEST_F(DropDuplicatesCommon, EmptyInputTable) drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, 
got->view()); - auto got_sort = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_sort->view()); - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } @@ -94,10 +83,6 @@ TEST_F(DropDuplicatesCommon, NoColumnInputTable) drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); - auto got_sort = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_sort->view()); - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got_unordered->view()); } @@ -113,10 +98,6 @@ TEST_F(DropDuplicatesCommon, EmptyKeys) drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); - auto got_sort = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_sort->view()); - auto got_unordered = unordered_drop_duplicates(input, keys, null_equality::EQUAL); CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got_unordered->view()); } @@ -136,10 +117,10 @@ TEST_F(DropDuplicates, NonNullTable) // Keep the first of duplicate // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 3, 4, 9, 4}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_first{{20, 20, 19, 21, 9}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_first{{19, 20, 20, 9, 21}}; + 
cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; cudf::table_view expected_first{ {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; @@ -148,10 +129,10 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); // Keep the last of duplicate - cudf::test::fixed_width_column_wrapper exp_col1_last{{4, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper exp_col2_last{{5, 3, 4, 9, 4}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_last{{20, 20, 19, 21, 9}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_last{{19, 20, 20, 9, 21}}; + cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; + cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; cudf::table_view expected_last{ {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; @@ -160,10 +141,10 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); // Keep no duplicate rows - cudf::test::fixed_width_column_wrapper exp_col1_unique{{3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper exp_col2_unique{{3, 4, 9, 4}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{20, 19, 21, 9}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{20, 20, 9, 21}}; + cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; + cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; + cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; + 
cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; cudf::table_view expected_unique{ {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; @@ -181,10 +162,9 @@ TEST_F(DropDuplicates, KeepFirstWithNull) std::vector keys{1}; // nulls are equal - cudf::test::fixed_width_column_wrapper exp_col_first_equal{{5, 3, 5, 8, 1}, - {1, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 20, 19, 21, 19}, - {1, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_first_equal{{3, 5, 5, 8}, {1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}}; auto got_first_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); @@ -192,10 +172,10 @@ TEST_F(DropDuplicates, KeepFirstWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view()); // nulls are unequal - cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{5, 3, 2, 5, 8, 1}, - {1, 1, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{ - {20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_first_unequal{{3, 2, 5, 5, 8}, + {1, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_first_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}}; auto got_first_unequal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL); @@ -212,10 +192,9 @@ TEST_F(DropDuplicates, KeepLastWithNull) std::vector keys{1}; // nulls are equal - cudf::test::fixed_width_column_wrapper exp_col_last_equal{{4, 2, 5, 8, 1}, - {0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 20, 19, 21, 19}, - {1, 0, 1, 1, 
1}}; + cudf::test::fixed_width_column_wrapper exp_col_last_equal{{2, 1, 4, 8}, {1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_equal{{20, 19, 20, 21}, + {0, 1, 1, 1}}; cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}}; auto got_last_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL); @@ -223,10 +202,10 @@ TEST_F(DropDuplicates, KeepLastWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view()); // nulls are unequal - cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{4, 3, 2, 5, 8, 1}, - {0, 1, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 20, 19, 21, 19}, - {1, 0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_last_unequal{{3, 2, 1, 4, 8}, + {1, 1, 1, 0, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_last_unequal{{20, 20, 19, 20, 21}, + {0, 0, 1, 1, 1}}; cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}}; auto got_last_unequal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL); @@ -243,8 +222,8 @@ TEST_F(DropDuplicates, KeepNoneWithNull) std::vector keys{1}; // nulls are equal - cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_unique_equal{{8}, {1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_equal{{21}, {1}}; cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}}; auto got_unique_equal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL); @@ -252,10 +231,9 @@ TEST_F(DropDuplicates, KeepNoneWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view()); // nulls are unequal - 
cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 5, 8, 1}, - {1, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 19, 21, 19}, - {0, 0, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_col_unique_unequal{{3, 2, 8}, {1, 1, 1}}; + cudf::test::fixed_width_column_wrapper exp_key_col_unique_unequal{{20, 20, 21}, + {0, 0, 1}}; cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}}; auto got_unique_unequal = drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL); @@ -263,90 +241,6 @@ TEST_F(DropDuplicates, KeepNoneWithNull) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_unequal, got_unique_unequal->view()); } -struct SortedDropDuplicates : public cudf::test::BaseFixture { -}; - -TEST_F(SortedDropDuplicates, NonNullTable) -{ - cudf::test::fixed_width_column_wrapper col1{{5, 4, 3, 5, 8, 5}}; - cudf::test::fixed_width_column_wrapper col2{{4, 5, 3, 4, 9, 4}}; - cudf::test::fixed_width_column_wrapper col1_key{{20, 20, 20, 19, 21, 9}}; - cudf::test::fixed_width_column_wrapper col2_key{{19, 19, 20, 20, 9, 21}}; - - cudf::table_view input{{col1, col2, col1_key, col2_key}}; - std::vector keys{2, 3}; - - // Keep the first of duplicate - // The expected table would be sorted in ascending order with respect to keys - cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_first{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_first{{21, 20, 19, 20, 9}}; - cudf::table_view expected_first{ - {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}}; - - auto got_first = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_FIRST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - - // Keep the last of duplicate - 
cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_last{{21, 20, 19, 20, 9}}; - cudf::table_view expected_last{ - {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}}; - - auto got_last = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep no duplicate rows - cudf::test::fixed_width_column_wrapper exp_col1_unique{{5, 5, 3, 8}}; - cudf::test::fixed_width_column_wrapper exp_col2_unique{{4, 4, 3, 9}}; - cudf::test::fixed_width_column_wrapper exp_col1_key_unique{{9, 19, 20, 21}}; - cudf::test::fixed_width_column_wrapper exp_col2_key_unique{{21, 20, 20, 9}}; - cudf::table_view expected_unique{ - {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}}; - - auto got_unique = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); -} - -TEST_F(SortedDropDuplicates, WithNull) -{ - cudf::test::fixed_width_column_wrapper col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper key{{20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}}; - cudf::table_view input{{col, key}}; - std::vector keys{1}; - - // Keep the first of duplicate - cudf::test::fixed_width_column_wrapper exp_col_first{{4, 5, 5, 8}, {0, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_first{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_first{{exp_col_first, exp_key_col_first}}; - auto got_first = sort_and_drop_duplicates( - input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - - // Keep the last of duplicate 
- cudf::test::fixed_width_column_wrapper exp_col_last{{3, 1, 5, 8}, {1, 1, 1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_last{{20, 19, 20, 21}, {0, 1, 1, 1}}; - cudf::table_view expected_last{{exp_col_last, exp_key_col_last}}; - auto got_last = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_LAST); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, got_last->view()); - - // Keep no duplicate rows - cudf::test::fixed_width_column_wrapper exp_col_unique{{5, 8}, {1, 1}}; - cudf::test::fixed_width_column_wrapper exp_key_col_unique{{20, 21}, {1, 1}}; - cudf::table_view expected_unique{{exp_col_unique, exp_key_col_unique}}; - auto got_unique = sort_and_drop_duplicates(input, keys, cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, got_unique->view()); -} - struct UnorderedDropDuplicates : public cudf::test::BaseFixture { }; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 15f14d745e2..22b089fa93a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -3006,7 +3006,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( auto const keys_indices = std::vector(native_keys_indices.begin(), native_keys_indices.end()); - auto result = cudf::sort_and_drop_duplicates( + auto result = cudf::drop_duplicates( *input, keys_indices, keep_first ? 
cudf::duplicate_keep_option::KEEP_FIRST : cudf::duplicate_keep_option::KEEP_LAST, diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 1a7ff14f4a1..897b61f8001 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -33,7 +33,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] sort_and_drop_duplicates( + cdef unique_ptr[table] drop_duplicates( table_view source_table, vector[size_type] keys, duplicate_keep_option keep, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 7b6037a925e..9506cf90f99 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -13,7 +13,7 @@ from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - sort_and_drop_duplicates as cpp_sort_and_drop_duplicates, + drop_duplicates as cpp_drop_duplicates, unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table @@ -149,7 +149,7 @@ def drop_duplicates(columns: list, with nogil: c_result = move( - cpp_sort_and_drop_duplicates( + cpp_drop_duplicates( source_table_view, cpp_keys, cpp_keep_option, From b6c1634ec55bc01ffb3b2f2ee97a4083ec214ad3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 24 Jan 2022 11:58:36 -0500 Subject: [PATCH 39/50] Style fixing --- python/cudf/cudf/_lib/stream_compaction.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 9506cf90f99..c4f885382f3 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -11,9 +11,9 @@ from cudf._lib.column cimport Column from 
cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, + drop_duplicates as cpp_drop_duplicates, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - drop_duplicates as cpp_drop_duplicates, unordered_distinct_count as cpp_unordered_distinct_count, ) from cudf._lib.cpp.table.table cimport table From 3ad0f76fd9b4f419fd105bd77e30835daa3f5009 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 24 Jan 2022 13:24:29 -0500 Subject: [PATCH 40/50] Fix test failures: sort the output --- cpp/src/dictionary/detail/concatenate.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index c9e05877ee8..fcecf370fbe 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -216,11 +217,13 @@ std::unique_ptr concatenate(host_span columns, // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column - auto table_keys = - cudf::detail::unordered_drop_duplicates( - table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr) - ->release(); - std::unique_ptr keys_column(std::move(table_keys.front())); + auto table_keys = cudf::detail::unordered_drop_duplicates( + table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); + std::vector column_order{order::ASCENDING}; + std::vector null_precedence{null_order::BEFORE}; + auto sorted_keys = + cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices std::vector indices_views(columns.size()); From 58f6cb65a76ad1ec3dcf3f3986e116653af39826 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 24 Jan 
2022 13:28:21 -0500 Subject: [PATCH 41/50] Minor cleanups --- .../drop_duplicates_benchmark.cpp | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 511c465d99d..b540db56b05 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -27,29 +27,6 @@ #include #include -enum class algorithm { SORT_BASED, HASH_BASED }; - -// mandatory for enum types -NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - // Enum type: - algorithm, - // Callable to generate input strings: - // Short identifier used for tables, command-line args, etc. - // Used when context is available to figure out the enum type. - [](algorithm algo) { - switch (algo) { - case algorithm::SORT_BASED: return "SORT_BASED"; - case algorithm::HASH_BASED: return "HASH_BASED"; - default: return "ERROR"; - } - }, - // Callable to generate descriptions: - // If non-empty, these are used in `--list` to describe values. - // Used when context may not be available to figure out the type from the - // input string. - // Just use `[](auto) { return std::string{}; }` if you don't want these. 
- [](auto) { return std::string{}; }) - NVBENCH_DECLARE_ENUM_TYPE_STRINGS( // Enum type: cudf::duplicate_keep_option, From e381815c5e86a78cf4b51740c479180bd0be70f7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 24 Jan 2022 13:34:04 -0500 Subject: [PATCH 42/50] Minor cleanup --- cpp/src/dictionary/detail/concatenate.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fcecf370fbe..301338fa1a8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -219,10 +219,12 @@ std::unique_ptr concatenate(host_span columns, // this becomes the keys child for the output dictionary column auto table_keys = cudf::detail::unordered_drop_duplicates( table_view{{all_keys->view()}}, std::vector{0}, null_equality::EQUAL, stream, mr); - std::vector column_order{order::ASCENDING}; - std::vector null_precedence{null_order::BEFORE}; - auto sorted_keys = - cudf::detail::sort(table_keys->view(), column_order, null_precedence, stream, mr)->release(); + auto sorted_keys = cudf::detail::sort(table_keys->view(), + std::vector{order::ASCENDING}, + std::vector{null_order::BEFORE}, + stream, + mr) + ->release(); std::unique_ptr keys_column(std::move(sorted_keys.front())); // next, concatenate the indices From fa796aa38486436d3a5be8e2e6db7eef2067c83c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 24 Jan 2022 15:12:26 -0500 Subject: [PATCH 43/50] Address review comments --- .../drop_duplicates_benchmark.cpp | 10 +++------- cpp/src/stream_compaction/drop_duplicates.cu | 14 +++++++------- .../stream_compaction/distinct_count_tests.cpp | 7 ++++--- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index b540db56b05..317db92ae8b 100644 --- 
a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,12 +27,12 @@ #include #include +// necessary for custom enum types +// see: https://github.com/NVIDIA/nvbench/blob/main/examples/enums.cu NVBENCH_DECLARE_ENUM_TYPE_STRINGS( // Enum type: cudf::duplicate_keep_option, // Callable to generate input strings: - // Short identifier used for tables, command-line args, etc. - // Used when context is available to figure out the enum type. [](cudf::duplicate_keep_option option) { switch (option) { case cudf::duplicate_keep_option::KEEP_FIRST: return "KEEP_FIRST"; @@ -42,10 +42,6 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( } }, // Callable to generate descriptions: - // If non-empty, these are used in `--list` to describe values. - // Used when context may not be available to figure out the type from the - // input string. - // Just use `[](auto) { return std::string{}; }` if you don't want these. 
[](auto) { return std::string{}; }) NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 14b25a20d08..9af8846ba3f 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -88,12 +88,12 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, auto comp = row_equality_comparator( nullate::DYNAMIC{cudf::has_nulls(keys)}, *device_input_table, *device_input_table, nulls_equal); - auto result_end = unique_copy(sorted_indices->view().begin(), - sorted_indices->view().end(), - unique_indices.begin(), - comp, - keep, - stream); + auto result_end = cudf::detail::unique_copy(sorted_indices->view().begin(), + sorted_indices->view().end(), + unique_indices.begin(), + comp, + keep, + stream); return cudf::detail::slice(column_view(unique_indices), 0, @@ -174,7 +174,7 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, auto const output_size{key_map.get_size()}; // write unique indices to a numeric column - auto unique_indices = make_numeric_column( + auto unique_indices = cudf::make_numeric_column( data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); auto mutable_view = mutable_column_device_view::create(*unique_indices, stream); thrust::copy_if(rmm::exec_policy(stream), diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index f1ae4da68e8..168db5c8ce9 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -139,10 +139,11 @@ TEST_F(DistinctCount, IgnoringNull) expected - 2, cudf::unordered_distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); - auto const new_end = std::unique(input.begin(), input.end()); - auto const gold_ordered = new_end - input.begin() - 5; + auto const new_end = std::unique(input.begin(), input.end()); + // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows + auto const gold_ordered = new_end - input.begin() - 1; EXPECT_EQ(gold_ordered, - cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); + cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } TEST_F(DistinctCount, WithNansAndNull) From 118468ee1612f55c530ae0ad487b3f6a7dcea836 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 27 Jan 2022 15:52:30 -0500 Subject: [PATCH 44/50] Address review comments --- cpp/src/stream_compaction/distinct_count.cu | 72 ++++++------------- cpp/src/stream_compaction/drop_duplicates.cu | 9 +-- .../stream_compaction_common.cuh | 10 +-- cpp/src/transform/encode.cu | 3 + .../distinct_count_tests.cpp | 30 ++++---- 5 files changed, 52 insertions(+), 72 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu 
index 1ce914ad21d..45640649324 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -14,8 +14,8 @@ * limitations under the License. */ -#include -#include +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include @@ -35,6 +35,11 @@ #include #include #include + +#include +#include +#include +#include #include namespace cudf { @@ -114,25 +119,6 @@ struct has_nans { } }; -/** - * @brief A functor to be used along with type_dispatcher to determine if the device - * `check_nan` functor should be used. - */ -struct check_nan_predicate { - // Check `NaN` for floating point type columns - template >* = nullptr> - bool operator()() - { - return true; - } - // Non-floating point type columns can never have `NaN`, so it will always return false. - template >* = nullptr> - bool operator()() - { - return false; - } -}; - /** * @brief A functor to be used along with device type_dispatcher to check if * the row `index` of `column_device_view` is `NaN`. @@ -140,13 +126,13 @@ struct check_nan_predicate { struct check_nan { // Check if it's `NaN` for floating point type columns template >* = nullptr> - __device__ __forceinline__ bool operator()(column_device_view const& input, size_type index) + __device__ inline bool operator()(column_device_view const& input, size_type index) { return std::isnan(input.data()[index]); } // Non-floating point type columns can never have `NaN`, so it will always return false. 
template >* = nullptr> - __device__ __forceinline__ bool operator()(column_device_view const&, size_type) + __device__ inline bool operator()(column_device_view const&, size_type) { return false; } @@ -215,7 +201,7 @@ cudf::size_type distinct_count(column_view const& input, auto const count_nulls = null_handling == null_policy::INCLUDE; auto const nan_is_null = nan_handling == nan_policy::NAN_IS_NULL; - auto const should_check_nan = cudf::type_dispatcher(input.type(), check_nan_predicate{}); + auto const should_check_nan = cudf::is_floating_point(input.type()); auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; auto input_table_view = table_view{{input}}; @@ -230,21 +216,20 @@ cudf::size_type distinct_count(column_view const& input, thrust::counting_iterator(0), thrust::counting_iterator(num_rows), [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { - bool is_nan = false; - bool is_null = device_view.is_null(i); - if (nan_is_null and should_check_nan) { - is_nan = cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); - } + auto const is_null = device_view.is_null(i); + auto const is_nan = nan_is_null and should_check_nan + ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i) + : false; if (count_nulls) { if (nan_is_null and (is_nan or is_null)) { - auto prev_is_nan = + auto const prev_is_nan = should_check_nan ? 
cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) : false; return (i == 0 or not(device_view.is_null(i - 1) or prev_is_nan)); } - } else { - if (is_null or (nan_is_null and is_nan)) { return false; } + } else if (is_null or (nan_is_null and is_nan)) { + return false; } return (i == 0 or not comp(i, i - 1)); }); @@ -257,32 +242,21 @@ cudf::size_type unordered_distinct_count(column_view const& input, { if (0 == input.size() or input.null_count() == input.size()) { return 0; } - auto const has_null = input.has_nulls(); - - bool has_nan = false; // Check for Nans // Checking for nulls in input and flag nan_handling, as the count will // only get affected if these two conditions are true. NaN will only be // be an extra if nan_handling was NAN_IS_NULL and input also had null, which // will increase the count by 1. - if (nan_handling == nan_policy::NAN_IS_NULL) { - has_nan = cudf::type_dispatcher(input.type(), has_nans{}, input, stream); - } + auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); + auto const has_null = input.has_nulls(); auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values - if (nan_handling == nan_policy::NAN_IS_NULL and has_nan and has_null) { --count; } - if (not has_null and has_nan and null_handling == null_policy::EXCLUDE and - nan_handling == nan_policy::NAN_IS_NULL) { - --count; - } - - if (null_handling == null_policy::EXCLUDE and has_null) { - return --count; - } else { - return count; - } + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } + if (null_handling == null_policy::EXCLUDE and has_null) { --count; } + return count; } } // namespace detail diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 9af8846ba3f..7d0000dc346 
100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include -#include -#include +#include "drop_duplicates.cuh" +#include "stream_compaction_common.cuh" +#include "stream_compaction_common.hpp" #include #include @@ -40,6 +40,7 @@ #include #include +#include #include namespace cudf { @@ -139,7 +140,7 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { return empty_like(input); } diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 81ed7dc03f8..8ba9223a1bc 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include "stream_compaction_common.hpp" namespace cudf { namespace detail { @@ -28,7 +28,7 @@ class compaction_hash { public: compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {} - __device__ __forceinline__ auto operator()(size_type i) const noexcept + __device__ inline auto operator()(size_type i) const noexcept { auto hash = _hash(i); return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; @@ -39,13 +39,13 @@ class compaction_hash { }; /** - * @brief Device functor to determine if a row is valid. - */ + * @brief Device functor to determine if a row is valid. 
+ */ class row_validity { public: row_validity(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} - __device__ __inline__ bool operator()(const size_type& i) const noexcept + __device__ inline bool operator()(const size_type& i) const noexcept { return cudf::bit_is_set(_row_bitmask, i); } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 400d1b26757..405c83ab872 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -30,7 +30,10 @@ #include #include +#include #include +#include +#include namespace cudf { namespace detail { diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 168db5c8ce9..78b52db5255 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -53,7 +53,8 @@ TYPED_TEST(DistinctCountCommon, NoNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end()); // explicit instantiation to one particular type (`double`) to reduce build time - cudf::size_type const expected = std::set(input.begin(), input.end()).size(); + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); @@ -61,7 +62,7 @@ TYPED_TEST(DistinctCountCommon, NoNull) // explicit instantiation to one particular type (`double`) to reduce build time std::vector input_data(input.begin(), input.end()); auto const new_end = std::unique(input_data.begin(), input_data.end()); - auto const gold_ordered = new_end - input_data.begin(); + auto const gold_ordered = std::distance(input_data.begin(), new_end); EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } @@ -83,16 +84,14 @@ TYPED_TEST(DistinctCountCommon, TableNoNull) cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), 
input1.end()); cudf::test::fixed_width_column_wrapper input_col2(input2.begin(), input2.end()); + cudf::table_view input_table({input_col1, input_col2}); - std::vector cols{input_col1, input_col2}; - cudf::table_view input_table(cols); - - cudf::size_type const expected = - std::set>(pair_input.begin(), pair_input.end()).size(); + auto const expected = static_cast( + std::set>(pair_input.begin(), pair_input.end()).size()); EXPECT_EQ(expected, cudf::unordered_distinct_count(input_table, null_equality::EQUAL)); auto const new_end = std::unique(pair_input.begin(), pair_input.end()); - auto const gold_ordered = new_end - pair_input.begin(); + auto const gold_ordered = std::distance(pair_input.begin(), new_end); EXPECT_EQ(gold_ordered, cudf::distinct_count(input_table, null_equality::EQUAL)); } @@ -111,13 +110,14 @@ TEST_F(DistinctCount, WithNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); // explicit instantiation to one particular type (`double`) to reduce build time - cudf::size_type const expected = std::set(input.begin(), input.end()).size(); + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); EXPECT_EQ( expected, cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); auto const new_end = std::unique(input.begin(), input.end()); - auto const gold_ordered = new_end - input.begin() - 3; + auto const gold_ordered = std::distance(input.begin(), new_end) - 3; EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_VALID)); } @@ -133,7 +133,8 @@ TEST_F(DistinctCount, IgnoringNull) cudf::test::fixed_width_column_wrapper input_col(input.begin(), input.end(), valid.begin()); - cudf::size_type const expected = std::set(input.begin(), input.end()).size(); + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); // Removing 2 from expected to remove count for `XXX` and `YYY` EXPECT_EQ( 
expected - 2, @@ -141,7 +142,7 @@ TEST_F(DistinctCount, IgnoringNull) auto const new_end = std::unique(input.begin(), input.end()); // -1 since `YYY, YYY, XXX` is in the same group of equivalent rows - auto const gold_ordered = new_end - input.begin() - 1; + auto const gold_ordered = std::distance(input.begin(), new_end) - 1; EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); } @@ -157,13 +158,14 @@ TEST_F(DistinctCount, WithNansAndNull) cudf::test::fixed_width_column_wrapper input_col{input.begin(), input.end(), valid.begin()}; - cudf::size_type const expected = std::set(input.begin(), input.end()).size(); + auto const expected = + static_cast(std::set(input.begin(), input.end()).size()); EXPECT_EQ( expected + 1, // +1 since `NAN` is not in std::set cudf::unordered_distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); auto const new_end = std::unique(input.begin(), input.end()); - auto const gold_ordered = new_end - input.begin(); + auto const gold_ordered = std::distance(input.begin(), new_end); EXPECT_EQ(gold_ordered, cudf::distinct_count(input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID)); From d1535d505c1b8a17ef21fa28f6ba05ad3e5e955f Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 27 Jan 2022 17:31:52 -0500 Subject: [PATCH 45/50] Simply if logic --- cpp/src/stream_compaction/distinct_count.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 45640649324..f3c2e83d2dc 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -220,15 +220,13 @@ cudf::size_type distinct_count(column_view const& input, auto const is_nan = nan_is_null and should_check_nan ? 
cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i) : false; - if (count_nulls) { - if (nan_is_null and (is_nan or is_null)) { - auto const prev_is_nan = - should_check_nan - ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) - : false; - return (i == 0 or not(device_view.is_null(i - 1) or prev_is_nan)); - } - } else if (is_null or (nan_is_null and is_nan)) { + if (count_nulls and nan_is_null and (is_nan or is_null)) { + auto const prev_is_nan = + should_check_nan + ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) + : false; + return (i == 0 or not(device_view.is_null(i - 1) or prev_is_nan)); + } else if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } return (i == 0 or not comp(i, i - 1)); From 0b0d0154e32a82829e03f7844749e52500bd94fd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 27 Jan 2022 17:57:35 -0500 Subject: [PATCH 46/50] Minor updates --- cpp/src/stream_compaction/distinct_count.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index f3c2e83d2dc..41a16c33f33 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -221,15 +221,14 @@ cudf::size_type distinct_count(column_view const& input, ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i) : false; if (count_nulls and nan_is_null and (is_nan or is_null)) { - auto const prev_is_nan = - should_check_nan - ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) - : false; - return (i == 0 or not(device_view.is_null(i - 1) or prev_is_nan)); + return i == 0 or + not(device_view.is_null(i - 1) or should_check_nan + ? 
cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) + : false); } else if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } - return (i == 0 or not comp(i, i - 1)); + return i == 0 or not comp(i, i - 1); }); } From 906f4690b9b5fea9f1ba708fa19162e9c0b27823 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 27 Jan 2022 18:31:26 -0500 Subject: [PATCH 47/50] Add early exit --- cpp/src/stream_compaction/distinct_count.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 41a16c33f33..12e2ceccf40 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -220,15 +220,16 @@ cudf::size_type distinct_count(column_view const& input, auto const is_nan = nan_is_null and should_check_nan ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i) : false; + if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } + if (i == 0) { return true; } if (count_nulls and nan_is_null and (is_nan or is_null)) { - return i == 0 or - not(device_view.is_null(i - 1) or should_check_nan - ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) - : false); - } else if (not count_nulls and (is_null or (nan_is_null and is_nan))) { - return false; + auto const prev_is_nan = + should_check_nan + ? 
cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) + : false; + return not(device_view.is_null(i - 1) or prev_is_nan); } - return i == 0 or not comp(i, i - 1); + return not comp(i, i - 1); }); } From c8a3e879577c1d9e01b2729d8ea0d5d30ca87884 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 28 Jan 2022 10:40:36 -0500 Subject: [PATCH 48/50] Fix cuco pair issues with the latest cuco tag --- cpp/cmake/thirdparty/get_cucollections.cmake | 2 +- cpp/src/stream_compaction/distinct_count.cu | 8 ++------ cpp/src/stream_compaction/drop_duplicates.cu | 8 ++------ 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 0acebeaf4ac..5a20f78b798 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,7 +21,7 @@ function(find_and_configure_cucollections) cuco 0.0 GLOBAL_TARGETS cuco::cuco CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 0ca860b824f5dc22cf8a41f09912e62e11f07d82 + GIT_TAG 6ec8b6dcdeceea07ab4456d32461a05c18864411 OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 12e2ceccf40..a9a3fb3791c 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -169,12 +169,8 @@ cudf::size_type unordered_distinct_count(table_view const& keys, compaction_hash hash_key{has_null, *table_ptr}; row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); - auto iter = cudf::detail::make_counting_transform_iterator(0, [] __device__(size_type i) { - // TODO: cuco::make_pair currently requires rvalue references. We - // create a copy to avoid double-move invoking undefined behavior. 
- auto ii = i; - return cuco::make_pair(std::move(i), std::move(ii)); - }); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); // when nulls are equal, insert non-null rows only to improve efficiency if (nulls_equal == null_equality::EQUAL and has_null) { diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 7d0000dc346..2fd1f530b6d 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -158,12 +158,8 @@ std::unique_ptr
unordered_drop_duplicates(table_view const& input, compaction_hash hash_key{has_null, *table_ptr}; row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); - auto iter = cudf::detail::make_counting_transform_iterator(0, [] __device__(size_type i) { - // TODO: cuco::make_pair currently requires rvalue references. We - // create a copy to avoid double-move invoking undefined behavior. - auto ii = i; - return cuco::make_pair(std::move(i), std::move(ii)); - }); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); // insert unique indices into the map. key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); From 070d5ce4561406de740ffc0b40d1cab978b60d3b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 1 Feb 2022 19:05:19 -0500 Subject: [PATCH 49/50] Address review comments --- cpp/src/dictionary/add_keys.cu | 6 +++--- cpp/src/stream_compaction/distinct_count.cu | 20 +++++++++---------- .../drop_duplicates_tests.cpp | 4 ++-- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index ee3a05285cf..96b7fd48dc9 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -58,15 +58,15 @@ std::unique_ptr add_keys( // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); - // sort and remove any duplicates from the combined keys - // unordered_drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] + + // Drop duplicates from the combined keys, then sort the result. 
+ // sort(unordered_drop_duplicates([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] auto table_keys = cudf::detail::unordered_drop_duplicates(table_view{{combined_keys->view()}}, std::vector{0}, // only one key column null_equality::EQUAL, stream, mr); - std::vector column_order{order::ASCENDING}; std::vector null_precedence{null_order::AFTER}; // should be no nulls here auto sorted_keys = diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index a9a3fb3791c..b49c12fc7c5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -213,17 +213,15 @@ cudf::size_type distinct_count(column_view const& input, thrust::counting_iterator(num_rows), [count_nulls, nan_is_null, should_check_nan, device_view, comp] __device__(cudf::size_type i) { auto const is_null = device_view.is_null(i); - auto const is_nan = nan_is_null and should_check_nan - ? cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i) - : false; + auto const is_nan = nan_is_null and should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i); if (not count_nulls and (is_null or (nan_is_null and is_nan))) { return false; } if (i == 0) { return true; } if (count_nulls and nan_is_null and (is_nan or is_null)) { auto const prev_is_nan = - should_check_nan - ? 
cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1) - : false; - return not(device_view.is_null(i - 1) or prev_is_nan); + should_check_nan and + cudf::type_dispatcher(device_view.type(), check_nan{}, device_view, i - 1); + return not(prev_is_nan or device_view.is_null(i - 1)); } return not comp(i, i - 1); }); @@ -236,11 +234,11 @@ cudf::size_type unordered_distinct_count(column_view const& input, { if (0 == input.size() or input.null_count() == input.size()) { return 0; } - // Check for Nans + // Check for NaNs // Checking for nulls in input and flag nan_handling, as the count will // only get affected if these two conditions are true. NaN will only be - // be an extra if nan_handling was NAN_IS_NULL and input also had null, which - // will increase the count by 1. + // double-counted as a null if nan_handling was NAN_IS_NULL and input also + // had null values. If so, we decrement the count. auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and cudf::type_dispatcher(input.type(), has_nans{}, input, stream); auto const has_null = input.has_nulls(); @@ -248,8 +246,8 @@ cudf::size_type unordered_distinct_count(column_view const& input, auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); // if nan is considered null and there are already null values - if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } if (null_handling == null_policy::EXCLUDE and has_null) { --count; } + if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } return count; } } // namespace detail diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 052e79222ff..d49b8208094 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -115,7 +115,7 @@ TEST_F(DropDuplicates, NonNullTable) 
cudf::table_view input{{col1, col2, col1_key, col2_key}}; std::vector keys{2, 3}; - // Keep the first of duplicate + // Keep the first duplicate row // The expected table would be sorted in ascending order with respect to keys cudf::test::fixed_width_column_wrapper exp_col1_first{{5, 5, 5, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_first{{4, 4, 4, 3, 9}}; @@ -128,7 +128,7 @@ TEST_F(DropDuplicates, NonNullTable) CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, got_first->view()); - // Keep the last of duplicate + // Keep the last duplicate row cudf::test::fixed_width_column_wrapper exp_col1_last{{5, 5, 4, 3, 8}}; cudf::test::fixed_width_column_wrapper exp_col2_last{{4, 4, 5, 3, 9}}; cudf::test::fixed_width_column_wrapper exp_col1_key_last{{9, 19, 20, 20, 21}}; From a60c128f3f107858dac4cb18e6deac5cce086fd7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 2 Feb 2022 12:47:58 -0500 Subject: [PATCH 50/50] Address review + update comments --- cpp/src/stream_compaction/distinct_count.cu | 23 ++++++++++++--------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index b49c12fc7c5..2c7488084b5 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -234,19 +234,22 @@ cudf::size_type unordered_distinct_count(column_view const& input, { if (0 == input.size() or input.null_count() == input.size()) { return 0; } - // Check for NaNs - // Checking for nulls in input and flag nan_handling, as the count will - // only get affected if these two conditions are true. NaN will only be - // double-counted as a null if nan_handling was NAN_IS_NULL and input also - // had null values. If so, we decrement the count. 
- auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and - cudf::type_dispatcher(input.type(), has_nans{}, input, stream); - auto const has_null = input.has_nulls(); - auto count = detail::unordered_distinct_count(table_view{{input}}, null_equality::EQUAL, stream); - // if nan is considered null and there are already null values + // Check for nulls. If the null policy is EXCLUDE and null values were found, + // we decrement the count. + auto const has_null = input.has_nulls(); if (null_handling == null_policy::EXCLUDE and has_null) { --count; } + + // Check for NaNs. There are two cases that can lead to decrementing the + // count. The first case is when the input has no nulls, but has NaN values + // handled as a null via NAN_IS_NULL and has a policy to EXCLUDE null values + // from the count. The second case is when the input has null values and NaN + // values handled as nulls via NAN_IS_NULL. Regardless of whether the null + // policy is set to EXCLUDE, we decrement the count to avoid double-counting + // null and NaN as distinct entities. + auto const has_nan_as_null = (nan_handling == nan_policy::NAN_IS_NULL) and + cudf::type_dispatcher(input.type(), has_nans{}, input, stream); if (has_nan_as_null and (has_null or null_handling == null_policy::EXCLUDE)) { --count; } return count; }