From afa07706f6fab0fe020f0af69896fd4090cad757 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 14 Jul 2021 00:34:18 +0530 Subject: [PATCH 1/4] add unit tests to replicate the issue #8717 --- cpp/tests/groupby/max_tests.cpp | 36 +++++++++++++++++++++++++++++++++ cpp/tests/groupby/min_tests.cpp | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index e0da55b080f..c4352825151 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -162,6 +162,42 @@ TEST_F(groupby_max_string_test, zero_valid_values) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } +TEST_F(groupby_max_string_test, max_sorted_strings) +{ + // testcase replicated in issue #8717 + cudf::test::strings_column_wrapper keys( + {"", "", "", "", "", "", "06", "06", "06", "06", "10", "10", "10", "10", "14", "14", + "14", "14", "18", "18", "18", "18", "22", "22", "22", "22", "26", "26", "26", "26", "30", "30", + "30", "30", "34", "34", "34", "34", "38", "38", "38", "38", "42", "42", "42", "42"}, + {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper vals( + {"", "", "", "", "", "", "06", "", "", "", "10", "", "", "", "14", "", + "", "", "18", "", "", "", "22", "", "", "", "26", "", "", "", "30", "", + "", "", "34", "", "", "", "38", "", "", "", "42", "", "", ""}, + {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}); + cudf::test::strings_column_wrapper expect_keys( + {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + cudf::test::strings_column_wrapper expect_vals( + {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + + // fixed_width_column_wrapper expect_argmax( + // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1}, + // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::NO, + null_policy::INCLUDE, + sorted::YES); +} + struct groupby_dictionary_max_test : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index 8f997875a78..f148e3c8210 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -162,6 +162,42 @@ TEST_F(groupby_min_string_test, zero_valid_values) test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } +TEST_F(groupby_min_string_test, min_sorted_strings) +{ + // testcase replicated in issue #8717 + cudf::test::strings_column_wrapper keys( + {"", "", "", "", "", "", "06", "06", "06", "06", "10", "10", "10", "10", "14", "14", + "14", "14", "18", "18", "18", "18", "22", "22", "22", "22", "26", "26", "26", "26", "30", "30", + "30", "30", "34", "34", "34", "34", "38", "38", "38", "38", "42", "42", "42", "42"}, + {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper vals( + {"", "", "", "", "", "", "06", "", "", "", "10", "", "", "", "14", "", + "", "", "18", "", "", "", "22", "", "", "", "26", "", "", "", "30", "", + "", "", "34", "", "", "", "38", "", "", "", "42", "", "", ""}, + {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0}); + cudf::test::strings_column_wrapper expect_keys( + {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + cudf::test::strings_column_wrapper expect_vals( + {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + + // fixed_width_column_wrapper expect_argmin( + // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1}, + // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + std::move(agg), + force_use_sort_impl::NO, + null_policy::INCLUDE, + sorted::YES); +} + struct groupby_dictionary_min_test : public cudf::test::BaseFixture { }; From 008abeaa5a00f9e7554711e803db9e7fa991bb5a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 14 Jul 2021 00:37:39 +0530 Subject: [PATCH 2/4] replace gather with gather_if on indices to fix missing SENTINEL values for argmin, argmax --- cpp/src/groupby/sort/group_argmax.cu | 31 ++++++++++++---------------- cpp/src/groupby/sort/group_argmin.cu | 31 ++++++++++++---------------- 2 files changed, 26 insertions(+), 36 deletions(-) diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index bed64c5147a..2d1a5ab548b 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -21,7 +21,7 @@ #include -#include +#include namespace cudf { namespace groupby { @@ -39,29 +39,24 @@ std::unique_ptr group_argmax(column_view const& values, num_groups, group_labels, stream, - rmm::mr::get_current_device_resource()); + mr); // The functor returns the index of maximum in the sorted values. // We need the index of maximum in the original unsorted values. // So use indices to gather the sort order used to sort `values`. // Gather map cannot be null so we make a view with the mask removed. // The values in data buffer of indices corresponding to null values was - // initialized to ARGMAX_SENTINEL which is an out of bounds index value (-1) - // and causes the gathered value to be null. - column_view null_removed_indices( - data_type(type_to_id()), - indices->size(), - static_cast(indices->view().template data())); - auto result_table = - cudf::detail::gather(table_view({key_sort_order}), - null_removed_indices, - indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - return std::move(result_table->release()[0]); + // initialized to ARGMAX_SENTINEL. Using gather_if. + // This can't use gather because nulls in gathered column will not store ARGMAX_SENTINEL. + auto indices_view = indices->mutable_view(); + thrust::gather_if(rmm::exec_policy(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.data(), // input + indices_view.begin(), // result + [] __device__(auto i) { return (i != cudf::detail::ARGMAX_SENTINEL); }); + return indices; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index ec97a609390..b96367c1b5f 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -21,7 +21,7 @@ #include -#include +#include namespace cudf { namespace groupby { @@ -39,29 +39,24 @@ std::unique_ptr group_argmin(column_view const& values, num_groups, group_labels, stream, - rmm::mr::get_current_device_resource()); + mr); // The functor returns the index of minimum in the sorted values. // We need the index of minimum in the original unsorted values. // So use indices to gather the sort order used to sort `values`. - // Gather map cannot be null so we make a view with the mask removed. // The values in data buffer of indices corresponding to null values was - // initialized to ARGMIN_SENTINEL which is an out of bounds index value (-1) - // and causes the gathered value to be null. - column_view null_removed_indices( - data_type(type_to_id()), - indices->size(), - static_cast(indices->view().template data())); - auto result_table = - cudf::detail::gather(table_view({key_sort_order}), - null_removed_indices, - indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); + // initialized to ARGMIN_SENTINEL. Using gather_if. + // This can't use gather because nulls in gathered column will not store ARGMIN_SENTINEL. + auto indices_view = indices->mutable_view(); + thrust::gather_if(rmm::exec_policy(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.data(), // input + indices_view.begin(), // result + [] __device__(auto i) { return (i != cudf::detail::ARGMIN_SENTINEL); }); - return std::move(result_table->release()[0]); + return indices; } } // namespace detail From 9cfff6039b62f37c7be1a5486f40956d92015476 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 14 Jul 2021 00:40:33 +0530 Subject: [PATCH 3/4] update copyright year --- cpp/src/groupby/sort/group_argmax.cu | 2 +- cpp/src/groupby/sort/group_argmin.cu | 2 +- cpp/tests/groupby/max_tests.cpp | 2 +- cpp/tests/groupby/min_tests.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index 2d1a5ab548b..0b612a10c30 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index b96367c1b5f..8e19c3ff22a 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index c4352825151..b5710d3f4bc 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index f148e3c8210..1544e867595 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 956e12e759a65d5c328d45ca93b1e330ef16538b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 14 Jul 2021 00:48:53 +0530 Subject: [PATCH 4/4] use begin --- cpp/src/groupby/sort/group_argmax.cu | 10 +++++----- cpp/src/groupby/sort/group_argmin.cu | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index 0b612a10c30..6ce23ffc35b 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -50,11 +50,11 @@ std::unique_ptr group_argmax(column_view const& values, // This can't use gather because nulls in gathered column will not store ARGMAX_SENTINEL. auto indices_view = indices->mutable_view(); thrust::gather_if(rmm::exec_policy(stream), - indices_view.begin(), // map first - indices_view.end(), // map last - indices_view.begin(), // stencil - key_sort_order.data(), // input - indices_view.begin(), // result + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.begin(), // input + indices_view.begin(), // result [] __device__(auto i) { return (i != cudf::detail::ARGMAX_SENTINEL); }); return indices; } diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index 8e19c3ff22a..ab91c2c0d29 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -49,11 +49,11 @@ std::unique_ptr group_argmin(column_view const& values, // This can't use gather because nulls in gathered column will not store ARGMIN_SENTINEL. auto indices_view = indices->mutable_view(); thrust::gather_if(rmm::exec_policy(stream), - indices_view.begin(), // map first - indices_view.end(), // map last - indices_view.begin(), // stencil - key_sort_order.data(), // input - indices_view.begin(), // result + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.begin(), // input + indices_view.begin(), // result [] __device__(auto i) { return (i != cudf::detail::ARGMIN_SENTINEL); }); return indices;