Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support nested types for nth_element reduction #9043

Merged
merged 10 commits into from
Aug 24, 2021
14 changes: 14 additions & 0 deletions cpp/include/cudf/scalar/scalar_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,20 @@ std::unique_ptr<scalar> make_default_constructed_scalar(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Creates an empty (invalid) scalar of the same type as the `input` column_view.
*
* @throw cudf::logic_error if the `input` column is struct type and empty
*
* @param input Immutable view of input column to emulate
* @param stream CUDA stream used for device memory operations.
* @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool.
*/
std::unique_ptr<scalar> make_empty_scalar_like(
column_view const& input,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Construct scalar using the given value of fixed width type
*
Expand Down
19 changes: 11 additions & 8 deletions cpp/src/reductions/reductions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar_factories.hpp>

#include <cudf/structs/structs_column_view.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf {
Expand Down Expand Up @@ -112,15 +113,17 @@ std::unique_ptr<scalar> reduce(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
std::unique_ptr<scalar> result = make_default_constructed_scalar(output_dtype, stream, mr);
result->set_valid_async(false, stream);

// check if input column is empty
if (col.size() <= col.null_count()) return result;
// Returns default scalar if input column is non-valid. In terms of nested columns, we need to
// handcraft the default scalar with input column.
if (col.size() <= col.null_count()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like we need something like a make_empty_scalar_like(column_view)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. And I created the method make_empty_scalar_like based on the above code.

if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) {
return make_default_constructed_scalar(output_dtype, stream, mr);
}
return make_empty_scalar_like(col, stream, mr);
}

result =
aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg);
return result;
return aggregation_dispatcher(
agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg);
}
} // namespace detail

Expand Down
21 changes: 21 additions & 0 deletions cpp/src/scalar/scalar_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <cudf/detail/copy.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf {
Expand Down Expand Up @@ -165,4 +166,24 @@ std::unique_ptr<scalar> make_default_constructed_scalar(data_type type,
return type_dispatcher(type, default_scalar_functor{}, stream, mr);
}

std::unique_ptr<scalar> make_empty_scalar_like(column_view const& column,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
std::unique_ptr<scalar> result;
switch (column.type().id()) {
case type_id::LIST:
result = make_list_scalar(empty_like(column)->view(), stream, mr);
result->set_valid_async(false, stream);
break;
case type_id::STRUCT:
// The input column must have at least 1 row to extract a scalar (row) from it.
result = detail::get_element(column, 0, stream, mr);
result->set_valid_async(false, stream);
break;
default: result = make_default_constructed_scalar(column.type(), stream, mr);
}
return result;
}

} // namespace cudf
96 changes: 96 additions & 0 deletions cpp/tests/groupby/nth_element_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,5 +405,101 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput)
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2));
}

struct groupby_nth_element_structs_test : BaseFixture {
};

TEST_F(groupby_nth_element_structs_test, Basics)
{
using structs = cudf::test::structs_column_wrapper;
using ints = cudf::test::fixed_width_column_wrapper<int>;
using doubles = cudf::test::fixed_width_column_wrapper<double>;
using strings = cudf::test::strings_column_wrapper;

auto keys = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3};
auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
auto child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
auto child2 = strings{"", "a", "b", "c", "d", "e", "f", "g", "HH", "JJJ"};
auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};

auto expected_keys = ints{0, 1, 2, 3};
auto expected_ch0 = ints{1, 4, 7, 0};
auto expected_ch1 = doubles{1.2, 4.51, -0.07, 0.0};
auto expected_ch2 = strings{"a", "d", "g", ""};
auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(1));

expected_keys = ints{0, 1, 2, 3};
expected_ch0 = ints{0, 4, 6, 9};
expected_ch1 = doubles{0.1, 4.51, 6.3231, 9.999};
expected_ch2 = strings{"", "d", "f", "JJJ"};
expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
test_single_agg(keys,
values,
expected_keys,
expected_values,
cudf::make_nth_element_aggregation(0, null_policy::EXCLUDE));
}

TEST_F(groupby_nth_element_structs_test, NestedStructs)
{
using structs = cudf::test::structs_column_wrapper;
using ints = cudf::test::fixed_width_column_wrapper<int>;
using doubles = cudf::test::fixed_width_column_wrapper<double>;
using lists = cudf::test::lists_column_wrapper<int>;

auto keys = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3};
auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
auto child0_of_child1 = ints{0, -1, -2, -3, -4, -5, -6, -7, -8, -9};
auto child1_of_child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
auto child1 = structs{child0_of_child1, child1_of_child1};
auto child2 = lists{{0}, {1, 2, 3}, {}, {4}, {5, 6}, {}, {}, {7}, {8, 9}, {}};
auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};

auto expected_keys = ints{0, 1, 2, 3};
auto expected_ch0 = ints{1, 4, 7, 0};
auto expected_ch0_of_ch1 = ints{-1, -4, -7, 0};
auto expected_ch1_of_ch1 = doubles{1.2, 4.51, -0.07, 0.0};
auto expected_ch1 = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
auto expected_ch2 = lists{{1, 2, 3}, {5, 6}, {7}, {}};
auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(1));

expected_keys = ints{0, 1, 2, 3};
expected_ch0 = ints{0, 4, 6, 9};
expected_ch0_of_ch1 = ints{0, -4, -6, -9};
expected_ch1_of_ch1 = doubles{0.1, 4.51, 6.3231, 9.999};
expected_ch1 = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
expected_ch2 = lists{{0}, {5, 6}, {}, {}};
expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
test_single_agg(keys,
values,
expected_keys,
expected_values,
cudf::make_nth_element_aggregation(0, null_policy::EXCLUDE));
}

TEST_F(groupby_nth_element_structs_test, EmptyInput)
{
using structs = cudf::test::structs_column_wrapper;
using ints = cudf::test::fixed_width_column_wrapper<int>;
using doubles = cudf::test::fixed_width_column_wrapper<double>;
using strings = cudf::test::strings_column_wrapper;

auto keys = ints{};
auto child0 = ints{};
auto child1 = doubles{};
auto child2 = strings{};
auto values = structs{{child0, child1, child2}};

auto expected_keys = ints{};
auto expected_ch0 = ints{};
auto expected_ch1 = doubles{};
auto expected_ch2 = strings{};
auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}};
test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0));
}
} // namespace test
} // namespace cudf
Loading