Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support nested types for nth_element reduction #9043

Merged
merged 10 commits into from
Aug 24, 2021
14 changes: 14 additions & 0 deletions cpp/include/cudf/scalar/scalar_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,20 @@ std::unique_ptr<scalar> make_default_constructed_scalar(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Creates an empty (invalid) scalar of the same type as the `input` column_view.
*
* @throw cudf::logic_error if the `input` column is struct type and empty
*
* @param input Immutable view of input column to emulate
* @param stream CUDA stream used for device memory operations.
* @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool.
*/
std::unique_ptr<scalar> make_empty_scalar_like(
column_view const& input,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Construct scalar using the given value of fixed width type
*
Expand Down
19 changes: 11 additions & 8 deletions cpp/src/reductions/reductions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar_factories.hpp>

#include <cudf/structs/structs_column_view.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf {
Expand Down Expand Up @@ -112,15 +113,17 @@ std::unique_ptr<scalar> reduce(
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
std::unique_ptr<scalar> result = make_default_constructed_scalar(output_dtype, stream, mr);
result->set_valid_async(false, stream);

// check if input column is empty
if (col.size() <= col.null_count()) return result;
// Returns default scalar if input column is non-valid. In terms of nested columns, we need to
// handcraft the default scalar with input column.
if (col.size() <= col.null_count()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like we need something like a make_empty_scalar_like(column_view)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. And I created the method make_empty_scalar_like based on the above code.

if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) {
return make_default_constructed_scalar(output_dtype, stream, mr);
}
return make_empty_scalar_like(col, stream, mr);
}

result =
aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg);
return result;
return aggregation_dispatcher(
agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg);
}
} // namespace detail

Expand Down
23 changes: 23 additions & 0 deletions cpp/src/scalar/scalar_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <cudf/copying.hpp>
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#include <cudf/detail/copy.hpp>
#include <rmm/cuda_stream_view.hpp>

namespace cudf {
Expand Down Expand Up @@ -165,4 +167,25 @@ std::unique_ptr<scalar> make_default_constructed_scalar(data_type type,
return type_dispatcher(type, default_scalar_functor{}, stream, mr);
}

std::unique_ptr<scalar> make_empty_scalar_like(column_view const& column,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
std::unique_ptr<scalar> result;
switch (column.type().id()) {
case type_id::LIST:
result = make_list_scalar(empty_like(column)->view(), stream, mr);
result->set_valid_async(false, stream);
break;
case type_id::STRUCT:
// Struct scalar inputs must have exactly 1 row.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should have "at least one row", I think...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If so, you don't have to have your own CUDF_EXPECT because rows == 1 will be handled in the scalar constructor.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, then you should use make_struct_scalar too :|

ttnghia marked this conversation as resolved.
Show resolved Hide resolved
CUDF_EXPECTS(!column.is_empty(), "Can not create empty struct scalar");
result = detail::get_element(column, 1, stream, mr);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

index=1 ? (should be 0 right?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about result = make_struct_scalar(column, stream, mr)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make_struct_scalar only accepts table_view or host_span<column_view>. In addition, it assumes the size of input data == 1 (rather than slicing them to 1).

Copy link
Contributor

@ttnghia ttnghia Aug 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, expecting full_size() == 1 (not sliced_size() == 1) should be wrong!
OK get it. So you are just slicing the input column (get one row) from it and don't care how many rows it has.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

index=1 ? (should be 0 right?)

Yes, I corrected it. Thanks!

result->set_valid_async(false, stream);
break;
default: result = make_default_constructed_scalar(column.type(), stream, mr);
}
return result;
}

} // namespace cudf
96 changes: 96 additions & 0 deletions cpp/tests/groupby/nth_element_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,5 +405,101 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput)
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2));
}

struct groupby_nth_element_structs_test : BaseFixture {
};

TEST_F(groupby_nth_element_structs_test, Basics)
{
using structs = cudf::test::structs_column_wrapper;
using ints = cudf::test::fixed_width_column_wrapper<int>;
using doubles = cudf::test::fixed_width_column_wrapper<double>;
using strings = cudf::test::strings_column_wrapper;

auto keys = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3};
auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
auto child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
auto child2 = strings{"", "a", "b", "c", "d", "e", "f", "g", "HH", "JJJ"};
auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};

auto expected_keys = ints{0, 1, 2, 3};
auto expected_ch0 = ints{1, 4, 7, 0};
auto expected_ch1 = doubles{1.2, 4.51, -0.07, 0.0};
auto expected_ch2 = strings{"a", "d", "g", ""};
auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(1));

expected_keys = ints{0, 1, 2, 3};
expected_ch0 = ints{0, 4, 6, 9};
expected_ch1 = doubles{0.1, 4.51, 6.3231, 9.999};
expected_ch2 = strings{"", "d", "f", "JJJ"};
expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
test_single_agg(keys,
values,
expected_keys,
expected_values,
cudf::make_nth_element_aggregation(0, null_policy::EXCLUDE));
}

TEST_F(groupby_nth_element_structs_test, NestedStructs)
{
using structs = cudf::test::structs_column_wrapper;
using ints = cudf::test::fixed_width_column_wrapper<int>;
using doubles = cudf::test::fixed_width_column_wrapper<double>;
using lists = cudf::test::lists_column_wrapper<int>;

auto keys = ints{0, 0, 0, 1, 1, 1, 2, 2, 2, 3};
auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
auto child0_of_child1 = ints{0, -1, -2, -3, -4, -5, -6, -7, -8, -9};
auto child1_of_child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
auto child1 = structs{child0_of_child1, child1_of_child1};
auto child2 = lists{{0}, {1, 2, 3}, {}, {4}, {5, 6}, {}, {}, {7}, {8, 9}, {}};
auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};

auto expected_keys = ints{0, 1, 2, 3};
auto expected_ch0 = ints{1, 4, 7, 0};
auto expected_ch0_of_ch1 = ints{-1, -4, -7, 0};
auto expected_ch1_of_ch1 = doubles{1.2, 4.51, -0.07, 0.0};
auto expected_ch1 = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
auto expected_ch2 = lists{{1, 2, 3}, {5, 6}, {7}, {}};
auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(1));

expected_keys = ints{0, 1, 2, 3};
expected_ch0 = ints{0, 4, 6, 9};
expected_ch0_of_ch1 = ints{0, -4, -6, -9};
expected_ch1_of_ch1 = doubles{0.1, 4.51, 6.3231, 9.999};
expected_ch1 = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
expected_ch2 = lists{{0}, {5, 6}, {}, {}};
expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
test_single_agg(keys,
values,
expected_keys,
expected_values,
cudf::make_nth_element_aggregation(0, null_policy::EXCLUDE));
}

TEST_F(groupby_nth_element_structs_test, EmptyInput)
{
using structs = cudf::test::structs_column_wrapper;
using ints = cudf::test::fixed_width_column_wrapper<int>;
using doubles = cudf::test::fixed_width_column_wrapper<double>;
using strings = cudf::test::strings_column_wrapper;

auto keys = ints{};
auto child0 = ints{};
auto child1 = doubles{};
auto child2 = strings{};
auto values = structs{{child0, child1, child2}};

auto expected_keys = ints{};
auto expected_ch0 = ints{};
auto expected_ch1 = doubles{};
auto expected_ch2 = strings{};
auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}};
test_single_agg(
keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0));
}
} // namespace test
} // namespace cudf
Loading