Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support structs for cudf::contains with column/scalar input #9929

Merged
merged 15 commits into from
Jan 15, 2022
66 changes: 52 additions & 14 deletions cpp/src/search/search.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -173,11 +173,56 @@ bool contains_scalar_dispatch::operator()<cudf::list_view>(column_view const&,
}

// Specialization of `contains_scalar_dispatch` for structs columns: returns true when at least
// one row of `col` compares equal to the struct scalar `value`.
// NOTE(review): this span is a rendered diff without +/- markers. The unnamed-parameter
// signature and the `CUDF_FAIL` line appear to be the removed pre-change lines, and the two
// "conversation resolved" lines are review-UI residue rather than code.
template <>
bool contains_scalar_dispatch::operator()<cudf::struct_view>(column_view const&,
scalar const&,
rmm::cuda_stream_view)
bool contains_scalar_dispatch::operator()<cudf::struct_view>(column_view const& col,
scalar const& value,
rmm::cuda_stream_view stream)
{
CUDF_FAIL("struct_view type not supported yet");
// Reject mismatched inputs up front: the top-level types, the number of children, and each
// child's type must all agree before rows can be compared member by member.
CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match");

// View of the scalar's member columns; row 0 of this table is the value being searched for.
auto const scalar_table = static_cast<struct_scalar const*>(&value)->view();
CUDF_EXPECTS(col.num_children() == scalar_table.num_columns(),
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
"struct scalar and structs column must have the same number of children");
for (size_type i = 0; i < col.num_children(); ++i) {
CUDF_EXPECTS(col.child(i).type() == scalar_table.column(i).type(),
"scalar and column children types must match");
}

// Prepare to flatten the structs column and scalar.
// NOTE(review): only the children of `col` are inspected here, not `col`'s own null mask --
// confirm that top-level struct nulls are always reflected in the children.
auto const has_null_elements =
has_nested_nulls(table_view{std::vector<column_view>{col.child_begin(), col.child_end()}}) ||
has_nested_nulls(scalar_table);
auto const flatten_nullability = has_null_elements
? structs::detail::column_nullability::FORCE
: structs::detail::column_nullability::MATCH_INCOMING;

// Flatten the input structs column, only materialize the bitmask if there is null in the input.
auto const col_flattened =
structs::detail::flatten_nested_columns(table_view{{col}}, {}, {}, flatten_nullability);
auto const val_flattened =
structs::detail::flatten_nested_columns(scalar_table, {}, {}, flatten_nullability);

// The struct scalar only contains the struct member columns.
// Thus, if there is any null in the input, we must exclude the first column in the flattened
// table of the input column from searching because that column is the materialized bitmask of
// the input structs column.
auto const col_flattened_content = col_flattened.flattened_columns();
// The bool-to-size_type cast yields an offset of 1 when nulls are present and 0 otherwise.
auto const col_flattened_children = table_view{std::vector<column_view>{
col_flattened_content.begin() + static_cast<size_type>(has_null_elements),
col_flattened_content.end()}};

auto const d_col_children_ptr = table_device_view::create(col_flattened_children, stream);
auto const d_val_ptr = table_device_view::create(val_flattened, stream);

// Search row indices [0, col.size()) for the first row equal to row 0 of the scalar table.
auto const start_iter = thrust::make_counting_iterator<size_type>(0);
auto const end_iter = start_iter + col.size();
auto const comp = row_equality_comparator(
nullate::DYNAMIC{has_null_elements}, *d_col_children_ptr, *d_val_ptr, null_equality::EQUAL);
auto const found_iter = thrust::find_if(
rmm::exec_policy(stream), start_iter, end_iter, [comp] __device__(auto const idx) {
return comp(idx, 0); // compare col[idx] == val[0].
});

// `find_if` returns `end_iter` when no row satisfied the predicate.
return found_iter != end_iter;
}

template <>
Expand All @@ -203,7 +248,6 @@ namespace detail {
bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream)
{
if (col.is_empty()) { return false; }

if (not value.is_valid(stream)) { return col.has_nulls(); }

return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream);
Expand Down Expand Up @@ -264,20 +308,14 @@ struct multi_contains_dispatch {

// Specialization of `multi_contains_dispatch` for list columns: the operation is not
// implemented for this type, so every call fails through `CUDF_FAIL`.
// NOTE(review): this span is a rendered diff -- the four named-parameter lines appear to be the
// removed signature and the single unnamed-parameter line its replacement.
template <>
std::unique_ptr<column> multi_contains_dispatch::operator()<list_view>(
column_view const& haystack,
column_view const& needles,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
{
CUDF_FAIL("list_view type not supported");
}

// Specialization of `multi_contains_dispatch` for structs columns: the operation is not
// implemented for this type, so every call fails through `CUDF_FAIL`.
// NOTE(review): this span is a rendered diff -- the four named-parameter lines appear to be the
// removed signature and the single unnamed-parameter line its replacement.
template <>
std::unique_ptr<column> multi_contains_dispatch::operator()<struct_view>(
column_view const& haystack,
column_view const& needles,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
{
CUDF_FAIL("struct_view type not supported");
}
Expand Down
241 changes: 236 additions & 5 deletions cpp/tests/search/search_struct_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,7 @@
#include <cudf_test/type_lists.hpp>

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/search.hpp>
#include <cudf/table/table_view.hpp>

Expand All @@ -35,15 +36,14 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve
constexpr int32_t null{0}; // Mark for null child elements
constexpr int32_t XXX{0}; // Mark for null struct elements

// NOTE(review): the fixture below appears twice because this span is a rendered diff; the
// change moves the definition from before `TestTypes` to after it.
template <typename T>
struct TypedStructSearchTest : public cudf::test::BaseFixture {
};

// Element types that the typed test suites in this file are instantiated with.
using TestTypes = cudf::test::Concat<cudf::test::IntegralTypesNotBool,
cudf::test::FloatingPointTypes,
cudf::test::DurationTypes,
cudf::test::TimestampTypes>;

// Typed fixture for the struct search tests; registered for every type in `TestTypes`.
template <typename T>
struct TypedStructSearchTest : public cudf::test::BaseFixture {
};
TYPED_TEST_SUITE(TypedStructSearchTest, TestTypes);

namespace {
Expand Down Expand Up @@ -353,3 +353,234 @@ TYPED_TEST(TypedStructSearchTest, ComplexStructTest)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), verbosity);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity);
}

// Typed fixture for the `cudf::contains(structs column, struct scalar)` tests; the suite is
// registered for every element type listed in `TestTypes`.
template <class T>
struct TypedScalarStructContainTest : cudf::test::BaseFixture {
};
TYPED_TEST_SUITE(TypedScalarStructContainTest, TestTypes);

// Searching a zero-row structs column is expected to report "not found".
TYPED_TEST(TypedScalarStructContainTest, EmptyInputTest)
{
  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;

  // Structs column with a single, empty child.
  auto const haystack = [] {
    auto empty_member = col_wrapper{};
    return structs_col{{empty_member}};
  }();

  // Struct scalar with a single member holding 1.
  auto const needle = [] {
    auto member = col_wrapper{1};
    return cudf::struct_scalar(std::vector<cudf::column_view>{member});
  }();

  EXPECT_FALSE(cudf::contains(haystack, needle));
}

// Null-free, unsliced input: one scalar equals the first row, the other differs only in its
// string member.
TYPED_TEST(TypedScalarStructContainTest, TrivialInputTests)
{
  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;

  // Rows: (1, 4, "x"), (2, 5, "y"), (3, 6, "z").
  auto const haystack = [] {
    auto first  = col_wrapper{1, 2, 3};
    auto second = col_wrapper{4, 5, 6};
    auto third  = strings_col{"x", "y", "z"};
    return structs_col{{first, second, third}};
  }();

  // (1, 4, "x") -- identical to row 0.
  auto const present = [] {
    auto first  = col_wrapper{1};
    auto second = col_wrapper{4};
    auto third  = strings_col{"x"};
    return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
  }();

  // (1, 4, "a") -- no row carries the string "a".
  auto const absent = [] {
    auto first  = col_wrapper{1};
    auto second = col_wrapper{4};
    auto third  = strings_col{"a"};
    return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
  }();

  EXPECT_TRUE(cudf::contains(haystack, present));
  EXPECT_FALSE(cudf::contains(haystack, absent));
}

// Sliced, null-free input: only rows inside the slice [2, 5) may produce a match.
TYPED_TEST(TypedScalarStructContainTest, SlicedColumnInputTests)
{
  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;

  constexpr int32_t dont_care{0};

  // Six rows; rows 0, 1 and 5 are filler that lies outside the slice taken below.
  auto const unsliced = [] {
    auto first  = col_wrapper{dont_care, dont_care, 1, 2, 3, dont_care};
    auto second = col_wrapper{dont_care, dont_care, 4, 5, 6, dont_care};
    auto third  = strings_col{"dont_care", "dont_care", "x", "y", "z", "dont_care"};
    return structs_col{{first, second, third}};
  }();
  auto const haystack = cudf::slice(unsliced, {2, 5})[0];

  // (1, 4, "x") -- the first row of the slice.
  auto const inside_slice = [] {
    auto first  = col_wrapper{1};
    auto second = col_wrapper{4};
    auto third  = strings_col{"x"};
    return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
  }();

  // Equal to the filler rows only, which the slice excludes.
  auto const outside_slice = [] {
    auto first  = col_wrapper{dont_care};
    auto second = col_wrapper{dont_care};
    auto third  = strings_col{"dont_care"};
    return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
  }();

  EXPECT_TRUE(cudf::contains(haystack, inside_slice));
  EXPECT_FALSE(cudf::contains(haystack, outside_slice));
}

// Unsliced input with nulls placed, in turn, on the struct rows, on the child columns, and in
// the searched scalar.
TYPED_TEST(TypedScalarStructContainTest, SimpleInputWithNullsTests)
{
  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;

  constexpr int32_t null{0};

  // Case 1: the struct row at index 1 is null.
  {
    auto const haystack = [] {
      auto first  = col_wrapper{1, null, 3};
      auto second = col_wrapper{4, null, 6};
      auto third  = strings_col{"x", "" /*NULL*/, "z"};
      return structs_col{{first, second, third}, null_at(1)};
    }();

    // (1, 4, "x") -- equal to row 0.
    auto const present = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{"x"};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    // (1, 4, "a") -- equal to no row.
    auto const absent = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{"a"};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    EXPECT_TRUE(cudf::contains(haystack, present));
    EXPECT_FALSE(cudf::contains(haystack, absent));
  }

  // Case 2: the nulls live in the child columns.
  {
    auto const haystack = [] {
      auto first  = col_wrapper{{1, null, 3}, null_at(1)};
      auto second = col_wrapper{{4, null, 6}, null_at(1)};
      auto third  = strings_col{{"" /*NULL*/, "y", "z"}, null_at(0)};
      return structs_col{{first, second, third}};
    }();

    // (1, 4, NULL) -- expected to match row 0, whose string member is also null.
    auto const present = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{{"" /*NULL*/}, null_at(0)};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    // (1, 4, "") -- a valid empty string is expected not to match the null string of row 0.
    auto const absent = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{""};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    EXPECT_TRUE(cudf::contains(haystack, present));
    EXPECT_FALSE(cudf::contains(haystack, absent));
  }

  // Case 3: the column is null-free and only the searched scalar may carry a null.
  {
    auto const haystack = [] {
      auto first  = col_wrapper{1, 2, 3};
      auto second = col_wrapper{4, 5, 6};
      auto third  = strings_col{"x", "y", "z"};
      return structs_col{{first, second, third}};
    }();

    // (1, 4, "x") -- equal to row 0.
    auto const present = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{"x"};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    // (1, 4, NULL) -- the column holds no null string, so no row is expected to match.
    auto const absent = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{{"" /*NULL*/}, null_at(0)};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    EXPECT_TRUE(cudf::contains(haystack, present));
    EXPECT_FALSE(cudf::contains(haystack, absent));
  }
}

// Sliced input with nulls: combines the slice [2, 5) with nulls on the struct rows and on the
// child columns.
TYPED_TEST(TypedScalarStructContainTest, SlicedInputWithNullsTests)
{
  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;

  constexpr int32_t dont_care{0};
  constexpr int32_t null{0};

  // Case 1: the struct row at index 3 of the unsliced column is null.
  {
    auto const unsliced = [] {
      auto first  = col_wrapper{dont_care, dont_care, 1, null, 3, dont_care};
      auto second = col_wrapper{dont_care, dont_care, 4, null, 6, dont_care};
      auto third  = strings_col{"dont_care", "dont_care", "x", "" /*NULL*/, "z", "dont_care"};
      return structs_col{{first, second, third}, null_at(3)};
    }();
    auto const haystack = cudf::slice(unsliced, {2, 5})[0];

    // (1, 4, "x") -- the first row of the slice.
    auto const present = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{"x"};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    // (1, 4, "a") -- equal to no row.
    auto const absent = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{"a"};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    EXPECT_TRUE(cudf::contains(haystack, present));
    EXPECT_FALSE(cudf::contains(haystack, absent));
  }

  // Case 2: nulls in the child columns, plus a null struct row (index 1) outside the slice.
  {
    auto const unsliced = [] {
      auto first =
        col_wrapper{{dont_care, dont_care /*also NULL*/, 1, null, 3, dont_care}, null_at(3)};
      auto second =
        col_wrapper{{dont_care, dont_care /*also NULL*/, 4, null, 6, dont_care}, null_at(3)};
      auto third = strings_col{
        {"dont_care", "dont_care" /*also NULL*/, "" /*NULL*/, "y", "z", "dont_care"}, null_at(2)};
      return structs_col{{first, second, third}, null_at(1)};
    }();
    auto const haystack = cudf::slice(unsliced, {2, 5})[0];

    // (1, 4, NULL) -- the first row of the slice also has a null string member.
    auto const present = [] {
      auto first  = col_wrapper{1};
      auto second = col_wrapper{4};
      auto third  = strings_col{{"x"}, null_at(0)};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    // Equal to the filler rows only, which the slice excludes.
    auto const absent = [] {
      auto first  = col_wrapper{dont_care};
      auto second = col_wrapper{dont_care};
      auto third  = strings_col{"dont_care"};
      return cudf::struct_scalar(std::vector<cudf::column_view>{first, second, third});
    }();

    EXPECT_TRUE(cudf::contains(haystack, present));
    EXPECT_FALSE(cudf::contains(haystack, absent));
  }
}