From 9ae85aee213dc9ebb5ec51390f65ae0c48c5cf8b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 26 Feb 2021 15:00:19 -0500 Subject: [PATCH] Add cudf::explode_position (#7376) This PR adds support for pos_explode in cuDF. It is very similar to explode, but includes a column with the index of the array that was exploded on each row. ``` a b [0, 1] 100 [2] 200 [3, 4, 5] 300 ``` exploded on column a would result in ``` a pos b 0 0 100 1 1 100 2 0 200 3 0 300 4 1 300 5 2 300 ``` partially fixes #6151 Authors: - Mike Wilson (@hyperbolic2346) Approvers: - David (@davidwendt) - Jake Hemstad (@jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7376 --- cpp/include/cudf/lists/lists_column_view.hpp | 23 ++++- cpp/include/cudf/reshape.hpp | 49 ++++++++- cpp/src/reshape/explode.cu | 101 ++++++++++++------- cpp/tests/reshape/explode_tests.cpp | 78 +++++++++++++- 4 files changed, 210 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index d494ee445b3..8cabf5287c8 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,6 +56,10 @@ class lists_column_view : private column_view { using column_view::null_mask; using column_view::offset; using column_view::size; + using offset_type = int32_t; + static_assert(std::is_same::value, + "offset_type is expected to be the same as size_type."); + using offset_iterator = offset_type const*; /** * @brief Returns the parent column. 
@@ -87,6 +91,23 @@ class lists_column_view : private column_view { * @throw cudf::logic error if this is an empty column */ column_view get_sliced_child(rmm::cuda_stream_view stream) const; + + /** + * @brief Return first offset (accounting for column offset) + * + * @return int32_t const* Pointer to the first offset + */ + offset_iterator offsets_begin() const noexcept + { + return offsets().begin() + offset(); + } + + /** + * @brief Return one past the last offset + * + * @return int32_t const* Pointer to one past the last offset + */ + offset_iterator offsets_end() const noexcept { return offsets_begin() + size(); } }; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 29c9fa2e720..a5a9db4e15f 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -125,8 +125,8 @@ std::unique_ptr byte_cast( * [null, 100], * [15, 100], * ``` - * Note that null lists are completely removed from the output - * and nulls and empty lists inside lists are pulled out and remain. + * Note that null lists are not included in the resulting table, but nulls inside + * lists and empty lists will be represented with a null entry for that column in that row. * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. @@ -139,6 +139,49 @@ std::unique_ptr explode( size_type explode_column_idx, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Explodes a list column's elements and includes a position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. 
The corresponding rows for other columns in the input are duplicated. A position + * column with each element's index in its list is inserted before the exploded column. Example: + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300], + * ``` + * + * Nulls and empty lists propagate in different ways depending on what is null or empty. + * ``` + * [[5,null,15], 100], + * [null, 200], + * [[], 300], + * returns + * [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * ``` + * Note that null lists and empty lists produce no rows in the resulting table, but nulls inside + * lists will be represented with a null entry for that column in that row. + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return A new table with the explode_column_idx column exploded and a position column before it. + */ +std::unique_ptr
explode_position( + table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index bc532893fb0..34d7d8fe31d 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include @@ -40,9 +40,13 @@ namespace { * @brief Function object for exploding a column. */ struct explode_functor { + /** + * @brief Function object for exploding a column. + */ template std::unique_ptr
operator()(table_view const& input_table, - size_type explode_column_idx, + size_type const explode_column_idx, + bool include_pos, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -55,58 +59,72 @@ struct explode_functor { template <> std::unique_ptr
explode_functor::operator()( table_view const& input_table, - size_type explode_column_idx, + size_type const explode_column_idx, + bool include_pos, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - /* we explode by building a gather map that includes the number of entries in each list inside - the column for each index. Interestingly, this can be done with lower_bound across the offsets - as values between the offsets will all map down to the index below. We have some off-by-one - manipulations we need to do with the output, but it's almost our gather map by itself. Once we - build the gather map we need to remove the explode column from the table and run gather on it. - Next we build the explode column, which turns out is simply lifting the child column out of the - explode column. This unrolls the top level of lists. Then we need to insert the explode column - back into the table and return it. */ lists_column_view lc{input_table.column(explode_column_idx)}; auto sliced_child = lc.get_sliced_child(stream); - rmm::device_uvector gather_map_indices(sliced_child.size(), stream, mr); + rmm::device_uvector gather_map_indices(sliced_child.size(), stream); - // sliced columns can make this a little tricky. We have to start iterating at the start of the - // offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets - // as we go. - auto offsets = lc.offsets().begin() + lc.offset(); + // Sliced columns may require rebasing of the offsets. + auto offsets = lc.offsets_begin(); + // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. auto offsets_minus_one = thrust::make_transform_iterator( - offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + offsets + 1, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); auto counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector pos(include_pos ? 
sliced_child.size() : 0, stream, mr); + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each - // result from `lower_bound` by 1 to build the correct gather map. It was pointed out that - // this can be accomplished by simply skipping the first entry and using the result of - // `lower_bound` directly. - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one + 1, - offsets_minus_one + lc.size() + 1, - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin()); + // result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by + // skipping the first entry and using the result of `lower_bound` directly. + if (include_pos) { + thrust::transform( + rmm::exec_policy(stream), + counting_iter, + counting_iter + gather_map_indices.size(), + gather_map_indices.begin(), + [position_array = pos.data(), offsets_minus_one, offsets, offset_size = lc.size()] __device__( + auto idx) -> size_type { + auto lb_idx = thrust::lower_bound( + thrust::seq, offsets_minus_one, offsets_minus_one + offset_size, idx) - + offsets_minus_one; + position_array[idx] = idx - (offsets[lb_idx] - offsets[0]); + return lb_idx; + }); + } else { + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + lc.size(), + counting_iter, + counting_iter + gather_map_indices.size(), + gather_map_indices.begin()); + } auto select_iter = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [explode_column_idx](size_type i) { return i >= explode_column_idx ? 
i + 1 : i; }); std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); - auto gathered_table = cudf::detail::gather( - input_table.select(selected_columns), - column_view(data_type(type_to_id()), sliced_child.size(), gather_map_indices.data()), - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::ALLOWED, - stream, - mr); + auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), + gather_map_indices.begin(), + gather_map_indices.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); std::vector> columns = gathered_table.release()->release(); columns.insert(columns.begin() + explode_column_idx, - std::make_unique(column(sliced_child, stream, mr))); + std::make_unique(sliced_child, stream, mr)); + + if (include_pos) { + columns.insert(columns.begin() + explode_column_idx, + std::make_unique( + data_type(type_to_id()), sliced_child.size(), pos.release())); + } return std::make_unique
(std::move(columns)); } @@ -120,6 +138,7 @@ std::unique_ptr
explode_functor::operator()( */ std::unique_ptr
explode(table_view const& input_table, size_type explode_column_idx, + bool include_pos, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -127,6 +146,7 @@ std::unique_ptr
explode(table_view const& input_table, explode_functor{}, input_table, explode_column_idx, + include_pos, stream, mr); } @@ -141,7 +161,18 @@ std::unique_ptr
explode(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr); + return detail::explode(input_table, explode_column_idx, false, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::explode_position(input_table,explode_column_idx,rmm::mr::device_memory_resource) + */ +std::unique_ptr
explode_position(table_view const& input_table, + size_type explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::explode(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp index b2db3251889..5f3237ce46d 100644 --- a/cpp/tests/reshape/explode_tests.cpp +++ b/cpp/tests/reshape/explode_tests.cpp @@ -47,6 +47,13 @@ TEST_F(ExplodeTest, Empty) cudf::table_view expected({expected_a, expected_b}); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + auto pos_ret = cudf::explode_position(t, 0); + + fixed_width_column_wrapper expected_c{}; + cudf::table_view pos_expected({expected_a, expected_b, expected_c}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, NonList) @@ -57,6 +64,7 @@ TEST_F(ExplodeTest, NonList) cudf::table_view t({a, b}); EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); + EXPECT_THROW(cudf::explode_position(t, 1), cudf::logic_error); } TEST_F(ExplodeTest, Basics) @@ -85,6 +93,12 @@ TEST_F(ExplodeTest, Basics) auto ret = cudf::explode(t, 1); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c}); + + auto pos_ret = cudf::explode_position(t, 1); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, SingleNull) @@ -116,6 +130,12 @@ TEST_F(ExplodeTest, SingleNull) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, Nulls) @@ -147,6 +167,12 @@ 
TEST_F(ExplodeTest, Nulls) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, NullsInList) @@ -178,6 +204,12 @@ TEST_F(ExplodeTest, NullsInList) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, Nested) @@ -214,6 +246,12 @@ TEST_F(ExplodeTest, Nested) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2, 3}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, NestedNulls) @@ -253,6 +291,12 @@ TEST_F(ExplodeTest, NestedNulls) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, NullsInNested) @@ -290,6 +334,12 @@ TEST_F(ExplodeTest, NullsInNested) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto 
pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, NullsInNestedDoubleExplode) @@ -322,10 +372,16 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode) cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); - auto ret = cudf::explode(t, 0); - ret = cudf::explode(ret->view(), 0); + auto first_explode_ret = cudf::explode(t, 0); + auto ret = cudf::explode(first_explode_ret->view(), 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(first_explode_ret->view(), 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, NestedStructs) @@ -367,6 +423,12 @@ TEST_F(ExplodeTest, NestedStructs) auto ret = cudf::explode(t, 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TYPED_TEST(ExplodeTypedTest, ListOfStructs) @@ -406,6 +468,12 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) cudf::table_view expected({expected_a->view(), expected_b}); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b}); + + auto pos_ret = cudf::explode_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } TEST_F(ExplodeTest, SlicedList) @@ -453,4 +521,10 @@ TEST_F(ExplodeTest, SlicedList) auto ret = cudf::explode(sliced_t[0], 0); CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + fixed_width_column_wrapper 
expected_pos_col{0, 1, 2, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_position(sliced_t[0], 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); }