From 1a8c59dbba8073824d94d562dd3681879f879b8f Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 13 Jan 2021 17:24:51 +0000 Subject: [PATCH 01/10] First pass at explode --- cpp/include/cudf/reshape.hpp | 29 ++++++ cpp/src/reshape/explode.cu | 140 ++++++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/reshape/explode_tests.cpp | 112 ++++++++++++++++++++++ 4 files changed, 282 insertions(+) create mode 100644 cpp/src/reshape/explode.cu create mode 100644 cpp/tests/reshape/explode_tests.cpp diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 4561554a0f5..f01d0756357 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -97,6 +97,35 @@ std::unique_ptr byte_cast( flip_endianness endian_configuration, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Explodes a list column's elements. + * + * Any list is exploded into individual rows and + * the other rows are duplicated. + * ``` + * [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * returns + * [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300], + * ``` + * + * @param input_table Table to explode. + * @param explode_column_idx Column index to explode inside the table. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @return A new table with explode_col exploded. + */ +std::unique_ptr explode( + table_view const& input_table, + int const explode_column_idx, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu new file mode 100644 index 00000000000..10a617c4e28 --- /dev/null +++ b/cpp/src/reshape/explode.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include "thrust/iterator/counting_iterator.h" +#include "thrust/iterator/transform_iterator.h" + +namespace cudf { +namespace detail { +namespace { +struct explode_functor { + /** + * @brief Function object for exploding a column. + */ + template + std::enable_if_t::value, std::unique_ptr
> operator()( + table_view const& input_table, + int const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + CUDF_FAIL("Unsupported non-list column"); + } + + /** + * @brief Function object for exploding a column. + */ + template + std::enable_if_t::value, std::unique_ptr
> operator()( + table_view const& input_table, + int const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + /* we explode by building a gather map that includes the number of entries in each list inside the column for each index. Interestingly, this + can be done with lower_bound across the offsets as values between the offsets will all map down to the index below. We have some off-by-one + manipulations we need to do with the output, but it's almost our gather map by itself. Once we build the gather map we need to remove + the explode column from the table and run gather on it. Next we build the explode column, which turns out is simply lifting + the child column out of the explode column. This unrolls the top level of lists. Then we need to insert the explode column back + into the table and return it. */ + lists_column_view lc{input_table.column(explode_column_idx)}; + thrust::device_vector gather_map_indices(lc.child().size()); + auto offsets = lc.offsets(); + + auto offsets_minus_one = thrust::make_transform_iterator( + offsets.begin(), [] __device__(auto i) { return i - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + offsets.size(), + counting_iter, + counting_iter + gather_map_indices.size(), + gather_map_indices.begin()); + + auto select_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [explode_column_idx](int i) { return i >= explode_column_idx ? i + 1 : i; }); + std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); + + auto gather_map_iter = thrust::make_transform_iterator(gather_map_indices.begin(), + [] __device__(int i) { return i - 1; }); + + auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), + gather_map_iter, + gather_map_iter + gather_map_indices.size(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + std::vector> columns = gathered_table.release()->release(); + + columns.insert(columns.begin() + explode_column_idx, + std::make_unique(column(lc.child()))); + + return std::make_unique
(std::move(columns)); + } +}; +} // namespace + +/** + * @copydoc + * cudf::explode(input_table,explode_column,column,flip_endianess,rmm::mr::device_memory_resource) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
explode(table_view const& input_table, + int const explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher(input_table.column(explode_column_idx).type(), + explode_functor{}, + input_table, + explode_column_idx, + stream, + mr); +} + +} // namespace detail + +/** + * @copydoc cudf::explode(input_table,explode_column,flip_endianess,rmm::mr::device_memory_resource) + */ +std::unique_ptr
explode(table_view const& input_table, + int const explode_column_idx, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2592b5e4221..0c817d52744 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -515,6 +515,7 @@ ConfigureTest(SEARCH_TEST "${SEARCH_TEST_SRC}") set(RESHAPE_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reshape/byte_cast_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/reshape/explode_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/reshape/interleave_columns_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/reshape/tile_tests.cpp") diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp new file mode 100644 index 00000000000..f2cbdea45d2 --- /dev/null +++ b/cpp/tests/reshape/explode_tests.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +using namespace cudf::test; + +class ExplodeTest : public cudf::test::BaseFixture { +}; + +TEST_F(ExplodeTest, Empty) +{ + lists_column_wrapper a{}; + fixed_width_column_wrapper b{}; + + cudf::table_view t({a, b}); + + auto ret = cudf::explode(t, 0); + + fixed_width_column_wrapper expected_a{}; + fixed_width_column_wrapper expected_b{}; + cudf::table_view expected({expected_a, expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NonList) +{ + fixed_width_column_wrapper a{100, 200, 300}; + fixed_width_column_wrapper b{100, 200, 300}; + + cudf::table_view t({a, b}); + + EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error); +} + +TEST_F(ExplodeTest, Basics) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + lists_column_wrapper a{lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}; + fixed_width_column_wrapper b{100, 200, 300}; + + fixed_width_column_wrapper expected_a{1, 2, 7, 5, 6, 0, 3}; + fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, Nested) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + lists_column_wrapper a{ + lists_column_wrapper{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}}; + fixed_width_column_wrapper b{100, 200, 300}; + + lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} From 69acc159cedf00177ad9e4419565ef60fe972c95 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 14 Jan 2021 20:08:01 +0000 Subject: [PATCH 02/10] Updating based on review comments, adding more tests, and adding null support --- cpp/include/cudf/reshape.hpp | 6 +- cpp/src/reshape/explode.cu | 135 +++++++++--------- cpp/tests/reshape/explode_tests.cpp | 208 +++++++++++++++++++++++++++- 3 files changed, 276 insertions(+), 73 deletions(-) diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index f01d0756357..d89589abc54 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -100,8 +100,8 @@ std::unique_ptr byte_cast( /** * @brief Explodes a list column's elements. * - * Any list is exploded into individual rows and - * the other rows are duplicated. + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. Example: * ``` * [[5,10,15], 100], * [[20,25], 200], @@ -123,7 +123,7 @@ std::unique_ptr byte_cast( */ std::unique_ptr
explode( table_view const& input_table, - int const explode_column_idx, + size_type explode_column_idx, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index 10a617c4e28..008a6d1bd44 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -26,93 +26,94 @@ #include #include +#include +#include #include #include -#include "thrust/iterator/counting_iterator.h" -#include "thrust/iterator/transform_iterator.h" namespace cudf { namespace detail { namespace { +/** + * @brief Function object for exploding a column. + */ struct explode_functor { - /** - * @brief Function object for exploding a column. - */ template - std::enable_if_t::value, std::unique_ptr
> operator()( - table_view const& input_table, - int const explode_column_idx, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + std::unique_ptr
operator()(table_view const& input_table, + size_type explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("Unsupported non-list column"); - } - /** - * @brief Function object for exploding a column. - */ - template - std::enable_if_t::value, std::unique_ptr
> operator()( - table_view const& input_table, - int const explode_column_idx, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - /* we explode by building a gather map that includes the number of entries in each list inside the column for each index. Interestingly, this - can be done with lower_bound across the offsets as values between the offsets will all map down to the index below. We have some off-by-one - manipulations we need to do with the output, but it's almost our gather map by itself. Once we build the gather map we need to remove - the explode column from the table and run gather on it. Next we build the explode column, which turns out is simply lifting - the child column out of the explode column. This unrolls the top level of lists. Then we need to insert the explode column back - into the table and return it. */ - lists_column_view lc{input_table.column(explode_column_idx)}; - thrust::device_vector gather_map_indices(lc.child().size()); - auto offsets = lc.offsets(); - - auto offsets_minus_one = thrust::make_transform_iterator( - offsets.begin(), [] __device__(auto i) { return i - 1; }); - auto counting_iter = thrust::make_counting_iterator(0); - - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + offsets.size(), - counting_iter, - counting_iter + gather_map_indices.size(), - gather_map_indices.begin()); - - auto select_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [explode_column_idx](int i) { return i >= explode_column_idx ? i + 1 : i; }); - std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); - - auto gather_map_iter = thrust::make_transform_iterator(gather_map_indices.begin(), - [] __device__(int i) { return i - 1; }); - - auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), - gather_map_iter, - gather_map_iter + gather_map_indices.size(), - cudf::out_of_bounds_policy::DONT_CHECK, - stream, - mr); - - std::vector> columns = gathered_table.release()->release(); - - columns.insert(columns.begin() + explode_column_idx, - std::make_unique(column(lc.child()))); - - return std::make_unique
(std::move(columns)); + return std::make_unique
(); } }; + +template <> +std::unique_ptr
explode_functor::operator()( + table_view const& input_table, + size_type explode_column_idx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + /* we explode by building a gather map that includes the number of entries in each list inside + the column for each index. Interestingly, this can be done with lower_bound across the offsets + as values between the offsets will all map down to the index below. We have some off-by-one + manipulations we need to do with the output, but it's almost our gather map by itself. Once we + build the gather map we need to remove the explode column from the table and run gather on it. + Next we build the explode column, which turns out is simply lifting the child column out of the + explode column. This unrolls the top level of lists. Then we need to insert the explode column + back into the table and return it. */ + lists_column_view lc{input_table.column(explode_column_idx)}; + // rmm::device_uvector gather_map_indices(lc.child().size(), stream, mr); + rmm::device_vector gather_map_indices(lc.child().size()); + auto offsets = lc.offsets(); + + auto offsets_minus_one = thrust::make_transform_iterator(offsets.begin(), + [] __device__(auto i) { return i - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); + + thrust::lower_bound(rmm::exec_policy(stream), + offsets_minus_one, + offsets_minus_one + offsets.size(), + counting_iter, + counting_iter + gather_map_indices.size(), + gather_map_indices.begin()); + + auto select_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [explode_column_idx](int i) { return i >= explode_column_idx ? i + 1 : i; }); + std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); + + auto gather_map_iter = thrust::make_transform_iterator(gather_map_indices.begin(), + [] __device__(int i) { return i - 1; }); + + auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), + gather_map_iter, + gather_map_iter + gather_map_indices.size(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + std::vector> columns = gathered_table.release()->release(); + + columns.insert(columns.begin() + explode_column_idx, + std::make_unique(column(lc.child(), stream, mr))); + + return std::make_unique
(std::move(columns)); +} } // namespace /** * @copydoc - * cudf::explode(input_table,explode_column,column,flip_endianess,rmm::mr::device_memory_resource) + * cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
explode(table_view const& input_table, - int const explode_column_idx, + size_type explode_column_idx, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -127,10 +128,10 @@ std::unique_ptr
explode(table_view const& input_table, } // namespace detail /** - * @copydoc cudf::explode(input_table,explode_column,flip_endianess,rmm::mr::device_memory_resource) + * @copydoc cudf::explode(input_table,explode_column_idx,rmm::mr::device_memory_resource) */ std::unique_ptr
explode(table_view const& input_table, - int const explode_column_idx, + size_type explode_column_idx, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp index f2cbdea45d2..9fcc8d96403 100644 --- a/cpp/tests/reshape/explode_tests.cpp +++ b/cpp/tests/reshape/explode_tests.cpp @@ -61,13 +61,105 @@ TEST_F(ExplodeTest, Basics) [0, 3] 300 */ - lists_column_wrapper a{lists_column_wrapper{1, 2, 7}, + fixed_width_column_wrapper a{100, 200, 300}; + lists_column_wrapper b{lists_column_wrapper{1, 2, 7}, lists_column_wrapper{5, 6}, lists_column_wrapper{0, 3}}; + strings_column_wrapper c{"string0", "string1", "string2"}; + + fixed_width_column_wrapper expected_a{100, 100, 100, 200, 200, 300, 300}; + fixed_width_column_wrapper expected_b{1, 2, 7, 5, 6, 0, 3}; + strings_column_wrapper expected_c{ + "string0", "string0", "string0", "string1", "string1", "string2", "string2"}; + + cudf::table_view t({a, b, c}); + cudf::table_view expected({expected_a, expected_b, expected_c}); + + auto ret = cudf::explode(t, 1); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, SingleNull) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + auto first_invalid = + cudf::test::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); + + lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}, + first_invalid); + fixed_width_column_wrapper b({100, 200, 300}); + + fixed_width_column_wrapper expected_a{5, 6, 0, 3}; + fixed_width_column_wrapper expected_b{200, 200, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, Nulls) +{ + /* + a b + [1, 2, 7] 100 + [5, 6] 200 + [0, 3] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}}, + valids); + fixed_width_column_wrapper b({100, 200, 300}, valids); + + fixed_width_column_wrapper expected_a({1, 2, 7, 0, 3}); + fixed_width_column_wrapper expected_b({100, 100, 100, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInList) +{ + /* + a b + [1, 2, 7] 100 + [5, 6, 0, 9] 200 + [0, 3, 8] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), + lists_column_wrapper({5, 6, 0, 9}, valids), + lists_column_wrapper({0, 3, 8}, valids)}; fixed_width_column_wrapper b{100, 200, 300}; - fixed_width_column_wrapper expected_a{1, 2, 7, 5, 6, 0, 3}; - fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 300, 300}; + fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, + {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 300, 300, 300}; cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -110,3 +202,113 @@ TEST_F(ExplodeTest, Nested) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); } + +TEST_F(ExplodeTest, NestedNulls) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}}, + valids); + fixed_width_column_wrapper b({100, 200, 300}, valids); + + lists_column_wrapper expected_a{lists_column_wrapper{1, 2}, + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper{2, 1}}; + fixed_width_column_wrapper expected_b({100, 100, 300, 300, 300}, always_valid); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInNested) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}); + fixed_width_column_wrapper b({100, 200, 300}); + + lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TEST_F(ExplodeTest, NullsInNestedDoubleExplode) +{ + /* + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a{ + lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}; + fixed_width_column_wrapper b{100, 200, 300}; + + fixed_width_column_wrapper expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper expected_b{ + 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + ret = cudf::explode(ret->view(), 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} From 368e396bfd118047a157cfd65eb13243b7e6656a Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 14 Jan 2021 21:49:45 +0000 Subject: [PATCH 03/10] removing debug code and adding device_uvector back --- cpp/src/reshape/explode.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index 008a6d1bd44..8a34da3f676 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -67,8 +67,7 @@ std::unique_ptr
explode_functor::operator()( explode column. This unrolls the top level of lists. Then we need to insert the explode column back into the table and return it. */ lists_column_view lc{input_table.column(explode_column_idx)}; - // rmm::device_uvector gather_map_indices(lc.child().size(), stream, mr); - rmm::device_vector gather_map_indices(lc.child().size()); + rmm::device_uvector gather_map_indices(lc.child().size(), stream, mr); auto offsets = lc.offsets(); auto offsets_minus_one = thrust::make_transform_iterator(offsets.begin(), From 99ff76c37f98fcf810f66b0d0b47078dcbb8e0d2 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 15 Jan 2021 18:47:43 +0000 Subject: [PATCH 04/10] Adding some struct tests --- cpp/tests/reshape/explode_tests.cpp | 89 +++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp index 9fcc8d96403..133eab70524 100644 --- a/cpp/tests/reshape/explode_tests.cpp +++ b/cpp/tests/reshape/explode_tests.cpp @@ -26,6 +26,12 @@ using namespace cudf::test; class ExplodeTest : public cudf::test::BaseFixture { }; +template +class ExplodeTypedTest : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); + TEST_F(ExplodeTest, Empty) { lists_column_wrapper a{}; @@ -312,3 +318,86 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); } + +TEST_F(ExplodeTest, NestedStructs) +{ + /* + a b + [[1, 2], [7, 6, 5]] {100, "100"} + [[5, 6]] {200, "200"} + [[0, 3],[5],[2, 1]] {300, "300"} + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}}); + fixed_width_column_wrapper b1({100, 200, 300}); + strings_column_wrapper b2{"100", "200", "300"}; + structs_column_wrapper b({b1, b2}); + + lists_column_wrapper expected_a{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}, + lists_column_wrapper{5, 6}, + lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}; + fixed_width_column_wrapper expected_b1{100, 100, 200, 300, 300, 300}; + strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; + structs_column_wrapper expected_b({expected_b1, expected_b2}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(t, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} + +TYPED_TEST(ExplodeTypedTest, ListOfStructs) +{ + /* + a b + [{70, "70"}, {75, "75"}] 100 + [{50, "50"}, {55, "55"}] 200 + [{35, "35"}, {45, "45"}] 300 + [{25, "25"}, {30, "30"}] 400 + [{15, "15"}, {20, "20"}] 500 +*/ + + auto numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper string_col{"70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + auto struct_col = structs_column_wrapper{{numeric_col, string_col}}.release(); + auto a = cudf::make_lists_column(5, + fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), + std::move(struct_col), + cudf::UNKNOWN_NULL_COUNT, + {}); + + fixed_width_column_wrapper b{100, 200, 300, 400, 500}; + + cudf::test::print(a->view()); + cudf::test::print(b); + + cudf::table_view t({a->view(), b}); + auto ret = cudf::explode(t, 0); + + auto expected_numeric_col = + fixed_width_column_wrapper{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; + strings_column_wrapper expected_string_col{ + "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; + + auto expected_a = structs_column_wrapper{{expected_numeric_col, expected_string_col}}.release(); + fixed_width_column_wrapper expected_b{100, 100, 200, 200, 300, 300, 400, 400, 500, 500}; + + cudf::table_view expected({expected_a->view(), expected_b}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} From d62e031d1739c2fe35dc196f9fcce82fd02fe54e Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 15 Jan 2021 18:56:38 +0000 Subject: [PATCH 05/10] Adding documentation on nulls --- cpp/include/cudf/reshape.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index d89589abc54..6ece52cc096 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -115,6 +115,18 @@ std::unique_ptr byte_cast( * [30, 300], * ``` * + * Nulls propagate in different ways depending on what is null. + *``` + * [[5,null,15], 100], + * [null, 200] + * returns + * [5, 100], + * [null, 100], + * [15, 100], + * ``` + * Note that null lists are completely removed from the output + * and nulls inside lists are pulled out and remain. + * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. * @param mr Device memory resource used to allocate the returned column's device memory. From 7e1161208401b36b1a02a1a5e0fc730e4591040c Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Sat, 16 Jan 2021 01:09:26 +0000 Subject: [PATCH 06/10] fixing unsigned int -> size_type --- cpp/src/reshape/explode.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index 8a34da3f676..835c976c904 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -67,7 +67,7 @@ std::unique_ptr
explode_functor::operator()( explode column. This unrolls the top level of lists. Then we need to insert the explode column back into the table and return it. */ lists_column_view lc{input_table.column(explode_column_idx)}; - rmm::device_uvector gather_map_indices(lc.child().size(), stream, mr); + rmm::device_uvector gather_map_indices(lc.child().size(), stream, mr); auto offsets = lc.offsets(); auto offsets_minus_one = thrust::make_transform_iterator(offsets.begin(), From 7fc7c049b83f7906f92727cd6b6de704c2bc3923 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 19 Jan 2021 18:10:14 +0000 Subject: [PATCH 07/10] removing unused variable in tests --- cpp/tests/reshape/explode_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp index 133eab70524..ca9122bd095 100644 --- a/cpp/tests/reshape/explode_tests.cpp +++ b/cpp/tests/reshape/explode_tests.cpp @@ -156,7 +156,6 @@ TEST_F(ExplodeTest, NullsInList) auto valids = cudf::test::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - auto always_valid = cudf::test::make_counting_transform_iterator(0, [](auto i) { return true; }); lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), lists_column_wrapper({5, 6, 0, 9}, valids), From 0ae9cd62dbb3f07b266a3ff14c77f966837c7d43 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Jan 2021 06:24:45 +0000 Subject: [PATCH 08/10] Review comments, fixing sliced columns, added sliced column test --- cpp/src/reshape/explode.cu | 43 +++++++++++++----------- cpp/tests/reshape/explode_tests.cpp | 52 ++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index 835c976c904..7d60592124d 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -67,39 +68,41 @@ std::unique_ptr
explode_functor::operator()( explode column. This unrolls the top level of lists. Then we need to insert the explode column back into the table and return it. */ lists_column_view lc{input_table.column(explode_column_idx)}; - rmm::device_uvector gather_map_indices(lc.child().size(), stream, mr); - auto offsets = lc.offsets(); + auto sliced_child = lc.get_sliced_child(stream); + rmm::device_uvector gather_map_indices(sliced_child.size(), stream, mr); - auto offsets_minus_one = thrust::make_transform_iterator(offsets.begin(), - [] __device__(auto i) { return i - 1; }); - auto counting_iter = thrust::make_counting_iterator(0); + // sliced columns can make this a little tricky. We have to start iterating at the start of the + // offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets + // as we go. + auto offsets = lc.offsets().begin() + lc.offset(); + auto offsets_minus_one = thrust::make_transform_iterator( + offsets, [offsets] __device__(auto i) { return i - offsets[0] - 1; }); + auto counting_iter = thrust::make_counting_iterator(0); thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + offsets.size(), + offsets_minus_one + 1, + offsets_minus_one + lc.offsets().size(), counting_iter, counting_iter + gather_map_indices.size(), gather_map_indices.begin()); auto select_iter = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [explode_column_idx](int i) { return i >= explode_column_idx ? i + 1 : i; }); - std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); + [explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; }); + std::vector selected_columns(select_iter, select_iter + input_table.num_columns() - 1); - auto gather_map_iter = thrust::make_transform_iterator(gather_map_indices.begin(), - [] __device__(int i) { return i - 1; }); - - auto gathered_table = cudf::detail::gather(input_table.select(selected_columns), - gather_map_iter, - gather_map_iter + gather_map_indices.size(), - cudf::out_of_bounds_policy::DONT_CHECK, - stream, - mr); + auto gathered_table = cudf::detail::gather( + input_table.select(selected_columns), + column_view(data_type(type_to_id()), sliced_child.size(), gather_map_indices.data()), + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::ALLOWED, + stream, + mr); std::vector> columns = gathered_table.release()->release(); columns.insert(columns.begin() + explode_column_idx, - std::make_unique(column(lc.child(), stream, mr))); + std::make_unique(column(sliced_child, stream, mr))); return std::make_unique
(std::move(columns)); } diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp index ca9122bd095..07d65b26e0a 100644 --- a/cpp/tests/reshape/explode_tests.cpp +++ b/cpp/tests/reshape/explode_tests.cpp @@ -15,11 +15,11 @@ */ #include + #include #include -#include - #include +#include using namespace cudf::test; @@ -382,9 +382,6 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) fixed_width_column_wrapper b{100, 200, 300, 400, 500}; - cudf::test::print(a->view()); - cudf::test::print(b); - cudf::table_view t({a->view(), b}); auto ret = cudf::explode(t, 0); @@ -400,3 +397,48 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); } + +TEST_F(ExplodeTest, SlicedList) +{ + /* + a b + [[1, 2],[7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + [[8, 3],[4, 3, 1, 2]] 400 + [[2, 3, 4],[9, 8]] 500 + + slicing the top 2 rows off + */ + + auto valids = cudf::test::make_counting_transform_iterator( + 0, [](auto i) { return i % 2 == 0 ? true : false; }); + + lists_column_wrapper a( + {lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{7, 6, 5}}, + lists_column_wrapper{lists_column_wrapper{5, 6}}, + lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids)}, + lists_column_wrapper{lists_column_wrapper{8, 3}, + lists_column_wrapper({4, 3, 1, 2}, valids)}, + lists_column_wrapper{lists_column_wrapper{2, 3, 4}, + lists_column_wrapper{9, 8}}}); + fixed_width_column_wrapper b({100, 200, 300, 400, 500}); + + lists_column_wrapper expected_a{lists_column_wrapper{0, 3}, + lists_column_wrapper{5}, + lists_column_wrapper({2, 1}, valids), + lists_column_wrapper{8, 3}, + lists_column_wrapper({4, 3, 1, 2}, valids)}; + fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400}; + + cudf::table_view t({a, b}); + auto sliced_t = cudf::slice(t, {2, 4}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode(sliced_t[0], 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); +} From 898c98d77cdfc3586cdcfa466a9926a1bc92584d Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 22 Jan 2021 06:58:16 +0000 Subject: [PATCH 09/10] Adding more detail to header comments and addressing review comments --- cpp/include/cudf/reshape.hpp | 7 ++-- cpp/src/reshape/explode.cu | 4 +- cpp/tests/reshape/explode_tests.cpp | 57 +++++++++++++++++------------ 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 6ece52cc096..29c9fa2e720 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -115,17 +115,18 @@ std::unique_ptr byte_cast( * [30, 300], * ``` * - * Nulls propagate in different ways depending on what is null. + * Nulls and empty lists propagate in different ways depending on what is null or empty. *``` * [[5,null,15], 100], - * [null, 200] + * [null, 200], + * [[], 300], * returns * [5, 100], * [null, 100], * [15, 100], * ``` * Note that null lists are completely removed from the output - * and nulls inside lists are pulled out and remain. + * and nulls and empty lists inside lists are pulled out and remain. * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index 7d60592124d..e83251c790d 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -76,12 +76,12 @@ std::unique_ptr
explode_functor::operator()( // as we go. auto offsets = lc.offsets().begin() + lc.offset(); auto offsets_minus_one = thrust::make_transform_iterator( - offsets, [offsets] __device__(auto i) { return i - offsets[0] - 1; }); + offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); auto counting_iter = thrust::make_counting_iterator(0); thrust::lower_bound(rmm::exec_policy(stream), offsets_minus_one + 1, - offsets_minus_one + lc.offsets().size(), + offsets_minus_one + lc.size() + 1, counting_iter, counting_iter + gather_map_indices.size(), gather_map_indices.begin()); diff --git a/cpp/tests/reshape/explode_tests.cpp b/cpp/tests/reshape/explode_tests.cpp index 07d65b26e0a..6f98332243e 100644 --- a/cpp/tests/reshape/explode_tests.cpp +++ b/cpp/tests/reshape/explode_tests.cpp @@ -92,7 +92,8 @@ TEST_F(ExplodeTest, SingleNull) a b [1, 2, 7] 100 [5, 6] 200 - [0, 3] 300 + [] 300 + [0, 3] 400 */ auto first_invalid = @@ -100,12 +101,13 @@ TEST_F(ExplodeTest, SingleNull) lists_column_wrapper a({lists_column_wrapper{1, 2, 7}, lists_column_wrapper{5, 6}, + lists_column_wrapper{}, lists_column_wrapper{0, 3}}, first_invalid); - fixed_width_column_wrapper b({100, 200, 300}); + fixed_width_column_wrapper b({100, 200, 300, 400}); fixed_width_column_wrapper expected_a{5, 6, 0, 3}; - fixed_width_column_wrapper expected_b{200, 200, 300, 300}; + fixed_width_column_wrapper expected_b{200, 200, 400, 400}; cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -151,7 +153,8 @@ TEST_F(ExplodeTest, NullsInList) a b [1, 2, 7] 100 [5, 6, 0, 9] 200 - [0, 3, 8] 300 + [] 300 + [0, 3, 8] 400 */ auto valids = cudf::test::make_counting_transform_iterator( @@ -159,12 +162,13 @@ TEST_F(ExplodeTest, NullsInList) lists_column_wrapper a{lists_column_wrapper({1, 2, 7}, valids), lists_column_wrapper({5, 6, 0, 9}, valids), + lists_column_wrapper{}, lists_column_wrapper({0, 3, 8}, valids)}; - fixed_width_column_wrapper b{100, 200, 300}; + fixed_width_column_wrapper b{100, 200, 300, 400}; fixed_width_column_wrapper expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); - fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 300, 300, 300}; + fixed_width_column_wrapper expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -177,10 +181,10 @@ TEST_F(ExplodeTest, NullsInList) TEST_F(ExplodeTest, Nested) { /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 + a b + [[1, 2], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[],[5],[2, 1]] 300 */ lists_column_wrapper a{ @@ -188,6 +192,7 @@ TEST_F(ExplodeTest, Nested) lists_column_wrapper{7, 6, 5}}, lists_column_wrapper{lists_column_wrapper{5, 6}}, lists_column_wrapper{lists_column_wrapper{0, 3}, + lists_column_wrapper{}, lists_column_wrapper{5}, lists_column_wrapper{2, 1}}}; fixed_width_column_wrapper b{100, 200, 300}; @@ -196,9 +201,10 @@ TEST_F(ExplodeTest, Nested) lists_column_wrapper{7, 6, 5}, lists_column_wrapper{5, 6}, lists_column_wrapper{0, 3}, + lists_column_wrapper{}, lists_column_wrapper{5}, lists_column_wrapper{2, 1}}; - fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300}; + fixed_width_column_wrapper expected_b{100, 100, 200, 300, 300, 300, 300}; cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -286,10 +292,10 @@ TEST_F(ExplodeTest, NullsInNested) TEST_F(ExplodeTest, NullsInNestedDoubleExplode) { /* - a b - [[1, 2], [7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 + a b + [[1, 2], [], [7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 */ auto valids = cudf::test::make_counting_transform_iterator( @@ -297,6 +303,7 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode) lists_column_wrapper a{ lists_column_wrapper{lists_column_wrapper({1, 2}, valids), + lists_column_wrapper{}, lists_column_wrapper{7, 6, 5}}, lists_column_wrapper{lists_column_wrapper{5, 6}}, lists_column_wrapper{lists_column_wrapper{0, 3}, @@ -401,14 +408,14 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) TEST_F(ExplodeTest, SlicedList) { /* - a b - [[1, 2],[7, 6, 5]] 100 - [[5, 6]] 200 - [[0, 3],[5],[2, 1]] 300 - [[8, 3],[4, 3, 1, 2]] 400 - [[2, 3, 4],[9, 8]] 500 - - slicing the top 2 rows off + a b + [[1, 2],[7, 6, 5]] 100 + [[5, 6]] 200 + [[0, 3],[5],[2, 1]] 300 + [[8, 3],[],[4, 3, 1, 2]] 400 + [[2, 3, 4],[9, 8]] 500 + + slicing the top 2 rows and the bottom row off */ auto valids = cudf::test::make_counting_transform_iterator( @@ -422,6 +429,7 @@ TEST_F(ExplodeTest, SlicedList) lists_column_wrapper{5}, lists_column_wrapper({2, 1}, valids)}, lists_column_wrapper{lists_column_wrapper{8, 3}, + lists_column_wrapper{}, lists_column_wrapper({4, 3, 1, 2}, valids)}, lists_column_wrapper{lists_column_wrapper{2, 3, 4}, lists_column_wrapper{9, 8}}}); @@ -431,8 +439,9 @@ TEST_F(ExplodeTest, SlicedList) lists_column_wrapper{5}, lists_column_wrapper({2, 1}, valids), lists_column_wrapper{8, 3}, + lists_column_wrapper{}, lists_column_wrapper({4, 3, 1, 2}, valids)}; - fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400}; + fixed_width_column_wrapper expected_b{300, 300, 300, 400, 400, 400}; cudf::table_view t({a, b}); auto sliced_t = cudf::slice(t, {2, 4}); From fed595a5a72ced01bb4dda17850c82990de49dca Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 22 Jan 2021 16:09:55 +0000 Subject: [PATCH 10/10] Adding comment to offset_minus_one +1 line that looks like a bug --- cpp/src/reshape/explode.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/reshape/explode.cu b/cpp/src/reshape/explode.cu index e83251c790d..bc532893fb0 100644 --- a/cpp/src/reshape/explode.cu +++ b/cpp/src/reshape/explode.cu @@ -79,6 +79,10 @@ std::unique_ptr
explode_functor::operator()( offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); auto counting_iter = thrust::make_counting_iterator(0); + // This looks like an off-by-one bug, but what is going on here is that we need to reduce each + // result from `lower_bound` by 1 to build the correct gather map. It was pointed out that + // this can be accomplished by simply skipping the first entry and using the result of + // `lower_bound` directly. thrust::lower_bound(rmm::exec_policy(stream), offsets_minus_one + 1, offsets_minus_one + lc.size() + 1,