From 1c525883f2ee9213c3e466c271b7dd47abca84fe Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Thu, 21 Nov 2019 20:36:44 -0800 Subject: [PATCH 01/14] Add scatter_to_tables API --- cpp/include/cudf/copying.hpp | 40 +++++++++++++++++++++++++++ cpp/include/cudf/detail/scatter.hpp | 42 +++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index fbacd4984d7..c46a7b89a6f 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -129,6 +129,46 @@ std::unique_ptr scatter( table_view const& target, bool check_bounds = false, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); +/** + * @brief Scatters the rows of a table to `n` tables according to a scatter map + * + * Copies the rows from the input table to new tables according to the table + * indices given by scatter map. The number of output tables is one more than + * the maximum value in `scatter_map`. + * + * If a value in [0, n] does not appear in scatter_map, then the corresponding + * output table will be empty. + * + * `scatter_map` is a non-nullable column of integers whose `size` + * equals `input.num_rows()` and contains numbers in range of [0, n]. + * + * @throw cudf::logic_error when scatter map is a non-integer type + * @throw cudf::logic_error when scatter map is larger than input + * @throw cudf::logic_error when scatter map has nulls + * + * Example: + * input: [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28}, + * { 1, 2, 3, 4, null, 0, 2, 4, 6, 2}] + * scatter_map: { 3, 4, 3, 1, 4, 4, 0, 1, 1, 1} + * output: {[{22}, {2}], + * [{16, 24, 26, 28}, {4, 4, 6, 2}], + * [{}, {}], + * [{10, 14}, {1, 3}], + * [{12, 18, 20}, {2, null, 0}]} + * + * @param input Table whose rows will be partitioned into a set of + * tables according to `scatter_map` + * @param scatter_map Non-nullable column of integer values that map + * each row in `input` table into one of the output tables + * @param mr The resource to use for all allocations + * + * @return A vector of tables containing the scattered rows of `input`. + * `table` `i` contains all rows `j` from `input` where `scatter_map[j] == i`. + */ +std::vector> scatter_to_tables( + table_view const& input, column_view const& scatter_map, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + /** ---------------------------------------------------------------------------* * @brief Indicates when to allocate a mask, based on an existing mask. * ---------------------------------------------------------------------------**/ diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 0d0f7571015..30d33d48ba2 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -105,6 +105,48 @@ std::unique_ptr
scatter( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), cudaStream_t stream = 0); +/** + * @brief Scatters the rows of a table to `n` tables according to a scatter map + * + * Copies the rows from the input table to new tables according to the table + * indices given by scatter map. The number of output tables is one more than + * the maximum value in `scatter_map`. + * + * If a value in [0, n] does not appear in scatter_map, then the corresponding + * output table will be empty. + * + * `scatter_map` is a non-nullable column of integers whose `size` + * equals `input.num_rows()` and contains numbers in range of [0, n]. + * + * @throw cudf::logic_error when scatter map is a non-integer type + * @throw cudf::logic_error when scatter map is larger than input + * @throw cudf::logic_error when scatter map has nulls + * + * Example: + * input: [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28}, + * { 1, 2, 3, 4, null, 0, 2, 4, 6, 2}] + * scatter_map: { 3, 4, 3, 1, 4, 4, 0, 1, 1, 1} + * output: {[{22}, {2}], + * [{16, 24, 26, 28}, {4, 4, 6, 2}], + * [{}, {}], + * [{10, 14}, {1, 3}], + * [{12, 18, 20}, {2, null, 0}]} + * + * @param input Table whose rows will be partitioned into a set of + * tables according to `scatter_map` + * @param scatter_map Non-nullable column of integer values that map + * each row in `input` table into one of the output tables + * @param mr The resource to use for all allocations + * @param stream The stream to use for CUDA operations + * + * @return A vector of tables containing the scattered rows of `input`. + * `table` `i` contains all rows `j` from `input` where `scatter_map[j] == i`. + */ +std::vector> scatter_to_tables( + table_view const& input, column_view const& scatter_map, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0); + } // namespace detail } // namespace experimental } // namespace cudf From c4db7827864d97d15d0430f551f4c27de40318a3 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Thu, 21 Nov 2019 21:09:50 -0800 Subject: [PATCH 02/14] Add dispatch for scatter_map type --- cpp/src/copying/scatter.cu | 44 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index d768f4d7dc8..f7ea2a9b6bc 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -309,6 +309,26 @@ struct scatter_scalar_impl { } }; +struct scatter_to_tables_impl { + template ::value + and not std::is_same::value>* = nullptr> + std::vector> operator()( + table_view const& input, column_view const& scatter_map, + rmm::mr::device_memory_resource* mr, cudaStream_t stream) + { + return std::vector>{empty_like(input, stream)}; + } + + template ::value + or std::is_same::value>* = nullptr> + std::vector> operator()( + table_view const& input, column_view const& scatter_map, + rmm::mr::device_memory_resource* mr, cudaStream_t stream) + { + CUDF_FAIL("Scatter index column must be an integral, non-boolean type"); + } +}; + } // namespace std::unique_ptr
scatter( @@ -359,6 +379,23 @@ std::unique_ptr
scatter( indices, target, check_bounds, mr, stream); } +std::vector> scatter_to_tables( + table_view const& input, column_view const& scatter_map, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), + cudaStream_t stream = 0) +{ + CUDF_EXPECTS(scatter_map.size() <= input.num_rows(), "scatter map larger than input"); + CUDF_EXPECTS(scatter_map.has_nulls() == false, "scatter map contains nulls"); + + if (scatter_map.size() == 0 || input.num_rows() == 0) { + return std::vector>{empty_like(input, stream)}; + } + + // First dispatch for scatter index type + return type_dispatcher(scatter_map.type(), scatter_to_tables_impl{}, + input, scatter_map, mr, stream); +} + } // namespace detail std::unique_ptr
scatter( @@ -377,5 +414,12 @@ std::unique_ptr
scatter( return detail::scatter(source, indices, target, check_bounds, mr); } +std::vector> scatter_to_tables( + table_view const& input, column_view const& scatter_map, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()) +{ + return detail::scatter_to_tables(input, scatter_map, mr); +} + } // namespace experimental } // namespace cudf From b1a55662d87d56bc232b79531826f06dd3627f4f Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Fri, 22 Nov 2019 15:11:27 -0800 Subject: [PATCH 03/14] Rename scatter_map to partition_map --- cpp/include/cudf/copying.hpp | 31 +++++++++++++---------------- cpp/include/cudf/detail/scatter.hpp | 31 +++++++++++++---------------- cpp/src/copying/scatter.cu | 22 ++++++++++---------- 3 files changed, 39 insertions(+), 45 deletions(-) diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index c46a7b89a6f..d12cecc8bc4 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -130,26 +130,23 @@ std::unique_ptr
scatter( rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** - * @brief Scatters the rows of a table to `n` tables according to a scatter map + * @brief Scatters the rows of a table to `n` tables according to a partition map * * Copies the rows from the input table to new tables according to the table - * indices given by scatter map. The number of output tables is one more than - * the maximum value in `scatter_map`. + * indices given by partition_map. The number of output tables is one more than + * the maximum value in `partition_map`. * - * If a value in [0, n] does not appear in scatter_map, then the corresponding + * If a value in [0, n] does not appear in partition_map, then the corresponding * output table will be empty. * - * `scatter_map` is a non-nullable column of integers whose `size` - * equals `input.num_rows()` and contains numbers in range of [0, n]. - * - * @throw cudf::logic_error when scatter map is a non-integer type - * @throw cudf::logic_error when scatter map is larger than input - * @throw cudf::logic_error when scatter map has nulls + * @throw cudf::logic_error when partition_map is a non-integer type + * @throw cudf::logic_error when partition_map is larger than input + * @throw cudf::logic_error when partition_map has nulls * * Example: - * input: [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28}, - * { 1, 2, 3, 4, null, 0, 2, 4, 6, 2}] - * scatter_map: { 3, 4, 3, 1, 4, 4, 0, 1, 1, 1} + * input: [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28}, + * { 1, 2, 3, 4, null, 0, 2, 4, 6, 2}] + * partition_map: {3, 4, 3, 1, 4, 4, 0, 1, 1, 1} * output: {[{22}, {2}], * [{16, 24, 26, 28}, {4, 4, 6, 2}], * [{}, {}], @@ -157,16 +154,16 @@ std::unique_ptr
scatter( * [{12, 18, 20}, {2, null, 0}]} * * @param input Table whose rows will be partitioned into a set of - * tables according to `scatter_map` - * @param scatter_map Non-nullable column of integer values that map + * tables according to `partition_map` + * @param partition_map Non-null column of integer values that map * each row in `input` table into one of the output tables * @param mr The resource to use for all allocations * * @return A vector of tables containing the scattered rows of `input`. - * `table` `i` contains all rows `j` from `input` where `scatter_map[j] == i`. + * `table` `i` contains all rows `j` from `input` where `partition_map[j] == i`. */ std::vector> scatter_to_tables( - table_view const& input, column_view const& scatter_map, + table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); /** ---------------------------------------------------------------------------* diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 30d33d48ba2..282e666f420 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -106,26 +106,23 @@ std::unique_ptr
scatter( cudaStream_t stream = 0); /** - * @brief Scatters the rows of a table to `n` tables according to a scatter map + * @brief Scatters the rows of a table to `n` tables according to a partition map * * Copies the rows from the input table to new tables according to the table - * indices given by scatter map. The number of output tables is one more than - * the maximum value in `scatter_map`. + * indices given by partition_map. The number of output tables is one more than + * the maximum value in `partition_map`. * - * If a value in [0, n] does not appear in scatter_map, then the corresponding + * If a value in [0, n] does not appear in partition_map, then the corresponding * output table will be empty. * - * `scatter_map` is a non-nullable column of integers whose `size` - * equals `input.num_rows()` and contains numbers in range of [0, n]. - * - * @throw cudf::logic_error when scatter map is a non-integer type - * @throw cudf::logic_error when scatter map is larger than input - * @throw cudf::logic_error when scatter map has nulls + * @throw cudf::logic_error when partition_map is a non-integer type + * @throw cudf::logic_error when partition_map is larger than input + * @throw cudf::logic_error when partition_map has nulls * * Example: - * input: [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28}, - * { 1, 2, 3, 4, null, 0, 2, 4, 6, 2}] - * scatter_map: { 3, 4, 3, 1, 4, 4, 0, 1, 1, 1} + * input: [{10, 12, 14, 16, 18, 20, 22, 24, 26, 28}, + * { 1, 2, 3, 4, null, 0, 2, 4, 6, 2}] + * partition_map: {3, 4, 3, 1, 4, 4, 0, 1, 1, 1} * output: {[{22}, {2}], * [{16, 24, 26, 28}, {4, 4, 6, 2}], * [{}, {}], @@ -133,17 +130,17 @@ std::unique_ptr
scatter( * [{12, 18, 20}, {2, null, 0}]} * * @param input Table whose rows will be partitioned into a set of - * tables according to `scatter_map` - * @param scatter_map Non-nullable column of integer values that map + * tables according to `partition_map` + * @param partition_map Non-null column of integer values that map * each row in `input` table into one of the output tables * @param mr The resource to use for all allocations * @param stream The stream to use for CUDA operations * * @return A vector of tables containing the scattered rows of `input`. - * `table` `i` contains all rows `j` from `input` where `scatter_map[j] == i`. + * `table` `i` contains all rows `j` from `input` where `partition_map[j] == i`. */ std::vector> scatter_to_tables( - table_view const& input, column_view const& scatter_map, + table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), cudaStream_t stream = 0); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 2c71366e610..5e92de8cc4c 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -310,7 +310,7 @@ struct scatter_to_tables_impl { template ::value and not std::is_same::value>* = nullptr> std::vector> operator()( - table_view const& input, column_view const& scatter_map, + table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { return std::vector>{empty_like(input, stream)}; @@ -319,10 +319,10 @@ struct scatter_to_tables_impl { template ::value or std::is_same::value>* = nullptr> std::vector> operator()( - table_view const& input, column_view const& scatter_map, + table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - CUDF_FAIL("Scatter index column must be an integral, non-boolean type"); + CUDF_FAIL("Partition map column must be an integral, non-boolean type"); } }; @@ -377,20 +377,20 @@ std::unique_ptr
scatter( } std::vector> scatter_to_tables( - table_view const& input, column_view const& scatter_map, + table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), cudaStream_t stream = 0) { - CUDF_EXPECTS(scatter_map.size() <= input.num_rows(), "scatter map larger than input"); - CUDF_EXPECTS(scatter_map.has_nulls() == false, "scatter map contains nulls"); + CUDF_EXPECTS(partition_map.size() <= input.num_rows(), "scatter map larger than input"); + CUDF_EXPECTS(partition_map.has_nulls() == false, "scatter map contains nulls"); - if (scatter_map.size() == 0 || input.num_rows() == 0) { + if (partition_map.size() == 0 || input.num_rows() == 0) { return std::vector>{empty_like(input, stream)}; } // First dispatch for scatter index type - return type_dispatcher(scatter_map.type(), scatter_to_tables_impl{}, - input, scatter_map, mr, stream); + return type_dispatcher(partition_map.type(), scatter_to_tables_impl{}, + input, partition_map, mr, stream); } } // namespace detail @@ -412,10 +412,10 @@ std::unique_ptr
scatter( } std::vector> scatter_to_tables( - table_view const& input, column_view const& scatter_map, + table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()) { - return detail::scatter_to_tables(input, scatter_map, mr); + return detail::scatter_to_tables(input, partition_map, mr); } } // namespace experimental From 366d2dd8ba3fcb903b3544bd7e28b5c5b7756be2 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Fri, 22 Nov 2019 17:32:17 -0800 Subject: [PATCH 04/14] Add initial scatter_to_tables implementation --- cpp/src/copying/scatter.cu | 57 ++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 5e92de8cc4c..c4d998e9cac 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -313,7 +314,53 @@ struct scatter_to_tables_impl { table_view const& input, column_view const& partition_map, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return std::vector>{empty_like(input, stream)}; + // Make a mutable copy of the partition map + auto d_partitions = rmm::device_vector( + partition_map.begin(), partition_map.end()); + + // Initialize gather maps and offsets to sequence + auto d_gather_maps = rmm::device_vector(partition_map.size()); + auto d_offsets = rmm::device_vector(partition_map.size()); + thrust::sequence(rmm::exec_policy(stream)->on(stream), + d_gather_maps.begin(), d_gather_maps.end()); + thrust::sequence(rmm::exec_policy(stream)->on(stream), + d_offsets.begin(), d_offsets.end()); + + // Sort sequence using partition map as key to generate gather maps + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + d_partitions.begin(), d_partitions.end(), d_gather_maps.begin()); + + // Reduce unique partitions to extract gather map offsets from sequence + auto end = thrust::unique_by_key(rmm::exec_policy(stream)->on(stream), + d_partitions.begin(), d_partitions.end(), d_offsets.begin()); + + // Copy partition indices and gather map offsets to host + auto partitions = thrust::host_vector(d_partitions.begin(), end.first); + auto offsets = thrust::host_vector(d_offsets.begin(), end.second); + offsets.push_back(partition_map.size()); + + CUDF_EXPECTS(partitions.front() >= 0, "Invalid negative partition index"); + auto output = std::vector>(partitions.back() + 1); + + size_t next_partition = 0; + for (size_t index = 0; index < partitions.size(); ++index) { + auto const partition = static_cast(partitions[index]); + + // Create empty tables for unused partitions + for (; next_partition < partition; ++next_partition) { + output[next_partition] = empty_like(input, stream); + } + + // Gather input rows for the current partition + auto const data = d_gather_maps.data().get() + offsets[index]; + auto const size = offsets[index + 1] - offsets[index]; + auto const gather_map = column_view(data_type(INT32), size, data); + output[partition] = gather(input, gather_map, false, false, false, mr, stream); + + next_partition = partition + 1; + } + + return output; } template ::value @@ -378,14 +425,14 @@ std::unique_ptr
scatter( std::vector> scatter_to_tables( table_view const& input, column_view const& partition_map, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(), - cudaStream_t stream = 0) + rmm::mr::device_memory_resource* mr, + cudaStream_t stream) { CUDF_EXPECTS(partition_map.size() <= input.num_rows(), "scatter map larger than input"); CUDF_EXPECTS(partition_map.has_nulls() == false, "scatter map contains nulls"); if (partition_map.size() == 0 || input.num_rows() == 0) { - return std::vector>{empty_like(input, stream)}; + return std::vector>{}; } // First dispatch for scatter index type @@ -413,7 +460,7 @@ std::unique_ptr
scatter( std::vector> scatter_to_tables( table_view const& input, column_view const& partition_map, - rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()) + rmm::mr::device_memory_resource* mr) { return detail::scatter_to_tables(input, partition_map, mr); } From aa944da463bc2fa8fe82cce57f8795a2ecd738f3 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Fri, 22 Nov 2019 20:40:51 -0800 Subject: [PATCH 05/14] Add scatter_to_tables functionality test --- cpp/tests/CMakeLists.txt | 1 + cpp/tests/copying/scatter_to_tables_tests.cu | 148 +++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 cpp/tests/copying/scatter_to_tables_tests.cu diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 0845eea34e8..90199410227 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -431,6 +431,7 @@ set(COPYING_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/copying/utility_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/gather_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/scatter_tests.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/copying/scatter_to_tables_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/copy_range_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/slice_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/split_tests.cu" diff --git a/cpp/tests/copying/scatter_to_tables_tests.cu b/cpp/tests/copying/scatter_to_tables_tests.cu new file mode 100644 index 00000000000..4c1b1a5d488 --- /dev/null +++ b/cpp/tests/copying/scatter_to_tables_tests.cu @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include + +using cudf::test::fixed_width_column_wrapper; +using cudf::test::strings_column_wrapper; +using cudf::test::expect_tables_equal; + +// Return vector of gather maps per partition +template +auto make_gather_maps(std::vector const& partition_map) +{ + auto const max = *std::max_element(partition_map.begin(), partition_map.end()); + std::vector> gather_maps(max + 1); + for (size_t i = 0; i < partition_map.size(); ++i) { + auto const partition = static_cast(partition_map[i]); + gather_maps.at(partition).push_back(i); + } + return gather_maps; +} + +// Return vector of columns per partition +template +auto gather_fixed_width(std::vector const& values, + std::vector> const& gather_maps) +{ + std::vector> columns(gather_maps.size()); + + std::transform(gather_maps.begin(), gather_maps.end(), columns.begin(), + [&values](auto const& gather_map) { + auto gather_iter = thrust::make_permutation_iterator( + values.begin(), gather_map.begin()); + return fixed_width_column_wrapper(gather_iter, + gather_iter + gather_map.size()); + }); + + return columns; +} + +// Return vector of columns per partition +template +auto gather_strings(std::vector const& strings, + std::vector> const& gather_maps) +{ + // No default constructor so reserve and push_back + std::vector columns; + columns.reserve(gather_maps.size()); + + for (auto const& gather_map : gather_maps) { + auto gather_iter = thrust::make_permutation_iterator( + strings.begin(), gather_map.begin()); + if (nullable) { + auto valid_iter = thrust::make_transform_iterator(gather_iter, + [](char const* ptr) { return ptr != nullptr; }); + columns.push_back(strings_column_wrapper(gather_iter, + gather_iter + gather_map.size(), valid_iter)); + } else { + columns.push_back(strings_column_wrapper(gather_iter, + gather_iter + gather_map.size())); + } + }; + + return columns; +} + +// Transform vector of column wrappers to vector of column views +template +auto make_view_vector(std::vector const& columns) +{ + std::vector views(columns.size()); + std::transform(columns.begin(), columns.end(), views.begin(), + [](auto const& col) { return static_cast(col); }); + return views; +} + +// Splice vector of partitioned columns into vector of tables +auto make_table_view_vector(std::vector> const& partitions) { + auto const num_cols = partitions.size(); + auto const num_parts = partitions.front().size(); + + // No default constructor so reserve and push_back + std::vector views; + views.reserve(num_parts); + + std::vector cols(num_cols); + for (size_t i_part = 0; i_part < num_parts; ++i_part) { + for (size_t i_col = 0; i_col < num_cols; ++i_col) { + cols.at(i_col) = partitions.at(i_col).at(i_part); + } + views.push_back(cudf::table_view(cols)); + } + + return views; +} + +class ScatterToTablesUntyped : public cudf::test::BaseFixture {}; + +TEST_F(ScatterToTablesUntyped, Functionality) +{ + auto floats = std::vector({1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); + auto integers = std::vector({1, 2, 3, 4, 5, 6}); + auto strings = std::vector({"a", "bb", "ccc", "d", "ee", "fff"}); + auto partition_map = std::vector({3, 1, 1, 4, 1, 3}); + + // Assemble input table + auto floats_in = fixed_width_column_wrapper(floats.begin(), floats.end()); + auto integers_in = fixed_width_column_wrapper(integers.begin(), integers.end()); + auto strings_in = strings_column_wrapper(strings.begin(), strings.end()); + auto input = cudf::table_view({floats_in, integers_in, strings_in}); + + auto const partition_col = fixed_width_column_wrapper( + partition_map.begin(), partition_map.end()); + + // Compute expected tables + auto gather_maps = make_gather_maps(partition_map); + auto floats_cols = gather_fixed_width(floats, gather_maps); + auto integers_cols = gather_fixed_width(integers, gather_maps); + auto strings_cols = gather_strings(strings, gather_maps); + + auto floats_views = make_view_vector(floats_cols); + auto integers_views = make_view_vector(integers_cols); + auto strings_views = make_view_vector(strings_cols); + auto expected = make_table_view_vector({floats_views, integers_views, strings_views}); + + auto result = cudf::experimental::scatter_to_tables(input, partition_col); + EXPECT_EQ(expected.size(), result.size()); + for (size_t i = 0; i < result.size(); ++i) { + expect_tables_equal(expected[i], result[i]->view()); + } +} From 18fa52f8648ce7d09eba29f9b2bd0e2561f0670c Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Fri, 22 Nov 2019 21:05:15 -0800 Subject: [PATCH 06/14] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d5c8858822..65bf5a80001 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -145,6 +145,7 @@ - PR #3425 Strings column copy_if_else implementation - PR #3422 Move utilities to legacy - PR #3201 Define and implement new datetime_ops APIs +- PR #3448 Port scatter_to_tables to libcudf++ ## Bug Fixes From acc4b3e81e7b72eaaae6f18c7a5ecf9b769a0c44 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Fri, 22 Nov 2019 21:14:10 -0800 Subject: [PATCH 07/14] Disable num_children check in expect_column_properties_equal --- cpp/tests/utilities/column_utilities.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 20892ab72ec..d2642fe2922 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -42,7 +42,7 @@ void expect_column_properties_equal(cudf::column_view lhs, cudf::column_view rhs EXPECT_EQ(lhs.nullable(), rhs.nullable()); } EXPECT_EQ(lhs.has_nulls(), rhs.has_nulls()); - EXPECT_EQ(lhs.num_children(), rhs.num_children()); + //EXPECT_EQ(lhs.num_children(), rhs.num_children()); } class corresponding_rows_unequal { From a525172688a7e49288264ab6e95ec144babf5ea9 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Tue, 26 Nov 2019 10:28:25 -0800 Subject: [PATCH 08/14] Apply suggestions from code review Co-Authored-By: Mark Harris --- cpp/include/cudf/copying.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index d12cecc8bc4..6c31ebc9272 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -136,7 +136,7 @@ std::unique_ptr
scatter( * indices given by partition_map. The number of output tables is one more than * the maximum value in `partition_map`. * - * If a value in [0, n] does not appear in partition_map, then the corresponding + * Output table `i` in [0, n] is empty if `i` does not appear in partition_map. * output table will be empty. * * @throw cudf::logic_error when partition_map is a non-integer type @@ -153,7 +153,7 @@ std::unique_ptr
scatter( * [{10, 14}, {1, 3}], * [{12, 18, 20}, {2, null, 0}]} * - * @param input Table whose rows will be partitioned into a set of + * @param input Table of rows to be partitioned into a set of tables * tables according to `partition_map` * @param partition_map Non-null column of integer values that map * each row in `input` table into one of the output tables From 60029d8e0795e3aac4e10239299dcbed953fb1a6 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Tue, 26 Nov 2019 10:31:46 -0800 Subject: [PATCH 09/14] Apply doc suggestions to detail header --- cpp/include/cudf/detail/scatter.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 282e666f420..1779c18e731 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -112,7 +112,7 @@ std::unique_ptr
scatter( * indices given by partition_map. The number of output tables is one more than * the maximum value in `partition_map`. * - * If a value in [0, n] does not appear in partition_map, then the corresponding + * Output table `i` in [0, n] is empty if `i` does not appear in partition_map. * output table will be empty. * * @throw cudf::logic_error when partition_map is a non-integer type @@ -129,7 +129,7 @@ std::unique_ptr
scatter( * [{10, 14}, {1, 3}], * [{12, 18, 20}, {2, null, 0}]} * - * @param input Table whose rows will be partitioned into a set of + * @param input Table of rows to be partitioned into a set of tables * tables according to `partition_map` * @param partition_map Non-null column of integer values that map * each row in `input` table into one of the output tables From 0c537de304cbcaa9ad0d12755130f8109cc0ea00 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Tue, 26 Nov 2019 10:54:53 -0800 Subject: [PATCH 10/14] Remove child allocation from empty_like --- cpp/src/copying/copy.cpp | 12 +++--------- cpp/tests/copying/utility_tests.cu | 15 ++++----------- cpp/tests/utilities/column_utilities.cu | 2 +- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index da1c12f7f43..f37839a0010 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -45,14 +45,8 @@ inline mask_state should_allocate_mask(mask_allocation_policy mask_alloc, bool m */ std::unique_ptr empty_like(column_view const& input, cudaStream_t stream) { - std::vector> children {}; - children.reserve(input.num_children()); - for (size_type index = 0; index < input.num_children(); index++) { - children.emplace_back(empty_like(input.child(index), stream)); - } - return std::make_unique(input.type(), 0, rmm::device_buffer {}, - rmm::device_buffer {}, 0, std::move(children)); + rmm::device_buffer {}, 0); } /* @@ -63,7 +57,7 @@ std::unique_ptr allocate_like(column_view const& input, size_type size, mask_allocation_policy mask_alloc, rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + cudaStream_t stream) { CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable()); @@ -108,7 +102,7 @@ std::unique_ptr allocate_like(column_view const& input, } std::unique_ptr allocate_like(column_view const& input, - size_type size, + size_type size, mask_allocation_policy mask_alloc, rmm::mr::device_memory_resource *mr) { return detail::allocate_like(input, size, mask_alloc, mr); diff --git a/cpp/tests/copying/utility_tests.cu b/cpp/tests/copying/utility_tests.cu index 74b35fdfeef..58feec3a662 100644 --- a/cpp/tests/copying/utility_tests.cu +++ b/cpp/tests/copying/utility_tests.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include template @@ -80,16 +81,6 @@ rmm::device_vector> create_test_string return d_strings; } -void check_empty_string_columns(cudf::column_view lhs, cudf::column_view rhs) -{ - EXPECT_EQ(lhs.type(), rhs.type()); - EXPECT_EQ(lhs.size(), 0); - EXPECT_EQ(lhs.null_count(), 0); - EXPECT_EQ(lhs.nullable(), false); - EXPECT_EQ(lhs.has_nulls(), false); - EXPECT_EQ(lhs.num_children(), rhs.num_children()); -} - TEST_F(EmptyLikeStringTest, ColumnStringTest) { rmm::device_vector> d_strings = create_test_string(); @@ -97,7 +88,9 @@ TEST_F(EmptyLikeStringTest, ColumnStringTest) { auto got = cudf::experimental::empty_like(column->view()); - check_empty_string_columns(got->view(), column->view()); + auto expected = cudf::strings::detail::make_empty_strings_column(); + + cudf::test::expect_columns_equal(got->view(), expected->view()); } std::unique_ptr create_table (cudf::size_type size, cudf::mask_state state){ diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index d2642fe2922..20892ab72ec 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -42,7 +42,7 @@ void expect_column_properties_equal(cudf::column_view lhs, cudf::column_view rhs EXPECT_EQ(lhs.nullable(), rhs.nullable()); } EXPECT_EQ(lhs.has_nulls(), rhs.has_nulls()); - //EXPECT_EQ(lhs.num_children(), rhs.num_children()); + EXPECT_EQ(lhs.num_children(), rhs.num_children()); } class corresponding_rows_unequal { From ca90554bf690cc0eab17a3121a23dfacf0b5cef4 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Tue, 26 Nov 2019 11:02:27 -0800 Subject: [PATCH 11/14] scatter_to_tables_tests cu -> cpp --- cpp/tests/CMakeLists.txt | 2 +- .../{scatter_to_tables_tests.cu => scatter_to_tables_tests.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/tests/copying/{scatter_to_tables_tests.cu => scatter_to_tables_tests.cpp} (100%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 0548a95b75c..cec72070c8b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -439,7 +439,7 @@ set(COPYING_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/copying/utility_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/gather_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/scatter_tests.cu" - "${CMAKE_CURRENT_SOURCE_DIR}/copying/scatter_to_tables_tests.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/copying/scatter_to_tables_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/copying/copy_range_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/slice_tests.cu" "${CMAKE_CURRENT_SOURCE_DIR}/copying/split_tests.cu" diff --git a/cpp/tests/copying/scatter_to_tables_tests.cu b/cpp/tests/copying/scatter_to_tables_tests.cpp similarity index 100% rename from cpp/tests/copying/scatter_to_tables_tests.cu rename to cpp/tests/copying/scatter_to_tables_tests.cpp From 45802121595c8564413ab25a22dfd0469173ecf4 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Tue, 26 Nov 2019 11:35:59 -0800 Subject: [PATCH 12/14] Replace slow thrust::all_of with count_if --- cpp/src/copying/scatter.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index c4d998e9cac..f5013569f47 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -137,7 +137,8 @@ struct scatter_impl { auto const begin = -target.num_rows(); auto const end = target.num_rows(); auto bounds = bounds_checker{begin, end}; - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream)->on(stream), + CUDF_EXPECTS(scatter_map.size() == thrust::count_if( + rmm::exec_policy(stream)->on(stream), scatter_map.begin(), scatter_map.end(), bounds), "Scatter map index out of bounds"); } @@ -272,7 +273,8 @@ struct scatter_scalar_impl { auto const begin = -target.num_rows(); auto const end = target.num_rows(); auto bounds = bounds_checker{begin, end}; - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream)->on(stream), + CUDF_EXPECTS(indices.size() == thrust::count_if( + rmm::exec_policy(stream)->on(stream), indices.begin(), indices.end(), bounds), "Scatter map index out of bounds"); } @@ -351,7 +353,7 @@ struct scatter_to_tables_impl { output[next_partition] = empty_like(input, stream); } - // Gather input rows for the current partition + // Gather input rows for the current partition (second dispatch for column types) auto const data = d_gather_maps.data().get() + offsets[index]; auto const size = offsets[index + 1] - offsets[index]; auto const gather_map = column_view(data_type(INT32), size, data); From 3b33016a32edffa8efaa98dc046ac932f628d247 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Wed, 27 Nov 2019 12:13:41 -0800 Subject: [PATCH 13/14] Remove declaration of undefined detail::empty_like --- cpp/include/cudf/detail/copy.hpp | 12 ------------ cpp/src/copying/scatter.cu | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 4edee96dff5..7b05158e382 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -81,18 +81,6 @@ std::unique_ptr allocate_like(column_view const& input, size_type size, rmm::mr::get_default_resource(), cudaStream_t stream = 0); -/** - * @brief Creates a table of empty columns with the same types as the `input_table` - * - * Creates the `cudf::column` objects, but does not allocate any underlying device - * memory for the column's data or bitmask. - * - * @param[in] input_table Immutable view of input table to emulate - * @param[in] stream Optional CUDA stream on which to execute kernels - * @return std::unique_ptr
A table of empty columns with the same types as the columns in `input_table` - */ -std::unique_ptr
empty_like(table_view const& input_table, cudaStream_t stream = 0); - /** * @brief Returns a new column, where each element is selected from either @p lhs or * @p rhs based on the value of the corresponding element in @p boolean_mask diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index f5013569f47..8874a90e47e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -350,7 +350,7 @@ struct scatter_to_tables_impl { // Create empty tables for unused partitions for (; next_partition < partition; ++next_partition) { - output[next_partition] = empty_like(input, stream); + output[next_partition] = empty_like(input); } // Gather input rows for the current partition (second dispatch for column types) From 44887ffcd6ff7bbbe24de60d840e19a380da0672 Mon Sep 17 00:00:00 2001 From: Trevor Smith Date: Wed, 27 Nov 2019 13:13:55 -0800 Subject: [PATCH 14/14] Add additional scatter_to_tables tests --- cpp/tests/copying/scatter_to_tables_tests.cpp | 159 +++++++++++++++++- 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/cpp/tests/copying/scatter_to_tables_tests.cpp b/cpp/tests/copying/scatter_to_tables_tests.cpp index 4c1b1a5d488..d71e8b98c5a 100644 --- a/cpp/tests/copying/scatter_to_tables_tests.cpp +++ b/cpp/tests/copying/scatter_to_tables_tests.cpp @@ -20,10 +20,14 @@ #include #include +#include + using cudf::test::fixed_width_column_wrapper; using cudf::test::strings_column_wrapper; using cudf::test::expect_tables_equal; +namespace { + // Return vector of gather maps per partition template auto make_gather_maps(std::vector const& partition_map) @@ -55,6 +59,26 @@ auto gather_fixed_width(std::vector const& values, return columns; } +// Return vector of columns per partition +template +auto gather_fixed_width(std::vector const& values, ValidIterator valid_iterator, + std::vector> const& gather_maps) +{ + std::vector> columns(gather_maps.size()); + + std::transform(gather_maps.begin(), gather_maps.end(), columns.begin(), + [&values, &valid_iterator](auto const& gather_map) { + auto gather_iter = thrust::make_permutation_iterator( + values.begin(), gather_map.begin()); + auto valid_permuted = thrust::make_permutation_iterator( + valid_iterator, gather_map.begin()); + return fixed_width_column_wrapper(gather_iter, + gather_iter + gather_map.size(), valid_permuted); + }); + + return columns; +} + // Return vector of columns per partition template auto gather_strings(std::vector const& strings, @@ -111,14 +135,76 @@ auto make_table_view_vector(std::vector> const& p return views; } -class ScatterToTablesUntyped : public cudf::test::BaseFixture {}; +} // namespace + +template +class ScatterToTablesInvalidIndexTypes : public cudf::test::BaseFixture {}; + +// NOTE string types hit static assert in fixed_width_column_wrapper +using InvalidIndexTypes = cudf::test::Concat< + cudf::test::Types, + cudf::test::TimestampTypes>; +TYPED_TEST_CASE(ScatterToTablesInvalidIndexTypes, InvalidIndexTypes); + +TYPED_TEST(ScatterToTablesInvalidIndexTypes, InvalidPartitionMap) +{ + auto integers = std::vector({1, 2, 3, 4, 5, 6}); + auto partition_map = std::vector({3, 1, 1, 4, 1, 3}); + + // Assemble input table + auto integers_in = fixed_width_column_wrapper(integers.begin(), integers.end()); + auto input = cudf::table_view({integers_in}); + + auto const partition_col = fixed_width_column_wrapper( + partition_map.begin(), partition_map.end()); + + EXPECT_THROW(cudf::experimental::scatter_to_tables(input, partition_col), cudf::logic_error); +} + +template +class ScatterToTablesIndexTypes : public cudf::test::BaseFixture {}; + +using IndexTypes = cudf::test::Types; +TYPED_TEST_CASE(ScatterToTablesIndexTypes, IndexTypes); + +TYPED_TEST(ScatterToTablesIndexTypes, MapTooManyRows) +{ + auto integers = std::vector({1, 2, 3, 4, 5, 6}); + auto partition_map = std::vector({3, 1, 1, 4, 1, 3, 4}); // one row too many + + // Assemble input table + auto integers_in = fixed_width_column_wrapper(integers.begin(), integers.end()); + auto input = cudf::table_view({integers_in}); + + auto const partition_col = fixed_width_column_wrapper( + partition_map.begin(), partition_map.end()); + + EXPECT_THROW(cudf::experimental::scatter_to_tables(input, partition_col), cudf::logic_error); +} + +TYPED_TEST(ScatterToTablesIndexTypes, MapHasNulls) +{ + auto integers = std::vector({1, 2, 3, 4, 5, 6}); + auto partition_map = std::vector({3, 1, 1, 4, 1, 3}); + + // Assemble input table + auto integers_in = fixed_width_column_wrapper(integers.begin(), integers.end()); + auto input = cudf::table_view({integers_in}); + + // Add nulls to partition column + auto nulls_iterator = thrust::make_constant_iterator(0); + auto const partition_col = fixed_width_column_wrapper( + partition_map.begin(), partition_map.end(), nulls_iterator); + + EXPECT_THROW(cudf::experimental::scatter_to_tables(input, partition_col), cudf::logic_error); +} -TEST_F(ScatterToTablesUntyped, Functionality) +TYPED_TEST(ScatterToTablesIndexTypes, MultipleTypes) { auto floats = std::vector({1.f, 2.f, 3.f, 4.f, 5.f, 6.f}); auto integers = std::vector({1, 2, 3, 4, 5, 6}); auto strings = std::vector({"a", "bb", "ccc", "d", "ee", "fff"}); - auto partition_map = std::vector({3, 1, 1, 4, 1, 3}); + auto partition_map = std::vector({3, 1, 1, 4}); // shorter than input is ok // Assemble input table auto floats_in = fixed_width_column_wrapper(floats.begin(), floats.end()); @@ -126,7 +212,7 @@ TEST_F(ScatterToTablesUntyped, Functionality) auto strings_in = strings_column_wrapper(strings.begin(), strings.end()); auto input = cudf::table_view({floats_in, integers_in, strings_in}); - auto const partition_col = fixed_width_column_wrapper( + auto const partition_col = fixed_width_column_wrapper( partition_map.begin(), partition_map.end()); // Compute expected tables @@ -146,3 +232,68 @@ TEST_F(ScatterToTablesUntyped, Functionality) expect_tables_equal(expected[i], result[i]->view()); } } + +TYPED_TEST(ScatterToTablesIndexTypes, StringsNulls) +{ + auto strings = std::vector({"a", "bb", nullptr, "d", nullptr, "fff"}); + auto partition_map = std::vector({3, 1, 1, 4, 1, 3}); + + // Assemble input table + auto strings_in = strings_column_wrapper(strings.begin(), strings.end(), + thrust::make_transform_iterator(strings.begin(), [](auto str) { return str != nullptr; })); + auto input = cudf::table_view({strings_in}); + + auto const partition_col = fixed_width_column_wrapper( + partition_map.begin(), partition_map.end()); + + // Compute expected tables + auto gather_maps = make_gather_maps(partition_map); + auto strings_cols = gather_strings(strings, gather_maps); + + auto strings_views = make_view_vector(strings_cols); + auto expected = make_table_view_vector({strings_views}); + + auto result = cudf::experimental::scatter_to_tables(input, partition_col); + EXPECT_EQ(expected.size(), result.size()); + for (size_t i = 0; i < result.size(); ++i) { + expect_tables_equal(expected[i], result[i]->view()); + } +} + +template +class ScatterToTablesFixedWidth : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(ScatterToTablesFixedWidth, cudf::test::FixedWidthTypes); + +TYPED_TEST(ScatterToTablesFixedWidth, MixedNulls) +{ + auto first = std::vector({1, 2, 3, 4, 5, 6}); + auto second = std::vector({7, 8, 9, 10, 11, 12}); + auto partition_map = std::vector({3, 1, 1, 4, 1, 3}); + + auto valid_iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [](auto index) { return index % 3 > 0; }); + + // Assemble input table + auto first_in = fixed_width_column_wrapper(first.begin(), first.end()); + auto second_in = fixed_width_column_wrapper(second.begin(), second.end(), valid_iter); + auto input = cudf::table_view({first_in, second_in}); + + auto const partition_col = fixed_width_column_wrapper( + partition_map.begin(), partition_map.end()); + + // Compute expected tables + auto gather_maps = make_gather_maps(partition_map); + auto first_cols = gather_fixed_width(first, gather_maps); + auto second_cols = gather_fixed_width(second, valid_iter, gather_maps); + + auto first_views = make_view_vector(first_cols); + auto second_views = make_view_vector(second_cols); + auto expected = make_table_view_vector({first_views, second_views}); + + auto result = cudf::experimental::scatter_to_tables(input, partition_col); + EXPECT_EQ(expected.size(), result.size()); + for (size_t i = 0; i < result.size(); ++i) { + expect_tables_equal(expected[i], result[i]->view()); + } +}