From 36fa5f313a31c06aba94d69779d90b6a1128ead4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 4 Jan 2022 15:09:43 -0700 Subject: [PATCH] Implement per-list sequence (#9839) This PR adds the `lists::sequences` API, which generates per-list sequences. In particular, it allows generating a lists column in which each list is a sequence of numbers/durations. These sequences are generated individually from separate sets of (start, step, size) input values. Closes #9424. Note: `lists::sequences` supports only numeric types (integer types + floating-point types) and duration types. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - https://github.com/nvdbaranec - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9839 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/filling.hpp | 8 +- cpp/include/cudf/lists/filling.hpp | 105 ++++++++++++ cpp/src/lists/sequences.cu | 225 +++++++++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/lists/sequences_tests.cpp | 251 ++++++++++++++++++++++++++++ 6 files changed, 587 insertions(+), 4 deletions(-) create mode 100644 cpp/include/cudf/lists/filling.hpp create mode 100644 cpp/src/lists/sequences.cu create mode 100644 cpp/tests/lists/sequences_tests.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86ec24c1b7b..624293ad87c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -343,6 +343,7 @@ add_library( src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/segmented_sort.cu + src/lists/sequences.cu src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index aff0d20a467..905a897eb40 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -169,7 +169,7 @@ std::unique_ptr repeat( * @param init First value in the sequence * @param step Increment value * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result table containing the sequence + * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, @@ -195,7 +195,7 @@ std::unique_ptr sequence( * @param size Size of the output column * @param init First value in the sequence * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result table containing the sequence + * @return The result column containing the generated sequence */ std::unique_ptr sequence( size_type size, @@ -223,7 +223,7 @@ std::unique_ptr sequence( * @param months Months to increment * @param mr Device memory resource used to allocate the returned column's device memory * - * @returns Timestamps column with sequences of months. + * @return Timestamps column with sequences of months. */ std::unique_ptr calendrical_month_sequence( size_type size, diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp new file mode 100644 index 00000000000..74a4dac1e10 --- /dev/null +++ b/cpp/include/cudf/lists/filling.hpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf::lists { +/** + * @addtogroup lists_filling + * @{ + * @file + * @brief Column APIs for individual list sequence + */ + +/** + * @brief Create a lists column in which each row contains a sequence of values specified by a tuple + * of (`start`, `size`) parameters. + * + * Create a lists column in which each row is a sequence of values starting from a `start` value, + * incrementing by one, and its cardinality is specified by a `size` value. The `start` and `size` + * values used to generate each list are taken from the corresponding row of the input @p starts and + * @p sizes columns. + * + * - @p sizes must be a column of integer types. + * - All the input columns must not have nulls. + * - If any row of the @p sizes column contains a negative value, the output is undefined. + * + * @code{.pseudo} + * starts = [0, 1, 2, 3, 4] + * sizes = [0, 2, 2, 1, 3] + * + * output = [ [], [1, 2], [2, 3], [3], [4, 5, 6] ] + * @endcode + * + * @throws cudf::logic_error if @p sizes column is not of integer types. + * @throws cudf::logic_error if any input column has nulls. + * @throws cudf::logic_error if @p starts and @p sizes columns do not have the same size. + * + * @param starts First values in the result sequences. + * @param sizes Numbers of values in the result sequences. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The result column containing generated sequences. + */ +std::unique_ptr sequences( + column_view const& starts, + column_view const& sizes, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a lists column in which each row contains a sequence of values specified by a tuple + * of (`start`, `step`, `size`) parameters.
+ * + * Create a lists column in which each row is a sequence of values starting from a `start` value, + * incrementing by a `step` value, and its cardinality is specified by a `size` value. The values + * `start`, `step`, and `size` used to generate each list are taken from the corresponding row of the + * input @p starts, @p steps, and @p sizes columns. + * + * - @p sizes must be a column of integer types. + * - @p starts and @p steps columns must have the same type. + * - All the input columns must not have nulls. + * - If any row of the @p sizes column contains a negative value, the output is undefined. + * + * @code{.pseudo} + * starts = [0, 1, 2, 3, 4] + * steps = [2, 1, 1, 1, -3] + * sizes = [0, 2, 2, 1, 3] + * + * output = [ [], [1, 2], [2, 3], [3], [4, 1, -2] ] + * @endcode + * + * @throws cudf::logic_error if @p sizes column is not of integer types. + * @throws cudf::logic_error if any input column has nulls. + * @throws cudf::logic_error if @p starts and @p steps columns have different types. + * @throws cudf::logic_error if @p starts, @p steps, and @p sizes columns do not have the same size. + * + * @param starts First values in the result sequences. + * @param steps Increment values for the result sequences. + * @param sizes Numbers of values in the result sequences. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The result column containing generated sequences. + */ +std::unique_ptr sequences( + column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace cudf::lists diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu new file mode 100644 index 00000000000..5007918441b --- /dev/null +++ b/cpp/src/lists/sequences.cu @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf::lists { +namespace detail { +namespace { +/** + * @brief Functor computing the value of each element of the child column of the output. + * + * For the element at position `idx` of the child column, the list containing it is located by a + * binary search on the list offsets and the value is `start + step * (idx - list_offset)`. A null + * `steps` pointer means that every list uses a step of one. + */ +template +struct tabulator { + size_type const n_lists; + size_type const n_elements; + + T const* const starts; + T const* const steps; + offset_type const* const offsets; + + // Overload that multiplies the step directly by the position within the list. + template + static std::enable_if_t(), T> __device__ multiply(U x, size_type times) + { + return x * static_cast(times); + } + + // Overload for steps exposing a tick count through `count()`: the tick count is multiplied. + template + static std::enable_if_t(), T> __device__ multiply(U x, size_type times) + { + return T{x.count() * times}; + } + + auto __device__ operator()(size_type idx) const + { + // The list containing `idx` is the last one whose offset is not larger than `idx`. Empty lists + // share their offset with the following list, so taking the upper bound skips them. + auto const list_idx_end = thrust::upper_bound(thrust::seq, offsets, offsets + n_lists, idx); + auto const list_idx = thrust::distance(offsets, list_idx_end) - 1; + auto const list_offset = offsets[list_idx]; + auto const list_step = steps ? steps[list_idx] : T{1}; + return starts[list_idx] + multiply(list_step, idx - list_offset); + } +}; + +// Primary template, used for element types without a dedicated specialization: always throws. +template +struct sequences_functor { + template + static std::unique_ptr invoke(Args&&...)
+ { + CUDF_FAIL("Unsupported data type for per-list sequences."); + } +}; + +// Type-dispatched entry point forwarding all its arguments to `sequences_functor::invoke`. +struct sequences_dispatcher { + template + std::unique_ptr operator()(size_type n_lists, + size_type n_elements, + column_view const& starts, + std::optional const& steps, + offset_type const* offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return sequences_functor::invoke(n_lists, n_elements, starts, steps, offsets, stream, mr); + } +}; + +// Types accepted for sequence generation: non-boolean numeric types and duration types. +template +static constexpr bool is_supported() +{ + return (cudf::is_numeric() && !cudf::is_boolean()) || cudf::is_duration(); +} + +// Specialization generating the child column (all the sequence values) of the output lists column. +template +struct sequences_functor()>> { + static std::unique_ptr invoke(size_type n_lists, + size_type n_elements, + column_view const& starts, + std::optional const& steps, + offset_type const* offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto result = + make_fixed_width_column(starts.type(), n_elements, mask_state::UNALLOCATED, stream, mr); + if (starts.is_empty()) { return result; } + + auto const result_begin = result->mutable_view().template begin(); + + // Raw pointers are sufficient to access the start and step values (no column_device_view is + // needed) because the inputs have no nulls and only numeric and duration types are supported. + // A null `steps_begin` tells the tabulator to use a step of one. + auto const starts_begin = starts.template begin(); + auto const steps_begin = steps ?
steps.value().template begin() : nullptr; + + auto const op = tabulator{n_lists, n_elements, starts_begin, steps_begin, offsets}; + thrust::tabulate(rmm::exec_policy(stream), result_begin, result_begin + n_elements, op); + + return result; + } +}; + +// Create a lists column with zero rows whose child column has the given type. +std::unique_ptr make_empty_lists_column(data_type child_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto offsets = make_empty_column(data_type(type_to_id())); + auto child = make_empty_column(child_type); + return make_lists_column( + 0, std::move(offsets), std::move(child), 0, rmm::device_buffer(0, stream, mr), stream, mr); +} + +/** + * @brief Validate the inputs and generate a lists column of per-list sequences. + * + * @throws cudf::logic_error if any input column has nulls, if the input columns differ in size, if + * @p sizes is not of an index type, or if @p steps is given with a type different from @p starts. + * + * @param starts First values of the sequences. + * @param steps Optional increments of the sequences; a step of one is used when absent. + * @param sizes Numbers of values in the sequences. + * @param stream CUDA stream on which the device work is issued. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The lists column containing the generated sequences. + */ +std::unique_ptr sequences(column_view const& starts, + std::optional const& steps, + column_view const& sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!starts.has_nulls() && !sizes.has_nulls(), + "starts and sizes input columns must not have nulls."); + CUDF_EXPECTS(starts.size() == sizes.size(), + "starts and sizes input columns must have the same number of rows."); + CUDF_EXPECTS(cudf::is_index_type(sizes.type()), "Input sizes column must be of integer types."); + + if (steps) { + auto const& steps_cv = steps.value(); + CUDF_EXPECTS(!steps_cv.has_nulls(), "steps input column must not have nulls."); + CUDF_EXPECTS(starts.size() == steps_cv.size(), + "starts and steps input columns must have the same number of rows."); + CUDF_EXPECTS(starts.type() == steps_cv.type(), + "starts and steps input columns must have the same type."); + } + + auto const n_lists = starts.size(); + if (n_lists == 0) { return make_empty_lists_column(starts.type(), stream, mr); } + + // Generate list offsets for the output. + // The first offset is zero and the remaining `n_lists` offsets are the inclusive prefix sums of + // `sizes`, so exactly `n_lists` values are read from `sizes` (an exclusive scan over + // `n_lists + 1` inputs would read one element past the end of the `sizes` column).
+ auto list_offsets = make_numeric_column( + data_type(type_to_id()), n_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const offsets_begin = list_offsets->mutable_view().template begin(); + auto const sizes_input_it = cudf::detail::indexalator_factory::make_input_iterator(sizes); + + CUDA_TRY(cudaMemsetAsync(offsets_begin, 0, sizeof(offset_type), stream.value())); + thrust::inclusive_scan( + rmm::exec_policy(stream), sizes_input_it, sizes_input_it + n_lists, offsets_begin + 1); + // The last offset is the total number of elements of the child column. + auto const n_elements = cudf::detail::get_value(list_offsets->view(), n_lists, stream); + + auto child = type_dispatcher(starts.type(), + sequences_dispatcher{}, + n_lists, + n_elements, + starts, + steps, + offsets_begin, + stream, + mr); + + return make_lists_column(n_lists, + std::move(list_offsets), + std::move(child), + 0, + rmm::device_buffer(0, stream, mr), + stream, + mr); +} + +} // anonymous namespace + +// Overload without a steps column: every list uses a step of one. +std::unique_ptr sequences(column_view const& starts, + column_view const& sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sequences(starts, std::nullopt, sizes, stream, mr); +} + +// Overload taking the step of each list from the corresponding row of `steps`. +std::unique_ptr sequences(column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return sequences(starts, std::optional{steps}, sizes, stream, mr); +} + +} // namespace detail + +// Public API: runs the detail implementation on the default stream. +std::unique_ptr sequences(column_view const& starts, + column_view const& sizes, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sequences(starts, sizes, rmm::cuda_stream_default, mr); +} + +// Public API: runs the detail implementation on the default stream. +std::unique_ptr sequences(column_view const& starts, + column_view const& steps, + column_view const& sizes, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sequences(starts, steps, sizes, rmm::cuda_stream_default, mr); +} + +} // namespace cudf::lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c1c209b2413..d90260400a0 100644 --- a/cpp/tests/CMakeLists.txt +++
b/cpp/tests/CMakeLists.txt @@ -442,6 +442,7 @@ ConfigureTest( lists/drop_list_duplicates_tests.cpp lists/explode_tests.cpp lists/extract_tests.cpp + lists/sequences_tests.cpp lists/sort_lists_tests.cpp ) diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp new file mode 100644 index 00000000000..2dafeaf5cea --- /dev/null +++ b/cpp/tests/lists/sequences_tests.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +using namespace cudf::test::iterators; + +/* Short aliases for the column wrappers used to build the inputs and the expected results. */ +namespace { +template +using ListsCol = cudf::test::lists_column_wrapper; + +template +using FWDCol = cudf::test::fixed_width_column_wrapper; + +using IntsCol = cudf::test::fixed_width_column_wrapper; +} // namespace + +/*-----------------------------------------------------------------------------------------------*/ +template +class NumericSequencesTypedTest : public cudf::test::BaseFixture { +}; +using NumericTypes = + cudf::test::Concat; +TYPED_TEST_SUITE(NumericSequencesTypedTest, NumericTypes); + +/* Checks the generated lists for non-zero sizes, without a steps column and with one. */ +TYPED_TEST(NumericSequencesTypedTest, SimpleTestNoNull) +{ + using T = TypeParam; + + auto const starts = FWDCol{1, 2, 3}; + auto const sizes = IntsCol{5, 3, 4}; + + // Sequences with step == 1.
+ { + auto const expected = + ListsCol{ListsCol{1, 2, 3, 4, 5}, ListsCol{2, 3, 4}, ListsCol{3, 4, 5, 6}}; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps. + { + auto const steps = FWDCol{1, 3, 2}; + auto const expected = + ListsCol{ListsCol{1, 2, 3, 4, 5}, ListsCol{2, 5, 8}, ListsCol{3, 5, 7, 9}}; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/* Rows whose size is zero are expected to produce empty lists. */ +TYPED_TEST(NumericSequencesTypedTest, ZeroSizesTest) +{ + using T = TypeParam; + + auto const starts = FWDCol{1, 2, 3}; + auto const sizes = IntsCol{0, 3, 0}; + + // Sequences with step == 1. + { + auto const expected = ListsCol{ListsCol{}, ListsCol{2, 3, 4}, ListsCol{}}; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps. + { + auto const steps = FWDCol{1, 3, 2}; + auto const expected = ListsCol{ListsCol{}, ListsCol{2, 5, 8}, ListsCol{}}; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/* The inputs are slices of larger columns; the `dont_care` rows lie outside of the slices. */ +TYPED_TEST(NumericSequencesTypedTest, SlicedInputTestNoNulls) +{ + using T = TypeParam; + constexpr int32_t dont_care{123}; + + auto const starts_original = + FWDCol{dont_care, dont_care, dont_care, 1, 2, 3, 4, 5, dont_care, dont_care}; + auto const sizes_original = IntsCol{dont_care, 5, 3, 4, 1, 2, dont_care, dont_care}; + + auto const starts = cudf::slice(starts_original, {3, 8})[0]; + auto const sizes = cudf::slice(sizes_original, {1, 6})[0]; + + // Sequences with step == 1. + { + auto const expected = ListsCol{ListsCol{1, 2, 3, 4, 5}, + ListsCol{2, 3, 4}, + ListsCol{3, 4, 5, 6}, + ListsCol{4}, + ListsCol{5, 6} + + }; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps.
+ { + auto const steps_original = FWDCol{dont_care, dont_care, 1, 3, 2, 2, 3, dont_care}; + auto const steps = cudf::slice(steps_original, {2, 7})[0]; + + auto const expected = ListsCol{ListsCol{1, 2, 3, 4, 5}, + ListsCol{2, 5, 8}, + ListsCol{3, 5, 7, 9}, + ListsCol{4}, + ListsCol{5, 8} + + }; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/*-----------------------------------------------------------------------------------------------*/ +// Data generated using https://www.epochconverter.com/ +template +class DurationSequencesTypedTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_SUITE(DurationSequencesTypedTest, cudf::test::DurationTypes); + +// Start time is 1638477473L - Thursday, December 2, 2021 8:37:53 PM. +constexpr int64_t start_time = 1638477473L; + +/* Checks sequences of durations, without a steps column and with steps that include negatives. */ +TYPED_TEST(DurationSequencesTypedTest, SequencesNoNull) +{ + using T = TypeParam; + + auto const starts = FWDCol{start_time, start_time, start_time}; + auto const sizes = IntsCol{1, 2, 3}; + + // Sequences with step == 1. + { + auto const expected_h = std::vector{start_time, start_time + 1L, start_time + 2L}; + auto const expected = + ListsCol{ListsCol{expected_h.begin(), expected_h.begin() + 1}, + ListsCol{expected_h.begin(), expected_h.begin() + 2}, + ListsCol{expected_h.begin(), expected_h.begin() + 3}}; + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with various steps, including negative.
+ { + auto const steps = FWDCol{10L, -155L, -13L}; + auto const expected = ListsCol{ + ListsCol{start_time}, + ListsCol{start_time, start_time - 155L}, + ListsCol{start_time, start_time - 13L, start_time - 13L * 2L}}; + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/*-----------------------------------------------------------------------------------------------*/ +class NumericSequencesTest : public cudf::test::BaseFixture { +}; + +/* Empty input columns are expected to produce an empty lists column. */ +TEST_F(NumericSequencesTest, EmptyInput) +{ + auto const starts = IntsCol{}; + auto const sizes = IntsCol{}; + auto const steps = IntsCol{}; + auto const expected = ListsCol{}; + + // Sequences with step == 1. + { + auto const result = cudf::lists::sequences(starts, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Sequences with given steps. + { + auto const result = cudf::lists::sequences(starts, steps, sizes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +/* `sizes` is built with FWDCol instead of IntsCol; both overloads are expected to throw. */ +TEST_F(NumericSequencesTest, InvalidSizesInput) +{ + auto const starts = IntsCol{}; + auto const steps = IntsCol{}; + auto const sizes = FWDCol{}; + + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); +} + +/* Input columns with different numbers of rows are expected to be rejected. */ +TEST_F(NumericSequencesTest, MismatchedColumnSizesInput) +{ + auto const starts = IntsCol{1, 2, 3}; + auto const steps = IntsCol{1, 2}; + auto const sizes = IntsCol{1, 2, 3, 4}; + + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); +} + +/* `steps` is built with FWDCol while `starts` uses IntsCol; the call is expected to throw. */ +TEST_F(NumericSequencesTest, MismatchedColumnTypesInput) +{ + auto const starts = IntsCol{1, 2, 3}; + auto const steps = FWDCol{1, 2, 3}; + auto const sizes = IntsCol{1, 2, 3}; + + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); +} + +/* A null in any of the starts, sizes or steps columns is expected to be rejected. */ +TEST_F(NumericSequencesTest, InputHasNulls) +{ + constexpr
int32_t null{0}; + + { + auto const starts = IntsCol{{null, 2, 3}, null_at(0)}; + auto const sizes = IntsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + } + + { + auto const starts = IntsCol{1, 2, 3}; + auto const sizes = IntsCol{{null, 2, 3}, null_at(0)}; + EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error); + } + + { + auto const starts = IntsCol{1, 2, 3}; + auto const steps = IntsCol{{null, 2, 3}, null_at(0)}; + auto const sizes = IntsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error); + } +}