Skip to content

Commit

Permalink
Implement per-list sequence (#9839)
Browse files Browse the repository at this point in the history
This PR adds the `lists::sequences` API for generating per-list sequences. In particular, it generates a lists column in which each list is a sequence of numbers/durations. These sequences are generated individually from separate sets of (start, step, size) input values.

Closes #9424.

Note: `lists::sequences` supports only numeric types (integer types + floating-point types) and duration types.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - https://github.com/nvdbaranec
  - Karthikeyan (https://github.com/karthikeyann)

URL: #9839
  • Loading branch information
ttnghia authored Jan 4, 2022
1 parent cc4a2bd commit 36fa5f3
Show file tree
Hide file tree
Showing 6 changed files with 587 additions and 4 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ add_library(
src/lists/lists_column_factories.cu
src/lists/lists_column_view.cu
src/lists/segmented_sort.cu
src/lists/sequences.cu
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
Expand Down
8 changes: 4 additions & 4 deletions cpp/include/cudf/filling.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -169,7 +169,7 @@ std::unique_ptr<table> repeat(
* @param init First value in the sequence
* @param step Increment value
* @param mr Device memory resource used to allocate the returned column's device memory
* @return std::unique_ptr<column> The result table containing the sequence
* @return The result column containing the generated sequence
*/
std::unique_ptr<column> sequence(
size_type size,
Expand All @@ -195,7 +195,7 @@ std::unique_ptr<column> sequence(
* @param size Size of the output column
* @param init First value in the sequence
* @param mr Device memory resource used to allocate the returned column's device memory
* @return std::unique_ptr<column> The result table containing the sequence
* @return The result column containing the generated sequence
*/
std::unique_ptr<column> sequence(
size_type size,
Expand Down Expand Up @@ -223,7 +223,7 @@ std::unique_ptr<column> sequence(
* @param months Months to increment
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns Timestamps column with sequences of months.
* @return Timestamps column with sequences of months.
*/
std::unique_ptr<cudf::column> calendrical_month_sequence(
size_type size,
Expand Down
105 changes: 105 additions & 0 deletions cpp/include/cudf/lists/filling.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/types.hpp>

#include <memory>

namespace cudf::lists {
/**
* @addtogroup lists_filling
* @{
* @file
* @brief Column APIs for generating per-list sequences
*/

/**
* @brief Create a lists column in which each row contains a sequence of values specified by a tuple
* of (`start`, `size`) parameters.
*
* Create a lists column in which each row is a sequence of values starting from a `start` value,
* incrementing by one, and whose cardinality is specified by a `size` value. The `start` and `size`
* values used to generate each list are taken from the corresponding row of the input @p starts and
* @p sizes columns.
*
* - @p starts must be a column of a numeric (non-boolean) or duration type; any other type is
*   rejected.
* - @p sizes must be a column of an integer type.
* - None of the input columns may contain nulls.
* - If any row of the @p sizes column contains a negative value, the output is undefined.
*
* @code{.pseudo}
* starts = [0, 1, 2, 3, 4]
* sizes = [0, 2, 2, 1, 3]
*
* output = [ [], [1, 2], [2, 3], [3], [4, 5, 6] ]
* @endcode
*
* @throws cudf::logic_error if the @p sizes column is not of an integer type.
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if the @p starts and @p sizes columns do not have the same size.
*
* @param starts First values in the result sequences.
* @param sizes Numbers of values in the result sequences.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return The result column containing the generated sequences.
*/
std::unique_ptr<column> sequences(
column_view const& starts,
column_view const& sizes,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a lists column in which each row contains a sequence of values specified by a tuple
* of (`start`, `step`, `size`) parameters.
*
* Create a lists column in which each row is a sequence of values starting from a `start` value,
* incrementing by a `step` value, and whose cardinality is specified by a `size` value. The values
* `start`, `step`, and `size` used to generate each list are taken from the corresponding row of
* the input @p starts, @p steps, and @p sizes columns.
*
* - @p starts must be a column of a numeric (non-boolean) or duration type; any other type is
*   rejected.
* - @p sizes must be a column of an integer type.
* - @p starts and @p steps columns must have the same type.
* - None of the input columns may contain nulls.
* - If any row of the @p sizes column contains a negative value, the output is undefined.
*
* @code{.pseudo}
* starts = [0, 1, 2, 3, 4]
* steps = [2, 1, 1, 1, -3]
* sizes = [0, 2, 2, 1, 3]
*
* output = [ [], [1, 2], [2, 3], [3], [4, 1, -2] ]
* @endcode
*
* @throws cudf::logic_error if the @p sizes column is not of an integer type.
* @throws cudf::logic_error if any input column has nulls.
* @throws cudf::logic_error if the @p starts and @p steps columns have different types.
* @throws cudf::logic_error if the @p starts, @p steps, and @p sizes columns do not have the same
* size.
*
* @param starts First values in the result sequences.
* @param steps Increment values for the result sequences.
* @param sizes Numbers of values in the result sequences.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return The result column containing the generated sequences.
*/
std::unique_ptr<column> sequences(
column_view const& starts,
column_view const& steps,
column_view const& sizes,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
} // namespace cudf::lists
225 changes: 225 additions & 0 deletions cpp/src/lists/sequences.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/lists/filling.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/binary_search.h>
#include <thrust/fill.h>
#include <thrust/scan.h>
#include <thrust/tabulate.h>

#include <optional>

namespace cudf::lists {
namespace detail {
namespace {
template <typename T>
struct tabulator {
size_type const n_lists;
size_type const n_elements;

T const* const starts;
T const* const steps;
offset_type const* const offsets;

template <typename U>
static std::enable_if_t<!cudf::is_duration<U>(), T> __device__ multiply(U x, size_type times)
{
return x * static_cast<T>(times);
}

template <typename U>
static std::enable_if_t<cudf::is_duration<U>(), T> __device__ multiply(U x, size_type times)
{
return T{x.count() * times};
}

auto __device__ operator()(size_type idx) const
{
auto const list_idx_end = thrust::upper_bound(thrust::seq, offsets, offsets + n_lists, idx);
auto const list_idx = thrust::distance(offsets, list_idx_end) - 1;
auto const list_offset = offsets[list_idx];
auto const list_step = steps ? steps[list_idx] : T{1};
return starts[list_idx] + multiply(list_step, idx - list_offset);
}
};

/**
 * @brief Fallback used for element types that per-list sequences do not support.
 *
 * Accepts (and ignores) any arguments so that it can be invoked through the same call as the
 * supported-type specialization, and always fails.
 *
 * @tparam T Element type selected by the type dispatcher.
 * @tparam Enable SFINAE hook used by the specialization for supported types.
 */
template <typename T, typename Enable = void>
struct sequences_functor {
  template <typename... Args>
  static std::unique_ptr<column> invoke(Args&&...)
  {
    // No aggregation is involved here: the failure is solely about the element type.
    CUDF_FAIL("Unsupported per-list sequence type.");
  }
};

struct sequences_dispatcher {
template <typename T>
std::unique_ptr<column> operator()(size_type n_lists,
size_type n_elements,
column_view const& starts,
std::optional<column_view> const& steps,
offset_type const* offsets,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return sequences_functor<T>::invoke(n_lists, n_elements, starts, steps, offsets, stream, mr);
}
};

/**
 * @brief Whether per-list sequences can be generated for element type `T`.
 *
 * Duration types are accepted, as are numeric types other than boolean.
 */
template <typename T>
static constexpr bool is_supported()
{
  if (cudf::is_duration<T>()) { return true; }
  return cudf::is_numeric<T>() && !cudf::is_boolean<T>();
}

/**
 * @brief Implementation for supported element types: builds the flattened child column.
 *
 * Allocates a fixed-width column of `n_elements` values with the type of `starts` and fills it
 * by evaluating a `tabulator<T>` at every flat index.
 *
 * @tparam T Element type of `starts` (and of `steps`, when present).
 */
template <typename T>
struct sequences_functor<T, std::enable_if_t<is_supported<T>()>> {
  static std::unique_ptr<column> invoke(size_type n_lists,
                                        size_type n_elements,
                                        column_view const& starts,
                                        std::optional<column_view> const& steps,
                                        offset_type const* offsets,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
  {
    auto child =
      make_fixed_width_column(starts.type(), n_elements, mask_state::UNALLOCATED, stream, mr);

    if (!starts.is_empty()) {
      auto const out_first = child->mutable_view().template begin<T>();

      // Raw pointers are used instead of a column_device_view: the inputs are validated to have
      // no nulls and only numeric/duration element types reach this specialization.
      // A null `steps` pointer tells the tabulator to use a unit step.
      T const* const first_values = starts.template begin<T>();
      T const* const increments =
        steps.has_value() ? steps.value().template begin<T>() : nullptr;

      thrust::tabulate(rmm::exec_policy(stream),
                       out_first,
                       out_first + n_elements,
                       tabulator<T>{n_lists, n_elements, first_values, increments, offsets});
    }

    return child;
  }
};

/**
 * @brief Build a lists column with zero rows whose child has the given type.
 *
 * @param child_type Type of the (empty) child column.
 * @param stream CUDA stream used for the empty null-mask buffer and column construction.
 * @param mr Device memory resource used for the returned column.
 * @return A lists column with no rows, no nulls, an empty offsets column and an empty child.
 */
std::unique_ptr<column> make_empty_lists_column(data_type child_type,
                                                rmm::cuda_stream_view stream,
                                                rmm::mr::device_memory_resource* mr)
{
  auto empty_child   = make_empty_column(child_type);
  auto empty_offsets = make_empty_column(data_type(type_to_id<offset_type>()));

  return make_lists_column(0,
                           std::move(empty_offsets),
                           std::move(empty_child),
                           0,
                           rmm::device_buffer(0, stream, mr),
                           stream,
                           mr);
}

/**
 * @brief Shared implementation of per-list sequence generation, with or without explicit steps.
 *
 * Validates the inputs, converts `sizes` into list offsets, and dispatches on the type of
 * `starts` to fill the child column.
 *
 * @throws cudf::logic_error (via CUDF_EXPECTS) if any input has nulls, if the inputs differ in
 * row count, if `sizes` is not of an index (integer) type, or if `steps` is given and its type
 * differs from that of `starts`.
 *
 * @param starts First value of each output list.
 * @param steps Optional per-list increments; when absent a unit step is used.
 * @param sizes Number of elements of each output list.
 * @param stream CUDA stream on which device work is issued.
 * @param mr Device memory resource used to allocate the returned column.
 * @return Lists column with one list per input row and no nulls.
 */
std::unique_ptr<column> sequences(column_view const& starts,
                                  std::optional<column_view> const& steps,
                                  column_view const& sizes,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
{
  CUDF_EXPECTS(!starts.has_nulls() && !sizes.has_nulls(),
               "starts and sizes input columns must not have nulls.");
  CUDF_EXPECTS(starts.size() == sizes.size(),
               "starts and sizes input columns must have the same number of rows.");
  CUDF_EXPECTS(cudf::is_index_type(sizes.type()), "Input sizes column must be of integer types.");

  if (steps) {
    auto const& steps_cv = steps.value();
    CUDF_EXPECTS(!steps_cv.has_nulls(), "steps input column must not have nulls.");
    CUDF_EXPECTS(starts.size() == steps_cv.size(),
                 "starts and steps input columns must have the same number of rows.");
    CUDF_EXPECTS(starts.type() == steps_cv.type(),
                 "starts and steps input columns must have the same type.");
  }

  auto const n_lists = starts.size();
  if (n_lists == 0) { return make_empty_lists_column(starts.type(), stream, mr); }

  // Generate list offsets for the output.
  auto list_offsets = make_numeric_column(
    data_type(type_to_id<offset_type>()), n_lists + 1, mask_state::UNALLOCATED, stream, mr);
  auto const offsets_begin  = list_offsets->mutable_view().template begin<offset_type>();
  auto const sizes_input_it = cudf::detail::indexalator_factory::make_input_iterator(sizes);

  // offsets[0] = 0 and offsets[i + 1] = sizes[0] + ... + sizes[i].
  // The scan consumes exactly `n_lists` sizes. Scanning `n_lists + 1` inputs (as an exclusive
  // scan producing all offsets at once would require) reads one element past the end of `sizes`.
  thrust::fill_n(rmm::exec_policy(stream), offsets_begin, 1, offset_type{0});
  thrust::inclusive_scan(
    rmm::exec_policy(stream), sizes_input_it, sizes_input_it + n_lists, offsets_begin + 1);

  // The last offset is the total number of elements across all lists.
  auto const n_elements = cudf::detail::get_value<size_type>(list_offsets->view(), n_lists, stream);

  auto child = type_dispatcher(starts.type(),
                               sequences_dispatcher{},
                               n_lists,
                               n_elements,
                               starts,
                               steps,
                               offsets_begin,
                               stream,
                               mr);

  return make_lists_column(n_lists,
                           std::move(list_offsets),
                           std::move(child),
                           0,
                           rmm::device_buffer(0, stream, mr),
                           stream,
                           mr);
}

} // anonymous namespace

/**
 * @brief Stream-aware per-list sequences without explicit steps.
 *
 * Forwards to the shared implementation with an empty `steps` optional.
 *
 * @param starts First value of each output list.
 * @param sizes Number of elements of each output list.
 * @param stream CUDA stream on which device work is issued.
 * @param mr Device memory resource used to allocate the returned column.
 * @return The generated lists column.
 */
std::unique_ptr<column> sequences(column_view const& starts,
                                  column_view const& sizes,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
{
  std::optional<column_view> const no_steps{std::nullopt};
  return sequences(starts, no_steps, sizes, stream, mr);
}

/**
 * @brief Stream-aware per-list sequences with explicit per-list steps.
 *
 * Wraps `steps` in an optional and forwards to the shared implementation.
 *
 * @param starts First value of each output list.
 * @param steps Increment of each output list.
 * @param sizes Number of elements of each output list.
 * @param stream CUDA stream on which device work is issued.
 * @param mr Device memory resource used to allocate the returned column.
 * @return The generated lists column.
 */
std::unique_ptr<column> sequences(column_view const& starts,
                                  column_view const& steps,
                                  column_view const& sizes,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
{
  std::optional<column_view> const explicit_steps{steps};
  return sequences(starts, explicit_steps, sizes, stream, mr);
}

} // namespace detail

/**
 * @brief Public API: per-list sequences without explicit steps.
 *
 * Calls the `detail` implementation on `rmm::cuda_stream_default`.
 *
 * @param starts First value of each output list.
 * @param sizes Number of elements of each output list.
 * @param mr Device memory resource used to allocate the returned column.
 * @return The generated lists column.
 */
std::unique_ptr<column> sequences(column_view const& starts,
                                  column_view const& sizes,
                                  rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::sequences(starts, sizes, stream, mr);
}

/**
 * @brief Public API: per-list sequences with explicit per-list steps.
 *
 * Calls the `detail` implementation on `rmm::cuda_stream_default`.
 *
 * @param starts First value of each output list.
 * @param steps Increment of each output list.
 * @param sizes Number of elements of each output list.
 * @param mr Device memory resource used to allocate the returned column.
 * @return The generated lists column.
 */
std::unique_ptr<column> sequences(column_view const& starts,
                                  column_view const& steps,
                                  column_view const& sizes,
                                  rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::sequences(starts, steps, sizes, stream, mr);
}

} // namespace cudf::lists
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ ConfigureTest(
lists/drop_list_duplicates_tests.cpp
lists/explode_tests.cpp
lists/extract_tests.cpp
lists/sequences_tests.cpp
lists/sort_lists_tests.cpp
)

Expand Down
Loading

0 comments on commit 36fa5f3

Please sign in to comment.