Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change strings copy_if_else to use optional-iterator instead of pair-iterator #9266

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/detail/copy_if_else.cuh>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down
42 changes: 19 additions & 23 deletions cpp/include/cudf/strings/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,28 @@ namespace detail {
* strings from the lhs iterator or the rhs iterator.
*
* ```
* output[i] = filter_fn(i) ? lhs(i).first : rhs(i).first
* output[i] = filter_fn(i) ? lhs(i) : rhs(i)
* ```
*
* @tparam StringPairIterLeft Pair iterator returning thrust::pair<string_view,bool> where the
* bool parameter specifies if the string_view is valid (true) or not (false).
* @tparam StringPairIterRight Pair iterator returning thrust::pair<string_view,bool> where the
* bool parameter specifies if the string_view is valid (true) or not (false).
* @tparam StringIterLeft A random access iterator whose value_type is
* `thrust::optional<string_view>` where the `optional` has a value iff the element is valid.
* @tparam StringIterRight A random access iterator whose value_type is
* `thrust::optional<string_view>` where the `optional` has a value iff the element is valid.
* @tparam Filter Functor that takes an index and returns a boolean.
*
* @param lhs_begin Start of first set of data. Used when filter_fn returns true.
* @param lhs_begin Start of first set of data. Used when `filter_fn` returns true.
* @param lhs_end End of first set of data.
* @param rhs_begin Strings of second set of data. Used when filter_fn returns false.
* @param filter_fn Called to determine which iterator (lhs or rhs) to retrieve an entry for a
* specific row.
* @param rhs_begin Strings of second set of data. Used when `filter_fn` returns false.
* @param filter_fn Called to determine which iterator to use for a specific row.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column.
*/
template <typename StringPairIterLeft, typename StringPairIterRight, typename Filter>
template <typename StringIterLeft, typename StringIterRight, typename Filter>
std::unique_ptr<cudf::column> copy_if_else(
StringPairIterLeft lhs_begin,
StringPairIterLeft lhs_end,
StringPairIterRight rhs_begin,
StringIterLeft lhs_begin,
StringIterLeft lhs_end,
StringIterRight rhs_begin,
Filter filter_fn,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
Expand All @@ -68,7 +67,7 @@ std::unique_ptr<cudf::column> copy_if_else(
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
[lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
return filter_fn(idx) ? thrust::get<1>(lhs_begin[idx]) : thrust::get<1>(rhs_begin[idx]);
return filter_fn(idx) ? lhs_begin[idx].has_value() : rhs_begin[idx].has_value();
},
stream,
mr);
Expand All @@ -77,13 +76,10 @@ std::unique_ptr<cudf::column> copy_if_else(

// build offsets column
auto offsets_transformer = [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) {
bool bfilter = filter_fn(idx);
size_type bytes = 0;
if (bfilter ? thrust::get<1>(lhs_begin[idx]) : thrust::get<1>(rhs_begin[idx]))
bytes = bfilter ? thrust::get<0>(lhs_begin[idx]).size_bytes()
: thrust::get<0>(rhs_begin[idx]).size_bytes();
return bytes;
auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
return result.has_value() ? result->size_bytes() : 0;
};

auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto offsets_column = make_offsets_child_column(
Expand All @@ -101,9 +97,9 @@ std::unique_ptr<cudf::column> copy_if_else(
thrust::make_counting_iterator<size_type>(0),
strings_count,
[lhs_begin, rhs_begin, filter_fn, d_offsets, d_chars] __device__(size_type idx) {
auto bfilter = filter_fn(idx);
if (bfilter ? !thrust::get<1>(lhs_begin[idx]) : !thrust::get<1>(rhs_begin[idx])) return;
string_view d_str = bfilter ? thrust::get<0>(lhs_begin[idx]) : thrust::get<0>(rhs_begin[idx]);
auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx];
if (!result.has_value()) return;
auto const d_str = *result;
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
});

Expand Down
23 changes: 5 additions & 18 deletions cpp/src/copying/copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/scatter.hpp>
#include <cudf/strings/detail/copy_if_else.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/utilities/traits.hpp>

Expand Down Expand Up @@ -119,24 +120,10 @@ struct copy_if_else_functor_impl<string_view> {
auto const& lhs = *p_lhs;
auto const& rhs = *p_rhs;

if (left_nullable) {
if (right_nullable) {
auto lhs_iter = cudf::detail::make_pair_iterator<T, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, true>(rhs);
return strings::detail::copy_if_else(
lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr);
}
auto lhs_iter = cudf::detail::make_pair_iterator<T, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, false>(rhs);
return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr);
}
if (right_nullable) {
auto lhs_iter = cudf::detail::make_pair_iterator<T, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, true>(rhs);
return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr);
}
auto lhs_iter = cudf::detail::make_pair_iterator<T, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, false>(rhs);
auto lhs_iter =
cudf::detail::make_optional_iterator<T>(lhs, contains_nulls::DYNAMIC{}, left_nullable);
auto rhs_iter =
cudf::detail::make_optional_iterator<T>(rhs, contains_nulls::DYNAMIC{}, right_nullable);
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr);
}
};
Expand Down
161 changes: 43 additions & 118 deletions cpp/src/copying/segmented_shift.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cudf/detail/copy_if_else.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/copy_if_else.cuh>
#include <cudf/strings/string_view.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/traits.hpp>
Expand All @@ -33,88 +34,24 @@ namespace detail {
namespace {

/**
* @brief Helper function to invoke general `copy_if_else`
* @brief Common filter function to convert index values into copy-if-else left/right result.
*
* The offset position is used to identify which segment to copy from.
*/
template <typename PairIterator, typename ScalarIterator>
std::unique_ptr<column> segmented_shift_rep_impl(PairIterator input_pair_iterator,
ScalarIterator fill_pair_iterator,
bool nullable,
size_type offset,
device_span<size_type const> segment_offsets,
data_type value_type,
size_type column_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (offset > 0) {
auto filter = [segment_offsets, offset] __device__(auto const& i) {
auto segment_bound_idx =
thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) - 1;
return not(*segment_bound_idx <= i and i < *segment_bound_idx + offset);
};
return copy_if_else(nullable,
input_pair_iterator,
input_pair_iterator + column_size,
fill_pair_iterator,
filter,
value_type,
stream,
mr);
} else {
auto filter = [segment_offsets, offset] __device__(auto const& i) {
auto segment_bound_idx =
thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i);
return not(*segment_bound_idx + offset <= i and i < *segment_bound_idx);
};
return copy_if_else(nullable,
input_pair_iterator,
input_pair_iterator + column_size,
fill_pair_iterator,
filter,
value_type,
stream,
mr);
}
}
struct segmented_shift_filter {
device_span<size_type const> const segment_offsets;
size_type const offset;

/**
* @brief Helper function to invoke string specialization of `copy_if_else`
*/
template <typename PairIterator, typename ScalarIterator>
std::unique_ptr<column> segmented_shift_string_impl(PairIterator input_pair_iterator,
ScalarIterator fill_pair_iterator,
size_type offset,
device_span<size_type const> segment_offsets,
size_type column_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (offset > 0) {
auto filter = [segment_offsets, offset] __device__(auto const& i) {
auto segment_bound_idx =
thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) - 1;
return not(*segment_bound_idx <= i and i < *segment_bound_idx + offset);
};
return strings::detail::copy_if_else(input_pair_iterator,
input_pair_iterator + column_size,
fill_pair_iterator,
filter,
stream,
mr);
} else {
auto filter = [segment_offsets, offset] __device__(auto const& i) {
auto segment_bound_idx =
thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i);
return not(*segment_bound_idx + offset <= i and i < *segment_bound_idx);
};
return strings::detail::copy_if_else(input_pair_iterator,
input_pair_iterator + column_size,
fill_pair_iterator,
filter,
stream,
mr);
}
}
__device__ bool operator()(size_type const i) const
{
auto const segment_bound_idx =
thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) -
(offset > 0);
auto const left_idx = *segment_bound_idx + (offset < 0 ? offset : 0);
auto const right_idx = *segment_bound_idx + (offset > 0 ? offset : 0);
return not(left_idx <= i and i < right_idx);
};
};

template <typename T, typename Enable = void>
struct segmented_shift_functor {
Expand Down Expand Up @@ -143,26 +80,24 @@ struct segmented_shift_functor<T, std::enable_if_t<is_rep_layout_compatible<T>()

if (segmented_values.has_nulls()) {
auto input_pair_iterator = make_pair_iterator<T, true>(*values_device_view) - offset;
return segmented_shift_rep_impl(input_pair_iterator,
fill_pair_iterator,
nullable,
offset,
segment_offsets,
segmented_values.type(),
segmented_values.size(),
stream,
mr);
return copy_if_else(nullable,
input_pair_iterator,
input_pair_iterator + segmented_values.size(),
fill_pair_iterator,
segmented_shift_filter{segment_offsets, offset},
segmented_values.type(),
stream,
mr);
} else {
auto input_pair_iterator = make_pair_iterator<T, false>(*values_device_view) - offset;
return segmented_shift_rep_impl(input_pair_iterator,
fill_pair_iterator,
nullable,
offset,
segment_offsets,
segmented_values.type(),
segmented_values.size(),
stream,
mr);
return copy_if_else(nullable,
input_pair_iterator,
input_pair_iterator + segmented_values.size(),
fill_pair_iterator,
segmented_shift_filter{segment_offsets, offset},
segmented_values.type(),
stream,
mr);
}
}
};
Expand All @@ -179,29 +114,19 @@ struct segmented_shift_functor<string_view> {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using T = string_view;

auto values_device_view = column_device_view::create(segmented_values, stream);
auto fill_pair_iterator = make_pair_iterator<T>(fill_value);
if (segmented_values.has_nulls()) {
auto input_pair_iterator = make_pair_iterator<T, true>(*values_device_view) - offset;
return segmented_shift_string_impl(input_pair_iterator,
fill_pair_iterator,
offset,
segment_offsets,
segmented_values.size(),
stream,
mr);
} else {
auto input_pair_iterator = make_pair_iterator<T, false>(*values_device_view) - offset;
return segmented_shift_string_impl(input_pair_iterator,
fill_pair_iterator,
offset,
segment_offsets,
segmented_values.size(),
auto input_iterator =
make_optional_iterator<cudf::string_view>(
*values_device_view, contains_nulls::DYNAMIC{}, segmented_values.has_nulls()) -
offset;
auto fill_iterator =
make_optional_iterator<cudf::string_view>(fill_value, contains_nulls::YES{});
return strings::detail::copy_if_else(input_iterator,
input_iterator + segmented_values.size(),
fill_iterator,
segmented_shift_filter{segment_offsets, offset},
stream,
mr);
}
}
};

Expand Down
1 change: 1 addition & 0 deletions cpp/src/replace/nans.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious why this change was necessary.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was previously included in the include file copy_if_else.cuh which did not use it. Cleaning up the headers there required moving it here where it is actually used.


#include <thrust/transform_scan.h>

Expand Down