Skip to content

Commit

Permalink
Use optional-iterator for copy-if-else kernel (#9324)
Browse files Browse the repository at this point in the history
This PR changes the `cudf::detail::copy_if_else` utility functions to accept an optional-iterator instead of a pair-iterator. This improves the compile time of source files by generating 4x less kernels since the two input data arrays can each have nulls requiring 4 different pair-iterators to be created to call it. The optional-iterator allows the nulls check to occur at runtime instead of compile time.

The changes in this PR are for the callers of `detail::copy_if_else` to provide optional-iterators instead of pair-iterators. This PR is dependent on the changes in #9306 

The benchmarks for the effected calling functions showed no significant change in runtime performance using the single optional-iterator over 4 unique pair-iterators. Two additional benchmarks are included cover non-null measurement which this PR impacts the most.

Also, the `copy_tests.cu` was renamed `copy_tests.cpp` and the test that launched to the internal  `cudf::detail::copy_if_else_kernel` was replaced with one with a data-set large enough to require multiple blocks.

Related changes for the strings specific `cudf::strings::detail::copy_if_else` are in #9266

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #9324
  • Loading branch information
davidwendt authored Oct 13, 2021
1 parent 794863c commit f9806ff
Show file tree
Hide file tree
Showing 11 changed files with 270 additions and 263 deletions.
7 changes: 6 additions & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu)
# - shift benchmark -------------------------------------------------------------------------------
ConfigureBench(SHIFT_BENCH copying/shift_benchmark.cu)

###################################################################################################
# - copy-if-else benchmark -----------------------------------------------------------------------------
ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else_benchmark.cpp)

###################################################################################################
# - transpose benchmark ---------------------------------------------------------------------------
ConfigureBench(TRANSPOSE_BENCH transpose/transpose_benchmark.cu)
Expand Down Expand Up @@ -141,7 +145,8 @@ ConfigureBench(REDUCTION_BENCH
###################################################################################################
# - reduction benchmark ---------------------------------------------------------------------------
ConfigureBench(REPLACE_BENCH
replace/clamp_benchmark.cpp)
replace/clamp_benchmark.cpp
replace/nans_benchmark.cpp)

###################################################################################################
# - filling benchmark -----------------------------------------------------------------------------
Expand Down
67 changes: 67 additions & 0 deletions cpp/benchmarks/copying/copy_if_else_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/copying.hpp>

#include <rmm/device_buffer.hpp>

class CopyIfElse : public cudf::benchmark {
};

template <class TypeParam>
static void BM_copy_if_else(benchmark::State& state, bool nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto input_type = cudf::type_to_id<TypeParam>();
auto bool_type = cudf::type_id::BOOL8;
auto const input = create_random_table({input_type, input_type, bool_type}, 3, row_count{n_rows});

if (!nulls) {
input->get_column(2).set_null_mask(rmm::device_buffer{}, 0);
input->get_column(1).set_null_mask(rmm::device_buffer{}, 0);
input->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
}

cudf::column_view decision(input->view().column(2));
cudf::column_view rhs(input->view().column(1));
cudf::column_view lhs(input->view().column(0));

for (auto _ : state) {
cuda_event_timer raii(state, true, rmm::cuda_stream_default);
cudf::copy_if_else(lhs, rhs, decision);
}
}

#define COPY_BENCHMARK_DEFINE(name, type, b) \
BENCHMARK_DEFINE_F(CopyIfElse, name) \
(::benchmark::State & st) { BM_copy_if_else<type>(st, b); } \
BENCHMARK_REGISTER_F(CopyIfElse, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 12, 1 << 27}}) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

COPY_BENCHMARK_DEFINE(int16, int16_t, true)
COPY_BENCHMARK_DEFINE(uint32, uint32_t, true)
COPY_BENCHMARK_DEFINE(float64, double, true)
COPY_BENCHMARK_DEFINE(int16_no_nulls, int16_t, false)
COPY_BENCHMARK_DEFINE(uint32_no_nulls, uint32_t, false)
COPY_BENCHMARK_DEFINE(float64_no_nulls, double, false)
20 changes: 10 additions & 10 deletions cpp/benchmarks/replace/clamp_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class ReplaceClamp : public cudf::benchmark {
};

template <typename type>
static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
static void BM_clamp(benchmark::State& state, bool include_nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
Expand Down Expand Up @@ -58,15 +58,15 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
}
}

#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceClamp, name) \
(::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceClamp, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceClamp, name) \
(::benchmark::State & state) { BM_clamp<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceClamp, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

CLAMP_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);
Expand Down
63 changes: 63 additions & 0 deletions cpp/benchmarks/replace/nans_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/replace.hpp>
#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/table/table.hpp>
#include <cudf/types.hpp>

class ReplaceNans : public cudf::benchmark {
};

template <typename type>
static void BM_replace_nans(benchmark::State& state, bool include_nulls)
{
cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
auto const table = create_random_table({dtype}, 1, row_count{n_rows});
if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); }
cudf::column_view input(table->view().column(0));

auto zero = cudf::make_fixed_width_scalar<type>(0);

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::replace_nans(input, *zero);
}
}

#define NANS_BENCHMARK_DEFINE(name, type, nulls) \
BENCHMARK_DEFINE_F(ReplaceNans, name) \
(::benchmark::State & state) { BM_replace_nans<type>(state, nulls); } \
BENCHMARK_REGISTER_F(ReplaceNans, name) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

NANS_BENCHMARK_DEFINE(float32_nulls, float, true);
NANS_BENCHMARK_DEFINE(float64_nulls, double, true);
NANS_BENCHMARK_DEFINE(float32_no_nulls, float, false);
NANS_BENCHMARK_DEFINE(float64_no_nulls, double, false);
34 changes: 9 additions & 25 deletions cpp/include/cudf/detail/copy_if_else.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,11 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/device_scalar.hpp>

#include <cub/cub.cuh>

namespace cudf {
namespace detail {
namespace { // anonymous
Expand All @@ -40,7 +33,7 @@ template <size_type block_size,
typename LeftIter,
typename RightIter,
typename Filter,
bool has_validity>
bool has_nulls>
__launch_bounds__(block_size) __global__
void copy_if_else_kernel(LeftIter lhs,
RightIter rhs,
Expand Down Expand Up @@ -71,23 +64,14 @@ __launch_bounds__(block_size) __global__
size_type warp_cur = warp_begin + warp_id;
size_type index = tid;
while (warp_cur <= warp_end) {
bool in_range = (index >= begin && index < end);

bool valid = true;
if (has_validity) {
valid = in_range && (filter(index) ? thrust::get<1>(lhs[index]) : thrust::get<1>(rhs[index]));
}

// do the copy if-else
if (in_range) {
out.element<T>(index) = filter(index) ? static_cast<T>(thrust::get<0>(lhs[index]))
: static_cast<T>(thrust::get<0>(rhs[index]));
}
auto const opt_value =
(index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
if (not has_nulls or opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }

// update validity
if (has_validity) {
if (has_nulls) {
// the final validity mask for this warp
int warp_mask = __ballot_sync(0xFFFF'FFFF, valid && in_range);
int warp_mask = __ballot_sync(0xFFFF'FFFF, opt_value.has_value());
// only one guy in the warp needs to update the mask and count
if (lane_id == 0) {
out.set_mask_word(warp_cur, warp_mask);
Expand All @@ -100,7 +84,7 @@ __launch_bounds__(block_size) __global__
index += block_size * gridDim.x;
}

if (has_validity) {
if (has_nulls) {
// sum all null counts across all warps
size_type block_valid_count =
single_lane_block_sum_reduce<block_size, leader_lane>(warp_valid_count);
Expand Down Expand Up @@ -168,8 +152,8 @@ std::unique_ptr<column> copy_if_else(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
using Element =
typename thrust::tuple_element<0, typename thrust::iterator_traits<LeftIter>::value_type>::type;
// This is the type of the thrust::optional element in the passed iterators
using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;

size_type size = std::distance(lhs_begin, lhs_end);
size_type num_els = cudf::util::round_up_safe(size, warp_size);
Expand Down
Loading

0 comments on commit f9806ff

Please sign in to comment.