Use optional-iterator for copy-if-else kernel (#9324)

This PR changes the `cudf::detail::copy_if_else` utility functions to accept an optional-iterator instead of a pair-iterator. This improves the compile time of source files by generating 4x less kernels since the two input data arrays can each have nulls requiring 4 different pair-iterators to be created to call it. The optional-iterator allows the nulls check to occur at runtime instead of compile time. The changes in this PR are for the callers of `detail::copy_if_else` to provide optional-iterators instead of pair-iterators. This PR is dependent on the changes in #9306 The benchmarks for the effected calling functions showed no significant change in runtime performance using the single optional-iterator over 4 unique pair-iterators. Two additional benchmarks are included cover non-null measurement which this PR impacts the most. Also, the `copy_tests.cu` was renamed `copy_tests.cpp` and the test that launched to the internal `cudf::detail::copy_if_else_kernel` was replaced with one with a data-set large enough to require multiple blocks. Related changes for the strings specific `cudf::strings::detail::copy_if_else` are in #9266 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) - Jake Hemstad (https://github.com/jrhemstad) URL: #9324
rapidsai · Oct 13, 2021 · f9806ff · f9806ff
1 parent 794863c
commit f9806ff
Show file tree

Hide file tree

Showing 11 changed files with 270 additions and 263 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -88,6 +88,10 @@ ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu)
 # - shift benchmark -------------------------------------------------------------------------------
 ConfigureBench(SHIFT_BENCH copying/shift_benchmark.cu)
 
+###################################################################################################
+# - copy-if-else benchmark -----------------------------------------------------------------------------
+ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else_benchmark.cpp)
+
 ###################################################################################################
 # - transpose benchmark ---------------------------------------------------------------------------
 ConfigureBench(TRANSPOSE_BENCH transpose/transpose_benchmark.cu)
@@ -141,7 +145,8 @@ ConfigureBench(REDUCTION_BENCH
 ###################################################################################################
 # - reduction benchmark ---------------------------------------------------------------------------
 ConfigureBench(REPLACE_BENCH
-  replace/clamp_benchmark.cpp)
+  replace/clamp_benchmark.cpp
+  replace/nans_benchmark.cpp)
 
 ###################################################################################################
 # - filling benchmark -----------------------------------------------------------------------------

diff --git a/cpp/benchmarks/copying/copy_if_else_benchmark.cpp b/cpp/benchmarks/copying/copy_if_else_benchmark.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/copying.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+class CopyIfElse : public cudf::benchmark {
+};
+
+template <class TypeParam>
+static void BM_copy_if_else(benchmark::State& state, bool nulls)
+{
+  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
+  auto input_type  = cudf::type_to_id<TypeParam>();
+  auto bool_type   = cudf::type_id::BOOL8;
+  auto const input = create_random_table({input_type, input_type, bool_type}, 3, row_count{n_rows});
+
+  if (!nulls) {
+    input->get_column(2).set_null_mask(rmm::device_buffer{}, 0);
+    input->get_column(1).set_null_mask(rmm::device_buffer{}, 0);
+    input->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
+  }
+
+  cudf::column_view decision(input->view().column(2));
+  cudf::column_view rhs(input->view().column(1));
+  cudf::column_view lhs(input->view().column(0));
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, rmm::cuda_stream_default);
+    cudf::copy_if_else(lhs, rhs, decision);
+  }
+}
+
+#define COPY_BENCHMARK_DEFINE(name, type, b)                  \
+  BENCHMARK_DEFINE_F(CopyIfElse, name)                        \
+  (::benchmark::State & st) { BM_copy_if_else<type>(st, b); } \
+  BENCHMARK_REGISTER_F(CopyIfElse, name)                      \
+    ->RangeMultiplier(8)                                      \
+    ->Ranges({{1 << 12, 1 << 27}})                            \
+    ->UseManualTime()                                         \
+    ->Unit(benchmark::kMillisecond);
+
+COPY_BENCHMARK_DEFINE(int16, int16_t, true)
+COPY_BENCHMARK_DEFINE(uint32, uint32_t, true)
+COPY_BENCHMARK_DEFINE(float64, double, true)
+COPY_BENCHMARK_DEFINE(int16_no_nulls, int16_t, false)
+COPY_BENCHMARK_DEFINE(uint32_no_nulls, uint32_t, false)
+COPY_BENCHMARK_DEFINE(float64_no_nulls, double, false)
diff --git a/cpp/benchmarks/replace/clamp_benchmark.cpp b/cpp/benchmarks/replace/clamp_benchmark.cpp
@@ -30,7 +30,7 @@ class ReplaceClamp : public cudf::benchmark {
 };
 
 template <typename type>
-static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
+static void BM_clamp(benchmark::State& state, bool include_nulls)
 {
   cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
   auto const dtype = cudf::type_to_id<type>();
@@ -58,15 +58,15 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
   }
 }
 
-#define CLAMP_BENCHMARK_DEFINE(name, type, nulls)                         \
-  BENCHMARK_DEFINE_F(ReplaceClamp, name)                                  \
-  (::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
-  BENCHMARK_REGISTER_F(ReplaceClamp, name)                                \
-    ->UseManualTime()                                                     \
-    ->Arg(10000)      /* 10k */                                           \
-    ->Arg(100000)     /* 100k */                                          \
-    ->Arg(1000000)    /* 1M */                                            \
-    ->Arg(10000000)   /* 10M */                                           \
+#define CLAMP_BENCHMARK_DEFINE(name, type, nulls)                \
+  BENCHMARK_DEFINE_F(ReplaceClamp, name)                         \
+  (::benchmark::State & state) { BM_clamp<type>(state, nulls); } \
+  BENCHMARK_REGISTER_F(ReplaceClamp, name)                       \
+    ->UseManualTime()                                            \
+    ->Arg(10000)      /* 10k */                                  \
+    ->Arg(100000)     /* 100k */                                 \
+    ->Arg(1000000)    /* 1M */                                   \
+    ->Arg(10000000)   /* 10M */                                  \
     ->Arg(100000000); /* 100M */
 
 CLAMP_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);

diff --git a/cpp/benchmarks/replace/nans_benchmark.cpp b/cpp/benchmarks/replace/nans_benchmark.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/replace.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+
+class ReplaceNans : public cudf::benchmark {
+};
+
+template <typename type>
+static void BM_replace_nans(benchmark::State& state, bool include_nulls)
+{
+  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
+  auto const dtype = cudf::type_to_id<type>();
+  auto const table = create_random_table({dtype}, 1, row_count{n_rows});
+  if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); }
+  cudf::column_view input(table->view().column(0));
+
+  auto zero = cudf::make_fixed_width_scalar<type>(0);
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+    auto result = cudf::replace_nans(input, *zero);
+  }
+}
+
+#define NANS_BENCHMARK_DEFINE(name, type, nulls)                        \
+  BENCHMARK_DEFINE_F(ReplaceNans, name)                                 \
+  (::benchmark::State & state) { BM_replace_nans<type>(state, nulls); } \
+  BENCHMARK_REGISTER_F(ReplaceNans, name)                               \
+    ->UseManualTime()                                                   \
+    ->Arg(10000)      /* 10k */                                         \
+    ->Arg(100000)     /* 100k */                                        \
+    ->Arg(1000000)    /* 1M */                                          \
+    ->Arg(10000000)   /* 10M */                                         \
+    ->Arg(100000000); /* 100M */
+
+NANS_BENCHMARK_DEFINE(float32_nulls, float, true);
+NANS_BENCHMARK_DEFINE(float64_nulls, double, true);
+NANS_BENCHMARK_DEFINE(float32_no_nulls, float, false);
+NANS_BENCHMARK_DEFINE(float64_no_nulls, double, false);
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -19,18 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/device_scalar.hpp>
 
-#include <cub/cub.cuh>
-
 namespace cudf {
 namespace detail {
 namespace {  // anonymous
@@ -40,7 +33,7 @@ template <size_type block_size,
           typename LeftIter,
           typename RightIter,
           typename Filter,
-          bool has_validity>
+          bool has_nulls>
 __launch_bounds__(block_size) __global__
   void copy_if_else_kernel(LeftIter lhs,
                            RightIter rhs,
@@ -71,23 +64,14 @@ __launch_bounds__(block_size) __global__
   size_type warp_cur = warp_begin + warp_id;
   size_type index    = tid;
   while (warp_cur <= warp_end) {
-    bool in_range = (index >= begin && index < end);
-
-    bool valid = true;
-    if (has_validity) {
-      valid = in_range && (filter(index) ? thrust::get<1>(lhs[index]) : thrust::get<1>(rhs[index]));
-    }
-
-    // do the copy if-else
-    if (in_range) {
-      out.element<T>(index) = filter(index) ? static_cast<T>(thrust::get<0>(lhs[index]))
-                                            : static_cast<T>(thrust::get<0>(rhs[index]));
-    }
+    auto const opt_value =
+      (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
+    if (not has_nulls or opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
 
     // update validity
-    if (has_validity) {
+    if (has_nulls) {
       // the final validity mask for this warp
-      int warp_mask = __ballot_sync(0xFFFF'FFFF, valid && in_range);
+      int warp_mask = __ballot_sync(0xFFFF'FFFF, opt_value.has_value());
       // only one guy in the warp needs to update the mask and count
       if (lane_id == 0) {
         out.set_mask_word(warp_cur, warp_mask);
@@ -100,7 +84,7 @@ __launch_bounds__(block_size) __global__
     index += block_size * gridDim.x;
   }
 
-  if (has_validity) {
+  if (has_nulls) {
     // sum all null counts across all warps
     size_type block_valid_count =
       single_lane_block_sum_reduce<block_size, leader_lane>(warp_valid_count);
@@ -168,8 +152,8 @@ std::unique_ptr<column> copy_if_else(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  using Element =
-    typename thrust::tuple_element<0, typename thrust::iterator_traits<LeftIter>::value_type>::type;
+  // This is the type of the thrust::optional element in the passed iterators
+  using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
 
   size_type size           = std::distance(lhs_begin, lhs_end);
   size_type num_els        = cudf::util::round_up_safe(size, warp_size);