Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/branch-22.02' into refactor/host…
Browse files Browse the repository at this point in the history
…_device_macros
  • Loading branch information
vyasr committed Dec 9, 2021
2 parents c8016b1 + 05c209f commit 3937104
Show file tree
Hide file tree
Showing 59 changed files with 662 additions and 300 deletions.
2 changes: 1 addition & 1 deletion ci/checks/style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ LANG=C.UTF-8
. /opt/conda/etc/profile.d/conda.sh
conda activate rapids

FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/main/cmake-format-rapids-cmake.json
FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/cmake-format-rapids-cmake.json
export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
Expand Down
62 changes: 17 additions & 45 deletions cpp/benchmarks/ast/transform_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include <benchmark/benchmark.h>
#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <thrust/iterator/counting_iterator.h>
Expand All @@ -40,7 +41,6 @@ enum class TreeType {
// child column reference
};

template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
class AST : public cudf::benchmark {
};

Expand Down Expand Up @@ -127,9 +127,22 @@ static void BM_ast_transform(benchmark::State& state)
(tree_levels + 1) * sizeof(key_type));
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \
(::benchmark::State & st) { BM_ast_transform<key_type, tree_type, reuse_columns, nullable>(st); }
static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
}

#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \
TEMPLATED_BENCHMARK_F(AST, BM_ast_transform, key_type, tree_type, reuse_columns, nullable) \
->Apply(CustomRanges) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
Expand All @@ -144,44 +157,3 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);

static void CustomRanges(benchmark::internal::Benchmark* b)
{
auto row_counts = std::vector<cudf::size_type>{100'000, 1'000'000, 10'000'000, 100'000'000};
auto operation_counts = std::vector<cudf::size_type>{1, 5, 10};
for (auto const& row_count : row_counts) {
for (auto const& operation_count : operation_counts) {
b->Args({row_count, operation_count});
}
}
}

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();

BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls)
->Apply(CustomRanges)
->Unit(benchmark::kMillisecond)
->UseManualTime();
100 changes: 50 additions & 50 deletions cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <fixture/benchmark_fixture.hpp>
#include <fixture/templated_benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <cudf_test/column_wrapper.hpp>
Expand All @@ -23,12 +24,11 @@

#include <thrust/iterator/counting_iterator.h>

template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator>
class COMPILED_BINARYOP : public cudf::benchmark {
};

template <typename TypeLhs, typename TypeRhs, typename TypeOut>
void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
template <typename TypeLhs, typename TypeRhs, typename TypeOut, cudf::binary_operator binop>
void BM_compiled_binaryop(benchmark::State& state)
{
const cudf::size_type column_size{(cudf::size_type)state.range(0)};

Expand All @@ -50,56 +50,56 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop)
}

// TODO tparam boolean for null.
#define BINARYOP_BENCHMARK_DEFINE(name, TypeLhs, TypeRhs, binop, TypeOut) \
BENCHMARK_TEMPLATE_DEFINE_F( \
COMPILED_BINARYOP, name, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \
(::benchmark::State & st) \
{ \
BM_compiled_binaryop<TypeLhs, TypeRhs, TypeOut>(st, cudf::binary_operator::binop); \
} \
BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \
TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \
BM_compiled_binaryop, \
TypeLhs, \
TypeRhs, \
TypeOut, \
cudf::binary_operator::binop) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime() \
->Arg(10000) /* 10k */ \
->Arg(100000) /* 100k */ \
->Arg(1000000) /* 1M */ \
->Arg(10000000) /* 10M */ \
->Arg(100000000); /* 100M */

using namespace cudf;
using namespace numeric;

// clang-format off
BINARYOP_BENCHMARK_DEFINE(ADD_1, float, float, ADD, float);
BINARYOP_BENCHMARK_DEFINE(ADD_2, timestamp_s, duration_s, ADD, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(SUB_1, duration_s, duration_D, SUB, duration_ms);
BINARYOP_BENCHMARK_DEFINE(SUB_2, int64_t, int64_t, SUB, int64_t);
BINARYOP_BENCHMARK_DEFINE(MUL_1, float, float, MUL, int64_t);
BINARYOP_BENCHMARK_DEFINE(MUL_2, duration_s, int64_t, MUL, duration_s);
BINARYOP_BENCHMARK_DEFINE(DIV_1, int64_t, int64_t, DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(DIV_2, duration_ms, int32_t, DIV, duration_ms);
BINARYOP_BENCHMARK_DEFINE(TRUE_DIV, int64_t, int64_t, TRUE_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(FLOOR_DIV, int64_t, int64_t, FLOOR_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(MOD_1, double, double, MOD, double);
BINARYOP_BENCHMARK_DEFINE(MOD_2, duration_ms, int64_t, MOD, duration_ms);
BINARYOP_BENCHMARK_DEFINE(PMOD, int32_t, int64_t, PMOD, double);
BINARYOP_BENCHMARK_DEFINE(PYMOD, int32_t, uint8_t, PYMOD, int64_t);
BINARYOP_BENCHMARK_DEFINE(POW, int64_t, int64_t, POW, double);
BINARYOP_BENCHMARK_DEFINE(LOG_BASE, float, double, LOG_BASE, double);
BINARYOP_BENCHMARK_DEFINE(ATAN2, float, double, ATAN2, double);
BINARYOP_BENCHMARK_DEFINE(SHIFT_LEFT, int, int, SHIFT_LEFT, int);
BINARYOP_BENCHMARK_DEFINE(SHIFT_RIGHT, int16_t, int64_t, SHIFT_RIGHT, int);
BINARYOP_BENCHMARK_DEFINE(USHIFT_RIGHT, int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_AND, int64_t, int32_t, BITWISE_AND, int16_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_OR, int16_t, int32_t, BITWISE_OR, int64_t);
BINARYOP_BENCHMARK_DEFINE(BITWISE_XOR, int16_t, int64_t, BITWISE_XOR, int32_t);
BINARYOP_BENCHMARK_DEFINE(LOGICAL_AND, double, int8_t, LOGICAL_AND, bool);
BINARYOP_BENCHMARK_DEFINE(LOGICAL_OR, int16_t, int64_t, LOGICAL_OR, bool);
BINARYOP_BENCHMARK_DEFINE(EQUAL_1, int32_t, int64_t, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(EQUAL_2, duration_ms, duration_ns, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(NOT_EQUAL, decimal32, decimal32, NOT_EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(LESS, timestamp_s, timestamp_s, LESS, bool);
BINARYOP_BENCHMARK_DEFINE(GREATER, timestamp_ms, timestamp_s, GREATER, bool);
BINARYOP_BENCHMARK_DEFINE(NULL_EQUALS, duration_ms, duration_ns, NULL_EQUALS, bool);
BINARYOP_BENCHMARK_DEFINE(NULL_MAX, decimal32, decimal32, NULL_MAX, decimal32);
BINARYOP_BENCHMARK_DEFINE(NULL_MIN, timestamp_D, timestamp_s, NULL_MIN, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t);
BINARYOP_BENCHMARK_DEFINE(float, float, ADD, float);
BINARYOP_BENCHMARK_DEFINE(timestamp_s, duration_s, ADD, timestamp_s);
BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, SUB, int64_t);
BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t);
BINARYOP_BENCHMARK_DEFINE(duration_s, int64_t, MUL, duration_s);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(duration_ms, int32_t, DIV, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t);
BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double);
BINARYOP_BENCHMARK_DEFINE(duration_ms, int64_t, MOD, duration_ms);
BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double);
BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double);
BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double);
BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double);
BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int);
BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t);
BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t);
BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t);
BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool);
BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool);
BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool);
BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool);
BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool);
BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool);
BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32);
BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s);
51 changes: 22 additions & 29 deletions cpp/benchmarks/column/concatenate_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
#include <cudf_test/column_wrapper.hpp>

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <algorithm>
#include <vector>

template <typename T, bool Nullable>
class Concatenate : public cudf::benchmark {
};

Expand Down Expand Up @@ -69,17 +69,15 @@ static void BM_concatenate(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
}

#define CONCAT_BENCHMARK_DEFINE(name, type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \
(::benchmark::State & state) { BM_concatenate<type, nullable>(state); } \
BENCHMARK_REGISTER_F(Concatenate, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
#define CONCAT_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_BENCHMARK_DEFINE(concat_columns_int64_non_null, int64_t, false)
CONCAT_BENCHMARK_DEFINE(concat_columns_int64_nullable, int64_t, true)
CONCAT_BENCHMARK_DEFINE(int64_t, false)
CONCAT_BENCHMARK_DEFINE(int64_t, true)

template <typename T, bool Nullable>
static void BM_concatenate_tables(benchmark::State& state)
Expand Down Expand Up @@ -131,19 +129,16 @@ static void BM_concatenate_tables(benchmark::State& state)
state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
}

#define CONCAT_TABLES_BENCHMARK_DEFINE(name, type, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(Concatenate, name, type, nullable) \
(::benchmark::State & state) { BM_concatenate_tables<type, nullable>(state); } \
BENCHMARK_REGISTER_F(Concatenate, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \
TEMPLATED_BENCHMARK_F(Concatenate, BM_concatenate_tables, type, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_non_null, int64_t, false)
CONCAT_TABLES_BENCHMARK_DEFINE(concat_tables_int64_nullable, int64_t, true)
CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true)

template <bool Nullable>
class ConcatenateStrings : public cudf::benchmark {
};

Expand Down Expand Up @@ -192,14 +187,12 @@ static void BM_concatenate_strings(benchmark::State& state)
(sizeof(int32_t) + num_chars)); // offset + chars
}

#define CONCAT_STRINGS_BENCHMARK_DEFINE(name, nullable) \
BENCHMARK_TEMPLATE_DEFINE_F(ConcatenateStrings, name, nullable) \
(::benchmark::State & state) { BM_concatenate_strings<nullable>(state); } \
BENCHMARK_REGISTER_F(ConcatenateStrings, name) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \
TEMPLATED_BENCHMARK_F(ConcatenateStrings, BM_concatenate_strings, nullable) \
->RangeMultiplier(8) \
->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \
->Unit(benchmark::kMillisecond) \
->UseManualTime();

CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_non_null, false)
CONCAT_STRINGS_BENCHMARK_DEFINE(concat_string_columns_nullable, true)
CONCAT_STRINGS_BENCHMARK_DEFINE(false)
CONCAT_STRINGS_BENCHMARK_DEFINE(true)
73 changes: 73 additions & 0 deletions cpp/benchmarks/fixture/templated_benchmark_fixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <benchmark/benchmark.h>

namespace cudf {
/**
* @brief Templated Google Benchmark with fixture
*
* Extends Google benchmarks to support templated Benchmarks with non-templated fixture class.
*
* The SetUp and TearDown methods is called before each templated benchmark function is run.
* These methods are called automatically by Google Benchmark
*
* Example:
*
* @code
* template <class T, class U>
* void my_benchmark(::benchmark::State& state) {
* std::vector<T> v1(state.range(0));
* std::vector<U> v2(state.range(0));
* for (auto _ : state) {
* // benchmark stuff
* }
* }
*
* TEMPLATED_BENCHMARK_F(cudf::benchmark, my_benchmark, int, double)->Range(128, 512);
* @endcode
*/
template <class Fixture>
class FunctionTemplateBenchmark : public Fixture {
public:
FunctionTemplateBenchmark(const char* name, ::benchmark::internal::Function* func)
: Fixture(), func_(func)
{
this->SetName(name);
}

virtual void Run(::benchmark::State& st)
{
this->SetUp(st);
this->BenchmarkCase(st);
this->TearDown(st);
}

private:
::benchmark::internal::Function* func_;

protected:
virtual void BenchmarkCase(::benchmark::State& st) { func_(st); }
};

#define TEMPLATED_BENCHMARK_F(BaseClass, n, ...) \
BENCHMARK_PRIVATE_DECLARE(n) = (::benchmark::internal::RegisterBenchmarkInternal( \
new cudf::FunctionTemplateBenchmark<BaseClass>(#BaseClass "/" #n "<" #__VA_ARGS__ ">", \
n<__VA_ARGS__>)))

} // namespace cudf
Loading

0 comments on commit 3937104

Please sign in to comment.