Fix performance bugs in scalar reductions #509

Merged
8 changes: 4 additions & 4 deletions src/cunumeric/binary/binary_red.cu
@@ -29,7 +29,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
- if (!func(in1[idx], in2[idx])) out <<= false;
+ if (!func(in1[idx], in2[idx])) out.reduce<false /*EXCLUSIVE*/>(false);
}

template <typename Function, typename RES, typename ReadAcc, typename Pitches, typename Rect>
@@ -39,7 +39,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) gen
const size_t idx = global_tid_1d();
if (idx >= volume) return;
auto point = pitches.unflatten(idx, rect.lo);
- if (!func(in1[point], in2[point])) out <<= false;
+ if (!func(in1[point], in2[point])) out.reduce<false /*EXCLUSIVE*/>(false);
}

template <typename Buffer, typename RedAcc>
@@ -64,8 +64,8 @@ struct BinaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
{
size_t volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
- DeferredReduction<ProdReduction<bool>> result;
- auto stream = get_cached_stream();
+ auto stream = get_cached_stream();
+ DeviceScalarReductionBuffer<ProdReduction<bool>> result(stream);
if (dense) {
auto in1ptr = in1.ptr(rect);
auto in2ptr = in2.ptr(rect);
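The kernels in this file no longer stream-write into a Legion::DeferredReduction; they fold into a DeviceScalarReductionBuffer living in GPU framebuffer memory (the class is added later in this pull request). The non-exclusive reduce<false>() call takes the reduction operator's atomic fold path, and for ProdReduction<bool> folding in false is exactly what forces the overall result to false. The rest of this hunk is truncated by the diff, so the following host-side sketch uses illustrative kernel and variable names rather than the exact cunumeric code:

// Sketch only: all_true_kernel, d_in1, d_in2 and n are illustrative names.
DeviceScalarReductionBuffer<ProdReduction<bool>> result(stream);  // initialized to the identity (true) on `stream`
all_true_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(n, func, result, d_in1, d_in2);
// Either fold the scalar into the task's reduction accessor with a small
// device-side copy kernel (the Buffer/RedAcc template whose body is collapsed
// above), or read it back on the host:
bool all_true = result.read(stream);  // async copy back, then stream sync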
257 changes: 22 additions & 235 deletions src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
#include "core/cuda/stream_pool.h"
#include "cunumeric/arg.h"
#include "cunumeric/arg.inl"
#include "cunumeric/device_scalar_reduction_buffer.h"
#include <cublas_v2.h>
#include <cusolverDn.h>
#include <cuda_runtime.h>
@@ -211,71 +212,35 @@ __device__ __forceinline__ T shuffle(unsigned mask, T var, int laneMask, int wid
return var;
}

- // Overload for complex
- // TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
- // version
- template <typename T, typename REDUCTION>
- __device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
- complex<T> value)
- {
- __shared__ complex<T> trampoline[THREADS_PER_BLOCK / 32];
- // Reduce across the warp
- const int laneid = threadIdx.x & 0x1f;
- const int warpid = threadIdx.x >> 5;
- for (int i = 16; i >= 1; i /= 2) {
- const complex<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
- REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
- }
- // Write warp values into shared memory
- if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
- __syncthreads();
- // Output reduction
- if (threadIdx.x == 0) {
- for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
- REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
- result <<= value;
- // Make sure the result is visible externally
- __threadfence_system();
- }
- }
+ template <typename T>
+ struct HasNativeShuffle {
+ static constexpr bool value = true;
+ };

- // Overload for argval
- // TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
- // version
- template <typename T, typename REDUCTION>
- __device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
- Argval<T> value)
- {
- __shared__ Argval<T> trampoline[THREADS_PER_BLOCK / 32];
- // Reduce across the warp
- const int laneid = threadIdx.x & 0x1f;
- const int warpid = threadIdx.x >> 5;
- for (int i = 16; i >= 1; i /= 2) {
- const Argval<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
- REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
- }
- // Write warp values into shared memory
- if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
- __syncthreads();
- // Output reduction
- if (threadIdx.x == 0) {
- for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
- REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
- result <<= value;
- // Make sure the result is visible externally
- __threadfence_system();
- }
- }
+ template <typename T>
+ struct HasNativeShuffle<complex<T>> {
+ static constexpr bool value = false;
+ };

+ template <typename T>
+ struct HasNativeShuffle<Argval<T>> {
+ static constexpr bool value = false;
+ };

template <typename T, typename REDUCTION>
- __device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result, T value)
+ __device__ __forceinline__ void reduce_output(DeviceScalarReductionBuffer<REDUCTION> result,
+ T value)
{
__shared__ T trampoline[THREADS_PER_BLOCK / 32];
// Reduce across the warp
const int laneid = threadIdx.x & 0x1f;
const int warpid = threadIdx.x >> 5;
for (int i = 16; i >= 1; i /= 2) {
- const T shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
+ T shuffle_value;
+ if constexpr (HasNativeShuffle<T>::value)
+ shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
+ else
+ shuffle_value = shuffle(0xffffffff, value, i, 32);
REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
}
// Write warp values into shared memory
@@ -285,190 +285,12 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReductio
if (threadIdx.x == 0) {
for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
- result <<= value;
+ result.reduce<false /*EXCLUSIVE*/>(value);
// Make sure the result is visible externally
__threadfence_system();
}
}
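The complex<T> and Argval<T> overloads deleted above existed only because those types cannot be passed through __shfl_xor_sync(); the HasNativeShuffle trait plus the if constexpr branch fold all three hand-copied versions into this single function. Any further type that cannot use the hardware shuffle would opt out the same way. A minimal sketch with a hypothetical type (MyPair is illustrative, not part of cunumeric):

// Hypothetical 16-byte value type, shown only to illustrate the trait.
struct MyPair {
  double first;
  double second;
};

template <>
struct HasNativeShuffle<MyPair> {
  static constexpr bool value = false;  // fall back to the byte-wise shuffle() helper above
};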

- __device__ __forceinline__ void reduce_bool(Legion::DeferredValue<bool> result, int value)
- {
- __shared__ int trampoline[THREADS_PER_BLOCK / 32];
- // Reduce across the warp
- const int laneid = threadIdx.x & 0x1f;
- const int warpid = threadIdx.x >> 5;
- for (int i = 16; i >= 1; i /= 2) {
- const int shuffle_value = __shfl_xor_sync(0xffffffff, value, i, 32);
- if (shuffle_value == 0) value = 0;
- }
- // Write warp values into shared memory
- if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
- __syncthreads();
- // Output reduction
- if (threadIdx.x == 0) {
- for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
- if (trampoline[i] == 0) {
- value = 0;
- break;
- }
- if (value == 0) {
- result = false;
- // Make sure the result is visible externally
- __threadfence_system();
- }
- }
- }

- template <typename T>
- __device__ __forceinline__ T load_cached(const T* ptr)
- {
- return *ptr;
- }

- // Specializations to use PTX cache qualifiers to keep
- // all the input data in as many caches as we can
- // Use .ca qualifier to cache at all levels
- template <>
- __device__ __forceinline__ uint16_t load_cached<uint16_t>(const uint16_t* ptr)
- {
- uint16_t value;
- asm volatile("ld.global.ca.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ uint32_t load_cached<uint32_t>(const uint32_t* ptr)
- {
- uint32_t value;
- asm volatile("ld.global.ca.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ uint64_t load_cached<uint64_t>(const uint64_t* ptr)
- {
- uint64_t value;
- asm volatile("ld.global.ca.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ int16_t load_cached<int16_t>(const int16_t* ptr)
- {
- int16_t value;
- asm volatile("ld.global.ca.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ int32_t load_cached<int32_t>(const int32_t* ptr)
- {
- int32_t value;
- asm volatile("ld.global.ca.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ int64_t load_cached<int64_t>(const int64_t* ptr)
- {
- int64_t value;
- asm volatile("ld.global.ca.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
- return value;
- }

- // No half because inline ptx is dumb about the type

- template <>
- __device__ __forceinline__ float load_cached<float>(const float* ptr)
- {
- float value;
- asm volatile("ld.global.ca.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ double load_cached<double>(const double* ptr)
- {
- double value;
- asm volatile("ld.global.ca.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <typename T>
- __device__ __forceinline__ T load_l2(const T* ptr)
- {
- return *ptr;
- }

- // Specializations to use PTX cache qualifiers to keep
- // data loaded into L2 but no higher in the hierarchy
- // Use .cg qualifier to cache at L2
- template <>
- __device__ __forceinline__ uint16_t load_l2<uint16_t>(const uint16_t* ptr)
- {
- uint16_t value;
- asm volatile("ld.global.cg.u16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ uint32_t load_l2<uint32_t>(const uint32_t* ptr)
- {
- uint32_t value;
- asm volatile("ld.global.cg.u32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ uint64_t load_l2<uint64_t>(const uint64_t* ptr)
- {
- uint64_t value;
- asm volatile("ld.global.cg.u64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ int16_t load_l2<int16_t>(const int16_t* ptr)
- {
- int16_t value;
- asm volatile("ld.global.cg.s16 %0, [%1];" : "=h"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ int32_t load_l2<int32_t>(const int32_t* ptr)
- {
- int32_t value;
- asm volatile("ld.global.cg.s32 %0, [%1];" : "=r"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ int64_t load_l2<int64_t>(const int64_t* ptr)
- {
- int64_t value;
- asm volatile("ld.global.cg.s64 %0, [%1];" : "=l"(value) : "l"(ptr) : "memory");
- return value;
- }

- // No half because inline ptx is dumb about the type

- template <>
- __device__ __forceinline__ float load_l2<float>(const float* ptr)
- {
- float value;
- asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(ptr) : "memory");
- return value;
- }

- template <>
- __device__ __forceinline__ double load_l2<double>(const double* ptr)
- {
- double value;
- asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(value) : "l"(ptr) : "memory");
- return value;
- }

template <typename T>
__device__ __forceinline__ T load_streaming(const T* ptr)
{
59 changes: 59 additions & 0 deletions src/cunumeric/device_scalar_reduction_buffer.h
@@ -0,0 +1,59 @@
/* Copyright 2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#pragma once

#include "core/cuda/cuda_help.h"
#include "core/data/buffer.h"

namespace cunumeric {

template <typename REDOP>
class DeviceScalarReductionBuffer {
private:
using VAL = typename REDOP::RHS;

public:
DeviceScalarReductionBuffer(cudaStream_t stream)
: buffer_(legate::create_buffer<VAL>(1, Legion::Memory::Kind::GPU_FB_MEM))
{
VAL identity{REDOP::identity};
ptr_ = buffer_.ptr(0);
CHECK_CUDA(cudaMemcpyAsync(ptr_, &identity, sizeof(VAL), cudaMemcpyHostToDevice, stream));
}

template <bool EXCLUSIVE>
__device__ void reduce(const VAL& value) const
{
REDOP::template fold<EXCLUSIVE /*exclusive*/>(*ptr_, value);
}

__host__ VAL read(cudaStream_t stream) const
{
VAL result{REDOP::identity};
CHECK_CUDA(cudaMemcpyAsync(&result, ptr_, sizeof(VAL), cudaMemcpyDeviceToHost, stream));
CHECK_CUDA(cudaStreamSynchronize(stream));
return result;
}

__device__ VAL read() const { return *ptr_; }

private:
legate::Buffer<VAL> buffer_;
VAL* ptr_;
};

} // namespace cunumeric
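Taken together, the intended lifecycle of this buffer is: construct it with the task's CUDA stream (which asynchronously writes the reduction identity into a single framebuffer slot), pass it by value to kernels whose threads call reduce<false>(), and finally read the scalar back on the same stream (or fold it into a Legion reduction accessor on the device). The following self-contained usage sketch uses illustrative names such as count_negative and assumes Legion's built-in SumReduction<int32_t> is visible through the included headers; it is not code from this pull request.

#include "cunumeric/device_scalar_reduction_buffer.h"
#include <cuda_runtime.h>

using cunumeric::DeviceScalarReductionBuffer;
using Reduction = Legion::SumReduction<int32_t>;

// Each thread folds a 1 into the shared scalar for every negative input element.
static __global__ void count_negative(size_t volume,
                                       const float* in,
                                       DeviceScalarReductionBuffer<Reduction> result)
{
  const size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (idx >= volume) return;
  // EXCLUSIVE = false: many threads update the same location, so the
  // reduction operator must take its atomic fold path.
  if (in[idx] < 0.0f) result.reduce<false /*EXCLUSIVE*/>(1);
}

int32_t negatives_on_stream(const float* d_in, size_t volume, cudaStream_t stream)
{
  DeviceScalarReductionBuffer<Reduction> result(stream);  // identity (0) written on `stream`
  const size_t blocks = (volume + 255) / 256;
  count_negative<<<blocks, 256, 0, stream>>>(volume, d_in, result);
  return result.read(stream);  // async copy back, then cudaStreamSynchronize
}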