diff --git a/src/Makefile b/src/Makefile
index 7e17bd3be..819d618a6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -45,7 +45,8 @@ LD_FLAGS ?=
 LD_FLAGS += -L$(OPENBLAS_PATH)/lib -l$(OPENBLAS_LIBNAME) -Wl,-rpath,$(OPENBLAS_PATH)/lib
 LD_FLAGS += -L$(TBLIS_PATH)/lib -ltblis -Wl,-rpath,$(TBLIS_PATH)/lib
 ifeq ($(strip $(USE_CUDA)),1)
-LD_FLAGS += -lcublas -lcusolver -lcufft
+DEVICE_LD_FLAGS += -lcufft_static
+LD_FLAGS += -lcublas -lcusolver -lcufft_static -lculibos
 LD_FLAGS += -L$(CUTENSOR_PATH)/lib -lcutensor -Wl,-rpath,$(CUTENSOR_PATH)/lib
 endif
 NVCC_FLAGS ?=
diff --git a/src/cunumeric.mk b/src/cunumeric.mk
index 89915743e..222e6811d 100644
--- a/src/cunumeric.mk
+++ b/src/cunumeric.mk
@@ -117,3 +117,5 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \
 	cunumeric/transform/flip.cu \
 	cunumeric/cudalibs.cu       \
 	cunumeric/cunumeric.cu
+
+GEN_DEVICE_SRC += cunumeric/convolution/convolve_callbacks.cu
diff --git a/src/cunumeric/convolution/convolve.cu b/src/cunumeric/convolution/convolve.cu
index 3898cd0cd..2a12622e8 100644
--- a/src/cunumeric/convolution/convolve.cu
+++ b/src/cunumeric/convolution/convolve.cu
@@ -17,8 +17,11 @@
 #include "cunumeric/divmod.h"
 #include "cunumeric/cuda_help.h"
 #include "cunumeric/convolution/convolve.h"
+#include "cunumeric/convolution/convolve_common.h"
 #include "cunumeric/convolution/convolve_template.inl"
 
+#include <cufftXt.h>
+
 namespace cunumeric {
 
 using namespace Legion;
 
@@ -1072,113 +1075,131 @@ __host__ void direct_convolution(AccessorWO<VAL, DIM> out,
 // FFT-based convolution implementation
 ///////////////////////////////////////
 
-template <int DIM>
-struct FFTPitches {
-  size_t pitches[DIM];
-  __host__ inline size_t& operator[](unsigned idx) { return pitches[idx]; }
-  __device__ __forceinline__ size_t operator[](unsigned idx) const { return pitches[idx]; }
-};
-
-template <int DIM>
-struct CopyPitches {
-  FastDivmodU64 pitches[DIM];
-  __host__ inline FastDivmodU64& operator[](unsigned idx) { return pitches[idx]; }
-  __device__ __forceinline__ const FastDivmodU64& operator[](unsigned idx) const
+template <typename T>
+class Shadow {
+ public:
+  template <typename... Fnargs>
+  bool update(Fnargs&&... args)
   {
-    return pitches[idx];
+    dirty_ = host_.update(std::forward<Fnargs>(args)...);
+    return dirty_;
   }
+  T& host() { return host_; }
+  T* device(cudaStream_t stream)
+  {
+    if (nullptr == device_) {
+      CHECK_CUDA(cudaMalloc(&device_, sizeof(T)));
+      dirty_ = true;
+    }
+    if (dirty_) {
+      CHECK_CUDA(cudaMemcpyAsync(device_, &host_, sizeof(T), cudaMemcpyHostToDevice, stream));
+      dirty_ = false;
+    }
+    assert(device_ != nullptr);
+    return device_;
+  }
+
+ private:
+  bool dirty_{true};
+  T host_{};
+  T* device_{nullptr};
 };
 
-template <typename VAL, int DIM>
-__global__ static void __launch_bounds__(THREADS_PER_BLOCK, 4)
-  copy_into_buffer(const AccessorRO<VAL, DIM> accessor,
-                   const DeferredBuffer<VAL, DIM> buffer,
-                   const Point<DIM> lo,
-                   const CopyPitches<DIM> copy_pitches,
-                   const size_t volume)
-{
-  size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
-  if (offset >= volume) return;
-  Point<DIM> point;
-  for (int d = 0; d < DIM; d++) point[d] = copy_pitches[d].divmod(offset, offset);
-  buffer[point] = accessor[lo + point];
-}
+template <void* (*F)()>
+class Cache {
+ public:
+  void* operator()()
+  {
+    if (nullptr == cache_) cache_ = F();
+    return cache_;
+  };
 
-template <typename VAL, int DIM>
-__global__ static void __launch_bounds__(THREADS_PER_BLOCK, 4)
-  copy_from_buffer(const VAL* buffer,
-                   const AccessorWO<VAL, DIM> accessor,
-                   const Point<DIM> buffer_lo,
-                   const Point<DIM> accessor_lo,
-                   const CopyPitches<DIM> copy_pitches,
-                   const FFTPitches<DIM> fft_pitches,
-                   const size_t volume,
-                   const VAL scaling)
-{
-  size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
-  if (offset >= volume) return;
-  Point<DIM> point;
-  size_t buffer_offset = 0;
-  for (int d = 0; d < DIM; d++) {
-    point[d] = copy_pitches[d].divmod(offset, offset);
-    buffer_offset += (buffer_lo[d] + point[d]) * fft_pitches[d];
-  }
-  accessor[accessor_lo + point] = scaling * buffer[buffer_offset];
-}
+ private:
+  void* cache_{nullptr};
+};
 
 template <typename VAL>
-__global__ static void __launch_bounds__(THREADS_PER_BLOCK, 4)
-  complex_multiply(complex<VAL>* inout, complex<VAL>* in, const size_t volume)
-{
-  size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
-  if (offset >= volume) return;
-  inout[offset] *= in[offset];
-}
-
-__host__ inline void cufft_execute_forward(cufftHandle plan, float* idata, float* odata)
-{
-  CHECK_CUFFT(cufftExecR2C(plan, (cufftReal*)idata, (cufftComplex*)odata));
-}
+struct ForwardPass;
 
-__host__ inline void cufft_execute_forward(cufftHandle plan, double* idata, double* odata)
-{
-  CHECK_CUFFT(cufftExecD2Z(plan, (cufftDoubleReal*)idata, (cufftDoubleComplex*)odata));
-}
-
-__host__ inline void cufft_execute_backward(cufftHandle plan, float* idata, float* odata)
-{
-  CHECK_CUFFT(cufftExecC2R(plan, (cufftComplex*)idata, (cufftReal*)odata));
-}
+template <>
+struct ForwardPass<float> {
+  static constexpr cufftType type = CUFFT_R2C;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_REAL : CUFFT_CB_ST_COMPLEX;
+  }
+  static __host__ inline void execute(cufftHandle plan, const float* idata, float* odata)
+  {
+    CHECK_CUFFT(cufftExecR2C(plan, (cufftReal*)idata, (cufftComplex*)odata));
+  }
+};
 
-__host__ inline void cufft_execute_backward(cufftHandle plan, double* idata, double* odata)
-{
-  CHECK_CUFFT(cufftExecZ2D(plan, (cufftDoubleComplex*)idata, (cufftDoubleReal*)odata));
-}
+template <>
+struct ForwardPass<double> {
+  static constexpr cufftType type = CUFFT_D2Z;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_REAL_DOUBLE : CUFFT_CB_ST_COMPLEX_DOUBLE;
+  }
+  static __host__ inline void execute(cufftHandle plan, const double* idata, double* odata)
+  {
+    CHECK_CUFFT(cufftExecD2Z(plan, (cufftDoubleReal*)idata, (cufftDoubleComplex*)odata));
+  }
+};
 
 template <typename VAL>
-struct ForwardPlanType;
+struct BackwardPass;
 
 template <>
-struct ForwardPlanType<float> {
-  static constexpr cufftType value = CUFFT_R2C;
+struct BackwardPass<float> {
+  static constexpr cufftType type = CUFFT_C2R;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_COMPLEX : CUFFT_CB_ST_REAL;
+  }
+  static __host__ inline void execute(cufftHandle plan, float* idata, float* odata)
+  {
+    CHECK_CUFFT(cufftExecC2R(plan, (cufftComplex*)idata, (cufftReal*)odata));
+  }
 };
 
 template <>
-struct ForwardPlanType<double> {
-  static constexpr cufftType value = CUFFT_D2Z;
+struct BackwardPass<double> {
+  static constexpr cufftType type = CUFFT_Z2D;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_COMPLEX_DOUBLE : CUFFT_CB_ST_REAL_DOUBLE;
+  }
+  static __host__ inline void execute(cufftHandle plan, double* idata, double* odata)
+  {
+    CHECK_CUFFT(cufftExecZ2D(plan, (cufftDoubleComplex*)idata, (cufftDoubleReal*)odata));
+  }
 };
 
+extern __host__ void* load_zero_pad_callback_float();
+extern __host__ void* load_zero_pad_callback_double();
+
+extern __host__ void* load_multiply_callback_float();
+extern __host__ void* load_multiply_callback_double();
+
+extern __host__ void* load_store_callback_float();
+extern __host__ void* load_store_callback_double();
+
 template <typename VAL>
-struct BackwardPlanType;
+struct Callbacks;
 
 template <>
-struct BackwardPlanType<float> {
-  static constexpr cufftType value = CUFFT_C2R;
+struct Callbacks<float> {
+  static __host__ inline void* zero_pad() { return load_zero_pad_callback_float(); }
+  static __host__ inline void* multiply() { return load_multiply_callback_float(); }
+  static __host__ inline void* store() { return load_store_callback_float(); }
 };
 
 template <>
-struct BackwardPlanType<double> {
-  static constexpr cufftType value = CUFFT_Z2D;
+struct Callbacks<double> {
+  static __host__ inline void* zero_pad() { return load_zero_pad_callback_double(); }
+  static __host__ inline void* multiply() { return load_multiply_callback_double(); }
+  static __host__ inline void* store() { return load_store_callback_double(); }
 };
 
 template <typename VAL, int DIM>
@@ -1255,11 +1276,27 @@ __host__ static inline void cufft_convolution(AccessorWO<VAL, DIM> out,
                       smem_size,
                       max_smem_size);
   } else {
+    // Cache for metadata allocations
+    struct Metadata {
+      Shadow<ZeroPadLoadData> filter_meta;
+      Shadow<ZeroPadLoadData> signal_meta;
+      Shadow<LoadComplexData> load_meta;
+      Shadow<StoreOutputData<VAL>> store_meta;
+      Cache<Callbacks<VAL>::zero_pad> zero_pad;
+      Cache<Callbacks<VAL>::multiply> multiply;
+      Cache<Callbacks<VAL>::store> store;
+    };
+    static Metadata metadata_cache[LEGION_MAX_NUM_PROCS];
+
     // Instead of doing the large tile case, we can instead do this
     // by transforming both the input and the filter to the frequency
    // domain using an FFT, perform the convolution with a point-wise
     // multiplication, and then transform the result back to the spatial domain
-    auto stream = get_cached_stream();
+
+    auto stream = get_cached_stream();
+    auto proc_id = Processor::get_executing_processor().id & (LEGION_MAX_NUM_PROCS - 1);
+    auto& cache = metadata_cache[proc_id];
+
     // First compute how big our temporary allocation needs to be
     // We'll need two of them to store the zero-padded data for the inputs
     const Point<DIM> zero = Point<DIM>::ZEROES();
@@ -1273,62 +1310,34 @@ __host__ static inline void cufft_convolution(AccessorWO<VAL, DIM> out,
     const Point<DIM> signal_bounds = input_bounds.hi - input_bounds.lo + one;
     const Point<DIM> filter_bounds = filter_rect.hi - filter_rect.lo + one;
     Point<DIM> fftsize = signal_bounds + filter_bounds;
-    for (int d = 0; d < DIM; d++) {
-      // Technically we can shrink this by one and still be sound but we'll
-      // only do that if it will make the number even
-      if ((fftsize[d] % 2) == 1) fftsize[d]--;
-    }
+    // Technically we can shrink this by one and still be sound but we'll
+    // only do that if it will make the number even
+    for (int d = 0; d < DIM; d++) fftsize[d] -= fftsize[d] % 2;
     // Cufft needs the last dimension to have fftsize/2+1 complex elements for
-    // the temporary buffer
-    // Since we know fftsize is even, we just need to add two to it for the output
+    // the temporary buffer, we know that the last dimension is already even
+    // so we just need to add two elements to the last dim of the fftsize
     Point<DIM> buffersize = fftsize;
     buffersize[DIM - 1] += 2;
     size_t buffervolume = 1;
     for (int d = 0; d < DIM; d++) buffervolume *= buffersize[d];
-    // Zero pad and copy in the input data
-    DeferredBuffer<VAL, DIM> signal_buffer(Rect<DIM>(zero, buffersize - one),
-                                           Memory::GPU_FB_MEM,
-                                           nullptr /*initial*/,
-                                           128 /*alignment*/);
-    VAL* signal_ptr = signal_buffer.ptr(zero);
-    CHECK_CUDA(cudaMemsetAsync(signal_ptr, 0, buffervolume * sizeof(VAL), stream));
-    // Check to see if the input pointer is dense and we can do this with a CUDA memcpy
-    size_t strides[DIM];
-    const VAL* input_ptr = in.ptr(input_bounds, strides);
-    size_t pitch = 1;
-    CopyPitches<DIM> copy_pitches;
-    for (int d = DIM - 1; d >= 0; d--) {
-      copy_pitches[d] = FastDivmodU64(pitch);
-      pitch *= signal_bounds[d];
-    }
-    size_t blocks = (pitch + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    copy_into_buffer<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-      in, signal_buffer, input_bounds.lo, copy_pitches, pitch);
-    // Zero pad and copy in the filter data
-    DeferredBuffer<VAL, DIM> filter_buffer(Rect<DIM>(zero, buffersize - one),
-                                           Memory::GPU_FB_MEM,
-                                           nullptr /*initial*/,
-                                           128 /*alignment*/);
-    VAL* filter_ptr = filter_buffer.ptr(zero);
-    CHECK_CUDA(cudaMemsetAsync(filter_ptr, 0, buffervolume * sizeof(VAL), stream));
-    const VAL* filt_ptr = filter.ptr(filter_rect, strides);
-    pitch = 1;
-    for (int d = DIM - 1; d >= 0; d--) {
-      copy_pitches[d] = FastDivmodU64(pitch);
-      pitch *= filter_bounds[d];
-    }
-    blocks = (pitch + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    copy_into_buffer<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-      filter, filter_buffer, filter_rect.lo, copy_pitches, pitch);
+    // In theory we could do this with a single output buffer by doing
+    // += operations in the second forward FFT kernel into the buffer,
+    // but unfortunately cufft likes to use the output buffer during its
+    // execution and that destroys the data from the first FFT
+    DeferredBuffer<VAL, 1> buffer(Rect<1>(Point<1>(0), Point<1>(2 * buffervolume - 1)),
+                                  Memory::GPU_FB_MEM,
+                                  nullptr /*initial*/,
+                                  128 /*alignment*/);
+    VAL* buffer_ptr = buffer.ptr(Point<1>(0));
 
-    auto* forward_plan  = get_cufft_plan(ForwardPlanType<VAL>::value, fftsize);
-    auto* backward_plan = get_cufft_plan(BackwardPlanType<VAL>::value, fftsize);
+    auto forward_plan  = get_cufft_plan(ForwardPass<VAL>::type, fftsize);
+    auto backward_plan = get_cufft_plan(BackwardPass<VAL>::type, fftsize);
 
     // Set the stream and working area for the plans
-    CHECK_CUFFT(cufftSetStream(forward_plan->handle, stream));
-    CHECK_CUFFT(cufftSetStream(backward_plan->handle, stream));
+    CHECK_CUFFT(cufftSetStream(forward_plan.handle(), stream));
+    CHECK_CUFFT(cufftSetStream(backward_plan.handle(), stream));
 
-    auto workarea_size = std::max(forward_plan->workarea_size, backward_plan->workarea_size);
+    auto workarea_size = std::max(forward_plan.workarea_size(), backward_plan.workarea_size());
 
     // Create the plan and allocate a temporary buffer for it if it needs one
     DeferredBuffer<char, 1> workarea_buffer;
@@ -1339,59 +1348,67 @@ __host__ static inline void cufft_convolution(AccessorWO<VAL, DIM> out,
                                                 nullptr /*initial*/,
                                                 128 /*alignment*/);
       void* workarea = workarea_buffer.ptr(zero1d);
-      CHECK_CUFFT(cufftSetWorkArea(forward_plan->handle, workarea));
-      CHECK_CUFFT(cufftSetWorkArea(backward_plan->handle, workarea));
+      CHECK_CUFFT(cufftSetWorkArea(forward_plan.handle(), workarea));
+      CHECK_CUFFT(cufftSetWorkArea(backward_plan.handle(), workarea));
     }
-    // FFT the input data
-    cufft_execute_forward(forward_plan->handle, signal_ptr, signal_ptr);
+
     // FFT the filter data
-    cufft_execute_forward(forward_plan->handle, filter_ptr, filter_ptr);
-    // Perform the pointwise multiplcation
-    {
-      size_t volume = (buffervolume / 2);
-      blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-      complex_multiply<VAL><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-        (complex<VAL>*)signal_ptr, (complex<VAL>*)filter_ptr, volume);
-    }
-    // Inverse FFT for the ouptut
-    // Allow this out-of-place for better performance
-    cufft_execute_backward(backward_plan->handle, signal_ptr, filter_ptr);
-    // Copy the result data out of the temporary buffer and scale
-    // because CUFFT inverse does not perform the scale for us
-    pitch = 1;
-    FFTPitches<DIM> fft_pitches;
-    for (int d = DIM - 1; d >= 0; d--) {
-      fft_pitches[d] = pitch;
-      pitch *= fftsize[d];
-    }
-    const VAL scaling_factor = VAL(1) / pitch;
-    Point<DIM> buffer_offset;
-    for (int d = 0; d < DIM; d++)
-      buffer_offset[d] =
-        centers[d] - (((extents[d] % 2) == 0) ? 1 : 0) +
+    size_t strides[DIM];
+    const auto* filter_ptr = filter.ptr(filter_rect, strides);
+    cache.filter_meta.update(fftsize, strides, filter_bounds);
+
+    auto* d_filter_meta = cache.filter_meta.device(stream);
+    forward_plan.set_callback(
+      ForwardPass<VAL>::callback_type(true), cache.zero_pad(), d_filter_meta);
+    ForwardPass<VAL>::execute(forward_plan.handle(), filter_ptr, buffer_ptr);
+
+    // FFT the input data
+    const auto* signal_ptr = in.ptr(input_bounds, strides);
+    cache.signal_meta.update(fftsize, strides, signal_bounds);
+
+    auto* d_signal_meta = cache.signal_meta.device(stream);
+    forward_plan.set_callback(
+      ForwardPass<VAL>::callback_type(true), cache.zero_pad(), d_signal_meta);
+    ForwardPass<VAL>::execute(forward_plan.handle(), signal_ptr, buffer_ptr + buffervolume);
+
+    // Inverse FFT for the output in-place in the temporary buffer
+    auto* output_ptr = out.ptr(subrect, strides);
+
+    Point<DIM> offsets;
+    for (int32_t d = 0; d < DIM; d++)
+      offsets[d] =
+        centers[d] - (1 - (extents[d] % 2)) +
         ((offset_bounds.lo[d] < root_rect.lo[d]) ? (subrect.lo[d] - root_rect.lo[d]) : centers[d]);
-    Point<DIM> output_bounds = subrect.hi - subrect.lo + one;
-    pitch = 1;
-    for (int d = DIM - 1; d >= 0; d--) {
-      copy_pitches[d] = FastDivmodU64(pitch);
-      pitch *= output_bounds[d];
-    }
-    blocks = (pitch + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    copy_from_buffer<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-      filter_ptr, out, buffer_offset, subrect.lo, copy_pitches, fft_pitches, pitch, scaling_factor);
+    auto output_bounds = subrect.hi - subrect.lo + one;
+
+    cache.load_meta.update(buffervolume / 2);
+    cache.store_meta.update(fftsize, strides, offsets, output_bounds);
+
+    auto* d_load_meta  = cache.load_meta.device(stream);
+    auto* d_store_meta = cache.store_meta.device(stream);
+    backward_plan.set_callback(
+      BackwardPass<VAL>::callback_type(true), cache.multiply(), d_load_meta);
+    backward_plan.set_callback(
+      BackwardPass<VAL>::callback_type(false), cache.store(), d_store_meta);
+    BackwardPass<VAL>::execute(backward_plan.handle(), buffer_ptr, output_ptr);
 
-#if 0 // This is useful debugging code for finding the output
-    VAL *buffer = (VAL*)malloc(buffervolume*sizeof(VAL));
-    CHECK_CUDA( cudaMemcpyAsync(buffer, filter_ptr, buffervolume*sizeof(VAL), cudaMemcpyDeviceToHost, stream) );
-    CHECK_CUDA( cudaStreamSynchronize(stream) );
-    for (unsigned idx = 0; idx < buffervolume; idx++) {
-      if ((idx % fftsize[DIM-1]) == 0)
-        printf("\n");
-      printf("%.8g ", buffer[idx]*scaling_factor);
+#if 0
+    {
+      std::vector<VAL> vec_debug_buffer(buffervolume);
+      auto* debug_buffer = vec_debug_buffer.data();
+      CHECK_CUDA(cudaMemcpyAsync(debug_buffer,
                                  buffer_ptr + buffervolume,
                                  buffervolume * sizeof(VAL),
                                  cudaMemcpyDeviceToHost,
                                  stream));
+      CHECK_CUDA(cudaStreamSynchronize(stream));
+      for (size_t idx = 0; idx < buffervolume; idx++) {
+        if ((idx % fftsize[DIM - 1]) == 0) printf("\n");
+        printf("%.8g ", debug_buffer[idx] * cache.store_meta.host().scale_factor);
+      }
+      printf("\n");
     }
-    printf("\n");
-    free(buffer);
 #endif
   }
 }
diff --git a/src/cunumeric/convolution/convolve_callbacks.cu b/src/cunumeric/convolution/convolve_callbacks.cu
new file mode 100644
index 000000000..3fbec1a45
--- /dev/null
+++ b/src/cunumeric/convolution/convolve_callbacks.cu
@@ -0,0 +1,94 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/convolution/convolve_common.h"
+
+#include "cunumeric/cuda_help.h"
+#include <cufftXt.h>
+
+using namespace Legion;
+
+namespace cunumeric {
+
+template <typename T>
+static __device__ T cb_zero_pad(void* data, size_t offset, void* callerinfo, void* sharedptr)
+{
+  auto* info    = static_cast<ZeroPadLoadData*>(callerinfo);
+  size_t actual = 0;
+#pragma unroll 3
+  for (int32_t d = 0; d < info->dim; d++) {
+    coord_t coord = info->pitches[d].divmod(offset, offset);
+    if (coord >= info->bounds[d]) return 0.f;
+    actual += coord * info->strides[d];
+  }
+  auto* ptr = static_cast<T*>(data);
+  return ptr[actual];
+}
+
+template <typename Complex, Complex (*CTOR)(decltype(Complex::x), decltype(Complex::x))>
+static __device__ Complex cb_multiply(void* data, size_t offset, void* callerinfo, void* sharedptr)
+{
+  auto* info = static_cast<LoadComplexData*>(callerinfo);
+  auto* ptr  = static_cast<Complex*>(data);
+  auto lhs   = ptr[offset];
+  auto rhs   = ptr[offset + info->buffervolume];
+  return CTOR(lhs.x * rhs.x - lhs.y * rhs.y, lhs.x * rhs.y + lhs.y * rhs.x);
+}
+
+template <typename T>
+static __device__ void cb_store(
+  void* data, size_t offset, T value, void* callerinfo, void* sharedptr)
+{
+  auto* info    = static_cast<StoreOutputData<T>*>(callerinfo);
+  size_t actual = 0;
+#pragma unroll 3
+  for (int32_t d = 0; d < info->dim; d++) {
+    coord_t coord = info->pitches[d].divmod(offset, offset);
+    coord -= info->offsets[d];
+    if (coord < 0) return;
+    if (coord >= info->bounds[d]) return;
+    actual += coord * info->strides[d];
+  }
+  auto* ptr   = static_cast<T*>(data);
+  ptr[actual] = value * info->scale_factor;
+}
+
+static __device__ cufftCallbackLoadR d_zero_pad_float  = cb_zero_pad<float>;
+static __device__ cufftCallbackLoadD d_zero_pad_double = cb_zero_pad<double>;
+static __device__ cufftCallbackLoadC d_multiply_float =
+  cb_multiply<cufftComplex, make_cuComplex>;
+static __device__ cufftCallbackLoadZ d_multiply_double =
+  cb_multiply<cufftDoubleComplex, make_cuDoubleComplex>;
+static __device__ cufftCallbackStoreR d_store_float  = cb_store<float>;
+static __device__ cufftCallbackStoreD d_store_double = cb_store<double>;
+
+#define LOAD_SYMBOL(FUN, DEV_FUN)                                 \
+  __host__ void* FUN()                                            \
+  {                                                               \
+    void* ptr = nullptr;                                          \
+    CHECK_CUDA(cudaMemcpyFromSymbol(&ptr, DEV_FUN, sizeof(ptr))); \
+    assert(ptr != nullptr);                                       \
+    return ptr;                                                   \
+  }
+
+LOAD_SYMBOL(load_zero_pad_callback_float, d_zero_pad_float)
+LOAD_SYMBOL(load_zero_pad_callback_double, d_zero_pad_double)
+LOAD_SYMBOL(load_multiply_callback_float, d_multiply_float)
+LOAD_SYMBOL(load_multiply_callback_double, d_multiply_double)
+LOAD_SYMBOL(load_store_callback_float, d_store_float)
+LOAD_SYMBOL(load_store_callback_double, d_store_double)
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/convolution/convolve_common.h b/src/cunumeric/convolution/convolve_common.h
new file mode 100644
index 000000000..927e8176b
--- /dev/null
+++ b/src/cunumeric/convolution/convolve_common.h
@@ -0,0 +1,92 @@
+/* Copyright 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/divmod.h"
+#include "legion.h"
+
+namespace cunumeric {
+
+struct LoadComplexData {
+  size_t buffervolume{0};
+
+  __host__ inline bool update(size_t new_volume)
+  {
+    auto changed = buffervolume != new_volume;
+    buffervolume = new_volume;
+    return changed;
+  }
+};
+
+struct ZeroPadLoadData {
+  FastDivmodU64 pitches[3];
+  size_t strides[3];
+  size_t bounds[3];
+  int32_t dim{0};
+
+  template <int32_t DIM>
+  __host__ inline bool update(const Legion::Point<DIM>& fftsize,
+                              const size_t* new_strides,
+                              const Legion::Point<DIM>& new_bounds)
+  {
+    auto changed = dim != DIM;
+    changed      = true;
+    dim          = DIM;
+    size_t pitch = 1;
+    for (int32_t d = DIM - 1; d >= 0; --d) {
+      if (changed || (pitches[d].divisor != pitch)) {
+        pitches[d] = FastDivmodU64(pitch);
+        changed    = true;
+      }
+      pitch *= fftsize[d];
+      if (changed || (strides[d] != new_strides[d])) {
+        strides[d] = new_strides[d];
+        changed    = true;
+      }
+      if (changed || (bounds[d] != new_bounds[d])) {
+        bounds[d] = new_bounds[d];
+        changed   = true;
+      }
+    }
+    return changed;
+  }
+};
+
+template <typename T>
+struct StoreOutputData : public ZeroPadLoadData {
+  size_t offsets[3];
+  T scale_factor;
+
+  using ZeroPadLoadData::update;
+
+  template <int32_t DIM>
+  __host__ inline bool update(const Legion::Point<DIM>& fftsize,
+                              const size_t* new_strides,
+                              const Legion::Point<DIM>& new_offsets,
+                              const Legion::Point<DIM>& new_bounds)
+  {
+    auto changed = update(fftsize, new_strides, new_bounds);
+    for (int32_t d = DIM - 1; d >= 0; --d) {
+      if (changed || offsets[d] != new_offsets[d]) {
+        offsets[d] = new_offsets[d];
+        changed    = true;
+      }
+    }
+    if (changed) scale_factor = 1.0 / pitches[0].divisor;
+    return changed;
+  }
+};
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h
index a54e1ee2e..385c2e0ac 100644
--- a/src/cunumeric/cuda_help.h
+++ b/src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
 #include <cublas_v2.h>
 #include <cusolverDn.h>
 #include <cufft.h>
+#include <cufftXt.h>
 #include <cutensor.h>
 
 #define THREADS_PER_BLOCK 128
@@ -70,7 +71,30 @@ namespace cunumeric {
 
 struct cufftPlan {
   cufftHandle handle;
-  size_t workarea_size;
+  size_t workarea;
+};
+
+class cufftContext {
+ public:
+  cufftContext(cufftPlan* plan);
+  ~cufftContext();
+
+ public:
+  cufftContext(const cufftContext&) = delete;
+  cufftContext& operator=(const cufftContext&) = delete;
+
+ public:
+  cufftContext(cufftContext&&) = default;
+  cufftContext& operator=(cufftContext&&) = default;
+
+ public:
+  cufftHandle handle();
+  size_t workarea_size();
+  void set_callback(cufftXtCallbackType type, void* callback, void* data);
+
+ private:
+  cufftPlan* plan_{nullptr};
+  std::set<cufftXtCallbackType> callback_types_{};
 };
 
 // Defined in cudalibs.cu
@@ -80,7 +104,7 @@ cudaStream_t get_cached_stream();
 cublasHandle_t get_cublas();
 cusolverDnHandle_t get_cusolver();
 cutensorHandle_t* get_cutensor();
-cufftPlan* get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
+cufftContext get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
 
 __host__ inline void check_cuda(cudaError_t error, const char* file, int line)
 {
diff --git a/src/cunumeric/cudalibs.cu b/src/cunumeric/cudalibs.cu
index 5b1d7bc32..3e79647fc 100644
--- a/src/cunumeric/cudalibs.cu
+++ b/src/cunumeric/cudalibs.cu
@@ -27,6 +27,29 @@ using namespace Legion;
 
 static Logger log_cudalibs("cunumeric.cudalibs");
 
+cufftContext::cufftContext(cufftPlan* plan) : plan_(plan) {}
+
+cufftContext::~cufftContext()
+{
+  auto hdl = handle();
+  for (auto type : callback_types_) CHECK_CUFFT(cufftXtClearCallback(hdl, type));
+}
+
+cufftHandle cufftContext::handle() { return plan_->handle; }
+
+size_t cufftContext::workarea_size() { return plan_->workarea; }
+
+void cufftContext::set_callback(cufftXtCallbackType type, void* callback, void* data)
+{
+  auto hdl = handle();
+  if (callback_types_.find(type) != callback_types_.end())
+    CHECK_CUFFT(cufftXtClearCallback(hdl, type));
+  void* callbacks[1] = {callback};
+  void* datas[1]     = {data};
+  CHECK_CUFFT(cufftXtSetCallback(hdl, callbacks, type, datas));
+  callback_types_.insert(type);
+}
+
 struct cufftPlanCache {
  private:
   // Maximum number of plans to keep per dimension
@@ -119,7 +142,7 @@ cufftPlan* cufftPlanCache::get_cufft_plan(const DomainPoint& size)
                               1,
                               type_,
                               1 /*batch*/,
-                              &result->workarea_size));
+                              &result->workarea));
   }
   // Otherwise, we return the cached plan and adjust the LRU count
   else {
@@ -206,7 +229,7 @@ cutensorHandle_t* CUDALibraries::get_cutensor()
   return cutensor_;
 }
 
-cufftPlan* CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& size)
+cufftContext CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& size)
 {
   auto finder = plan_caches_.find(type);
   cufftPlanCache* cache{nullptr};
@@ -216,7 +239,7 @@ cufftPlan* CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& size
     plan_caches_[type] = cache;
   } else
    cache = finder->second;
-  return cache->get_cufft_plan(size);
+  return cufftContext(cache->get_cufft_plan(size));
 }
 
 static CUDALibraries& get_cuda_libraries(Processor proc)
@@ -265,7 +288,7 @@ cutensorHandle_t* get_cutensor()
   return lib.get_cutensor();
 }
 
-cufftPlan* get_cufft_plan(cufftType type, const Legion::DomainPoint& size)
+cufftContext get_cufft_plan(cufftType type, const Legion::DomainPoint& size)
 {
   const auto proc = Processor::get_executing_processor();
   auto& lib       = get_cuda_libraries(proc);
diff --git a/src/cunumeric/cudalibs.h b/src/cunumeric/cudalibs.h
index 5d7514b73..3edd9e8f8 100644
--- a/src/cunumeric/cudalibs.h
+++ b/src/cunumeric/cudalibs.h
@@ -38,7 +38,7 @@ struct CUDALibraries {
   cublasHandle_t get_cublas();
   cusolverDnHandle_t get_cusolver();
   cutensorHandle_t* get_cutensor();
-  cufftPlan* get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
+  cufftContext get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
 
  private:
  void finalize_cublas();
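
For readers unfamiliar with cuFFT callbacks, the following is a minimal, self-contained sketch (not part of the patch) of the registration pattern the patch relies on: a `__device__` function pointer is copied to the host with `cudaMemcpyFromSymbol` (what the `LOAD_SYMBOL` helpers do) and attached to a plan with `cufftXtSetCallback` (what `cufftContext::set_callback` wraps). The example assumes a 1-D R2C transform; the callback name `scale_on_load`, the symbol `d_scale_cb`, and the scale value are illustrative only. Callbacks require the static cuFFT library and relocatable device code, which is why the Makefile switches to `-lcufft_static -lculibos` and adds `GEN_DEVICE_SRC`/`DEVICE_LD_FLAGS`.

```cpp
// Build sketch: nvcc -std=c++17 -rdc=true callback_demo.cu -lcufft_static -lculibos -o callback_demo
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <cstdio>
#include <cstdlib>

#define CHECK(expr)                                                        \
  do {                                                                     \
    auto _e = (expr);                                                      \
    if ((int)_e != 0) {                                                    \
      fprintf(stderr, "error %d at %s:%d\n", (int)_e, __FILE__, __LINE__); \
      exit(1);                                                             \
    }                                                                      \
  } while (0)

// Load callback: cuFFT calls this for every real input element it reads,
// letting us transform data on the fly (the patch uses this for zero-padding).
static __device__ cufftReal scale_on_load(void* data, size_t offset, void* info, void* shared)
{
  const float scale = *static_cast<const float*>(info);
  return static_cast<cufftReal*>(data)[offset] * scale;
}
// Device-side function pointer: the host cannot take &scale_on_load directly.
static __device__ cufftCallbackLoadR d_scale_cb = scale_on_load;

int main()
{
  constexpr int N = 256;
  float* signal;
  cufftComplex* spectrum;
  float* d_scale;
  CHECK(cudaMalloc(&signal, N * sizeof(float)));
  CHECK(cudaMalloc(&spectrum, (N / 2 + 1) * sizeof(cufftComplex)));
  CHECK(cudaMalloc(&d_scale, sizeof(float)));
  float h_scale = 0.5f;
  CHECK(cudaMemcpy(d_scale, &h_scale, sizeof(float), cudaMemcpyHostToDevice));
  CHECK(cudaMemset(signal, 0, N * sizeof(float)));

  cufftHandle plan;
  CHECK(cufftPlan1d(&plan, N, CUFFT_R2C, 1 /*batch*/));

  // Fetch the device function pointer to the host, then register it on the
  // plan together with its caller-info blob (here just the scale value).
  void* h_cb = nullptr;
  CHECK(cudaMemcpyFromSymbol(&h_cb, d_scale_cb, sizeof(h_cb)));
  void* caller_info[1] = {d_scale};
  CHECK(cufftXtSetCallback(plan, &h_cb, CUFFT_CB_LD_REAL, caller_info));

  // The callback now runs inside the transform; no separate pre-processing kernel.
  CHECK(cufftExecR2C(plan, signal, spectrum));
  CHECK(cudaDeviceSynchronize());

  CHECK(cufftXtClearCallback(plan, CUFFT_CB_LD_REAL));
  CHECK(cufftDestroy(plan));
  cudaFree(signal);
  cudaFree(spectrum);
  cudaFree(d_scale);
  return 0;
}
```

The same design shows up in the patch: the load callbacks replace the old `copy_into_buffer` zero-padding kernels, the multiply callback replaces `complex_multiply`, and the store callback replaces `copy_from_buffer` plus the explicit inverse-FFT rescale.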