diff --git a/src/Makefile b/src/Makefile
index 7e17bd3be..819d618a6 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -45,7 +45,8 @@ LD_FLAGS ?=
 LD_FLAGS += -L$(OPENBLAS_PATH)/lib -l$(OPENBLAS_LIBNAME) -Wl,-rpath,$(OPENBLAS_PATH)/lib
 LD_FLAGS += -L$(TBLIS_PATH)/lib -ltblis -Wl,-rpath,$(TBLIS_PATH)/lib
 ifeq ($(strip $(USE_CUDA)),1)
-LD_FLAGS += -lcublas -lcusolver -lcufft
+DEVICE_LD_FLAGS += -lcufft_static
+LD_FLAGS += -lcublas -lcusolver -lcufft_static -lculibos
 LD_FLAGS += -L$(CUTENSOR_PATH)/lib -lcutensor -Wl,-rpath,$(CUTENSOR_PATH)/lib
 endif
 NVCC_FLAGS ?=
diff --git a/src/cunumeric.mk b/src/cunumeric.mk
index 89915743e..222e6811d 100644
--- a/src/cunumeric.mk
+++ b/src/cunumeric.mk
@@ -117,3 +117,5 @@ GEN_GPU_SRC += cunumeric/ternary/where.cu \
 	cunumeric/transform/flip.cu \
 	cunumeric/cudalibs.cu       \
 	cunumeric/cunumeric.cu
+
+GEN_DEVICE_SRC += cunumeric/convolution/convolve_callbacks.cu
diff --git a/src/cunumeric/convolution/convolve.cu b/src/cunumeric/convolution/convolve.cu
index 3898cd0cd..2a12622e8 100644
--- a/src/cunumeric/convolution/convolve.cu
+++ b/src/cunumeric/convolution/convolve.cu
@@ -17,8 +17,11 @@
 #include "cunumeric/divmod.h"
 #include "cunumeric/cuda_help.h"
 #include "cunumeric/convolution/convolve.h"
+#include "cunumeric/convolution/convolve_common.h"
 #include "cunumeric/convolution/convolve_template.inl"
 
+#include <cufftXt.h>
+
 namespace cunumeric {
 
 using namespace Legion;
 
@@ -1072,113 +1075,131 @@ __host__ void direct_convolution(AccessorWO<VAL, DIM> out,
 // FFT-based convolution implementation
 ///////////////////////////////////////
 
-template <int DIM>
-struct FFTPitches {
-  size_t pitches[DIM];
-  __host__ inline size_t& operator[](unsigned idx) { return pitches[idx]; }
-  __device__ __forceinline__ size_t operator[](unsigned idx) const { return pitches[idx]; }
-};
-
-template <int DIM>
-struct CopyPitches {
-  FastDivmodU64 pitches[DIM];
-  __host__ inline FastDivmodU64& operator[](unsigned idx) { return pitches[idx]; }
-  __device__ __forceinline__ const FastDivmodU64& operator[](unsigned idx) const
+template <typename T>
+class Shadow {
+ public:
+  template <typename... Fnargs>
+  bool update(Fnargs&&... args)
   {
-    return pitches[idx];
+    dirty_ = host_.update(std::forward<Fnargs>(args)...);
+    return dirty_;
   }
+  T& host() { return host_; }
+  T* device(cudaStream_t stream)
+  {
+    if (nullptr == device_) {
+      CHECK_CUDA(cudaMalloc(&device_, sizeof(T)));
+      dirty_ = true;
+    }
+    if (dirty_) {
+      CHECK_CUDA(cudaMemcpyAsync(device_, &host_, sizeof(T), cudaMemcpyHostToDevice, stream));
+      dirty_ = false;
+    }
+    assert(device_ != nullptr);
+    return device_;
+  }
+
+ private:
+  bool dirty_{true};
+  T host_{};
+  T* device_{nullptr};
 };
 
-template <typename VAL, int DIM>
-__global__ static void __launch_bounds__(THREADS_PER_BLOCK, 4)
-  copy_into_buffer(const AccessorRO<VAL, DIM> accessor,
-                   const DeferredBuffer<VAL, DIM> buffer,
-                   const Point<DIM> lo,
-                   const CopyPitches<DIM> copy_pitches,
-                   const size_t volume)
-{
-  size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
-  if (offset >= volume) return;
-  Point<DIM> point;
-  for (int d = 0; d < DIM; d++) point[d] = copy_pitches[d].divmod(offset, offset);
-  buffer[point] = accessor[lo + point];
-}
+template <void* (*F)()>
+class Cache {
+ public:
+  void* operator()()
+  {
+    if (nullptr == cache_) cache_ = F();
+    return cache_;
+  };
 
-template <typename VAL, int DIM>
-__global__ static void __launch_bounds__(THREADS_PER_BLOCK, 4)
-  copy_from_buffer(const VAL* buffer,
-                   const AccessorWO<VAL, DIM> accessor,
-                   const Point<DIM> buffer_lo,
-                   const Point<DIM> accessor_lo,
-                   const CopyPitches<DIM> copy_pitches,
-                   const FFTPitches<DIM> fft_pitches,
-                   const size_t volume,
-                   const VAL scaling)
-{
-  size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
-  if (offset >= volume) return;
-  Point<DIM> point;
-  size_t buffer_offset = 0;
-  for (int d = 0; d < DIM; d++) {
-    point[d] = copy_pitches[d].divmod(offset, offset);
-    buffer_offset += (buffer_lo[d] + point[d]) * fft_pitches[d];
-  }
-  accessor[accessor_lo + point] = scaling * buffer[buffer_offset];
-}
+ private:
+  void* cache_{nullptr};
+};
 
 template <typename VAL>
-__global__ static void __launch_bounds__(THREADS_PER_BLOCK, 4)
-  complex_multiply(complex<VAL>* inout, complex<VAL>* in, const size_t volume)
-{
-  size_t offset = blockIdx.x * blockDim.x + threadIdx.x;
-  if (offset >= volume) return;
-  inout[offset] *= in[offset];
-}
-
-__host__ inline void cufft_execute_forward(cufftHandle plan, float* idata, float* odata)
-{
-  CHECK_CUFFT(cufftExecR2C(plan, (cufftReal*)idata, (cufftComplex*)odata));
-}
+struct ForwardPass;
 
-__host__ inline void cufft_execute_forward(cufftHandle plan, double* idata, double* odata)
-{
-  CHECK_CUFFT(cufftExecD2Z(plan, (cufftDoubleReal*)idata, (cufftDoubleComplex*)odata));
-}
-
-__host__ inline void cufft_execute_backward(cufftHandle plan, float* idata, float* odata)
-{
-  CHECK_CUFFT(cufftExecC2R(plan, (cufftComplex*)idata, (cufftReal*)odata));
-}
+template <>
+struct ForwardPass<float> {
+  static constexpr cufftType type = CUFFT_R2C;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_REAL : CUFFT_CB_ST_COMPLEX;
+  }
+  static __host__ inline void execute(cufftHandle plan, const float* idata, float* odata)
+  {
+    CHECK_CUFFT(cufftExecR2C(plan, (cufftReal*)idata, (cufftComplex*)odata));
+  }
+};
 
-__host__ inline void cufft_execute_backward(cufftHandle plan, double* idata, double* odata)
-{
-  CHECK_CUFFT(cufftExecZ2D(plan, (cufftDoubleComplex*)idata, (cufftDoubleReal*)odata));
-}
+template <>
+struct ForwardPass<double> {
+  static constexpr cufftType type = CUFFT_D2Z;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_REAL_DOUBLE : CUFFT_CB_ST_COMPLEX_DOUBLE;
+  }
+  static __host__ inline void execute(cufftHandle plan, const double* idata, double* odata)
+  {
+    CHECK_CUFFT(cufftExecD2Z(plan, (cufftDoubleReal*)idata, (cufftDoubleComplex*)odata));
+  }
+};
 
 template <typename VAL>
-struct ForwardPlanType;
+struct BackwardPass;
 
 template <>
-struct ForwardPlanType<float> {
-  static constexpr cufftType value = CUFFT_R2C;
+struct BackwardPass<float> {
+  static constexpr cufftType type = CUFFT_C2R;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_COMPLEX : CUFFT_CB_ST_REAL;
+  }
+  static __host__ inline void execute(cufftHandle plan, float* idata, float* odata)
+  {
+    CHECK_CUFFT(cufftExecC2R(plan, (cufftComplex*)idata, (cufftReal*)odata));
+  }
 };
 
 template <>
-struct ForwardPlanType<double> {
-  static constexpr cufftType value = CUFFT_D2Z;
+struct BackwardPass<double> {
+  static constexpr cufftType type = CUFFT_Z2D;
+  static constexpr cufftXtCallbackType callback_type(bool load)
+  {
+    return load ? CUFFT_CB_LD_COMPLEX_DOUBLE : CUFFT_CB_ST_REAL_DOUBLE;
+  }
+  static __host__ inline void execute(cufftHandle plan, double* idata, double* odata)
+  {
+    CHECK_CUFFT(cufftExecZ2D(plan, (cufftDoubleComplex*)idata, (cufftDoubleReal*)odata));
+  }
 };
 
+extern __host__ void* load_zero_pad_callback_float();
+extern __host__ void* load_zero_pad_callback_double();
+
+extern __host__ void* load_multiply_callback_float();
+extern __host__ void* load_multiply_callback_double();
+
+extern __host__ void* load_store_callback_float();
+extern __host__ void* load_store_callback_double();
+
 template <typename VAL>
-struct BackwardPlanType;
+struct Callbacks;
 
 template <>
-struct BackwardPlanType<float> {
-  static constexpr cufftType value = CUFFT_C2R;
+struct Callbacks<float> {
+  static __host__ inline void* zero_pad() { return load_zero_pad_callback_float(); }
+  static __host__ inline void* multiply() { return load_multiply_callback_float(); }
+  static __host__ inline void* store() { return load_store_callback_float(); }
 };
 
 template <>
-struct BackwardPlanType<double> {
-  static constexpr cufftType value = CUFFT_Z2D;
+struct Callbacks<double> {
+  static __host__ inline void* zero_pad() { return load_zero_pad_callback_double(); }
+  static __host__ inline void* multiply() { return load_multiply_callback_double(); }
+  static __host__ inline void* store() { return load_store_callback_double(); }
 };
 
 template <typename VAL, int DIM>
@@ -1255,11 +1276,27 @@ __host__ static inline void cufft_convolution(AccessorWO<VAL, DIM> out,
                       smem_size,
                       max_smem_size);
   } else {
+    // Cache for metadata allocations
+    struct Metadata {
+      Shadow<ZeroPadLoadData> filter_meta;
+      Shadow<ZeroPadLoadData> signal_meta;
+      Shadow<LoadComplexData> load_meta;
+      Shadow<StoreOutputData<VAL>> store_meta;
+      Cache<Callbacks<VAL>::zero_pad> zero_pad;
+      Cache<Callbacks<VAL>::multiply> multiply;
+      Cache<Callbacks<VAL>::store> store;
+    };
+    static Metadata metadata_cache[LEGION_MAX_NUM_PROCS];
+
     // Instead of doing the large tile case, we can instead do this
     // by transforming both the input and the filter to the frequency
    // domain using an FFT, perform the convolution with a point-wise
     // multiplication, and then transform the result back to the spatial domain
-    auto stream = get_cached_stream();
+
+    auto stream = get_cached_stream();
+    auto proc_id = Processor::get_executing_processor().id & (LEGION_MAX_NUM_PROCS - 1);
+    auto& cache = metadata_cache[proc_id];
+
     // First compute how big our temporary allocation needs to be
     // We'll need two of them to store the zero-padded data for the inputs
     const Point<DIM> zero = Point<DIM>::ZEROES();
@@ -1273,62 +1310,34 @@ __host__ static inline void cufft_convolution(AccessorWO<VAL, DIM> out,
     const Point<DIM> signal_bounds = input_bounds.hi - input_bounds.lo + one;
     const Point<DIM> filter_bounds = filter_rect.hi - filter_rect.lo + one;
     Point<DIM> fftsize = signal_bounds + filter_bounds;
-    for (int d = 0; d < DIM; d++) {
-      // Technically we can shrink this by one and still be sound but we'll
-      // only do that if it will make the number even
-      if ((fftsize[d] % 2) == 1) fftsize[d]--;
-    }
+    // Technically we can shrink this by one and still be sound but we'll
+    // only do that if it will make the number even
+    for (int d = 0; d < DIM; d++) fftsize[d] -= fftsize[d] % 2;
     // Cufft needs the last dimension to have fftsize/2+1 complex elements for
-    // the temporary buffer
-    // Since we know fftsize is even, we just need to add two to it for the output
+    // the temporary buffer, we know that the last dimension is already even
+    // so we just need to add two elements to the last dim of the fftsize
     Point<DIM> buffersize = fftsize;
     buffersize[DIM - 1] += 2;
     size_t buffervolume = 1;
     for (int d = 0; d < DIM; d++) buffervolume *= buffersize[d];
-    // Zero pad and copy in the input data
-    DeferredBuffer<VAL, DIM> signal_buffer(Rect<DIM>(zero, buffersize - one),
-                                           Memory::GPU_FB_MEM,
-                                           nullptr /*initial*/,
-                                           128 /*alignment*/);
-    VAL* signal_ptr = signal_buffer.ptr(zero);
-    CHECK_CUDA(cudaMemsetAsync(signal_ptr, 0, buffervolume * sizeof(VAL), stream));
-    // Check to see if the input pointer is dense and we can do this with a CUDA memcpy
-    size_t strides[DIM];
-    const VAL* input_ptr = in.ptr(input_bounds, strides);
-    size_t pitch = 1;
-    CopyPitches<DIM> copy_pitches;
-    for (int d = DIM - 1; d >= 0; d--) {
-      copy_pitches[d] = FastDivmodU64(pitch);
-      pitch *= signal_bounds[d];
-    }
-    size_t blocks = (pitch + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    copy_into_buffer<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-      in, signal_buffer, input_bounds.lo, copy_pitches, pitch);
-    // Zero pad and copy in the filter data
-    DeferredBuffer<VAL, DIM> filter_buffer(Rect<DIM>(zero, buffersize - one),
-                                           Memory::GPU_FB_MEM,
-                                           nullptr /*initial*/,
-                                           128 /*alignment*/);
-    VAL* filter_ptr = filter_buffer.ptr(zero);
-    CHECK_CUDA(cudaMemsetAsync(filter_ptr, 0, buffervolume * sizeof(VAL), stream));
-    const VAL* filt_ptr = filter.ptr(filter_rect, strides);
-    pitch = 1;
-    for (int d = DIM - 1; d >= 0; d--) {
-      copy_pitches[d] = FastDivmodU64(pitch);
-      pitch *= filter_bounds[d];
-    }
-    blocks = (pitch + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    copy_into_buffer<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-      filter, filter_buffer, filter_rect.lo, copy_pitches, pitch);
+    // In theory we could do this with a single output buffer by doing
+    // += operations in the second forward FFT kernel into the buffer,
+    // but unfortunately cufft likes to use the output buffer during its
+    // execution and that destroys the data from the first FFT
+    DeferredBuffer<VAL, 1> buffer(Rect<1>(Point<1>(0), Point<1>(2 * buffervolume - 1)),
+                                  Memory::GPU_FB_MEM,
+                                  nullptr /*initial*/,
+                                  128 /*alignment*/);
+    VAL* buffer_ptr = buffer.ptr(Point<1>(0));
 
-    auto* forward_plan  = get_cufft_plan(ForwardPlanType<VAL>::value, fftsize);
-    auto* backward_plan = get_cufft_plan(BackwardPlanType<VAL>::value, fftsize);
+    auto forward_plan  = get_cufft_plan(ForwardPass<VAL>::type, fftsize);
+    auto backward_plan = get_cufft_plan(BackwardPass<VAL>::type, fftsize);
 
     // Set the stream and working area for the plans
-    CHECK_CUFFT(cufftSetStream(forward_plan->handle, stream));
-    CHECK_CUFFT(cufftSetStream(backward_plan->handle, stream));
+    CHECK_CUFFT(cufftSetStream(forward_plan.handle(), stream));
+    CHECK_CUFFT(cufftSetStream(backward_plan.handle(), stream));
 
-    auto workarea_size = std::max(forward_plan->workarea_size, backward_plan->workarea_size);
+    auto workarea_size = std::max(forward_plan.workarea_size(), backward_plan.workarea_size());
 
     // Create the plan and allocate a temporary buffer for it if it needs one
     DeferredBuffer<char, 1> workarea_buffer;
@@ -1339,59 +1348,67 @@ __host__ static inline void cufft_convolution(AccessorWO<VAL, DIM> out,
                                                 nullptr /*initial*/,
                                                 128 /*alignment*/);
       void* workarea = workarea_buffer.ptr(zero1d);
-      CHECK_CUFFT(cufftSetWorkArea(forward_plan->handle, workarea));
-      CHECK_CUFFT(cufftSetWorkArea(backward_plan->handle, workarea));
+      CHECK_CUFFT(cufftSetWorkArea(forward_plan.handle(), workarea));
+      CHECK_CUFFT(cufftSetWorkArea(backward_plan.handle(), workarea));
     }
-    // FFT the input data
-    cufft_execute_forward(forward_plan->handle, signal_ptr, signal_ptr);
+
     // FFT the filter data
-    cufft_execute_forward(forward_plan->handle, filter_ptr, filter_ptr);
-    // Perform the pointwise multiplcation
-    {
-      size_t volume = (buffervolume / 2);
-      blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-      complex_multiply<VAL><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-        (complex<VAL>*)signal_ptr, (complex<VAL>*)filter_ptr, volume);
-    }
-    // Inverse FFT for the ouptut
-    // Allow this out-of-place for better performance
-    cufft_execute_backward(backward_plan->handle, signal_ptr, filter_ptr);
-    // Copy the result data out of the temporary buffer and scale
-    // because CUFFT inverse does not perform the scale for us
-    pitch = 1;
-    FFTPitches<DIM> fft_pitches;
-    for (int d = DIM - 1; d >= 0; d--) {
-      fft_pitches[d] = pitch;
-      pitch *= fftsize[d];
-    }
-    const VAL scaling_factor = VAL(1) / pitch;
-    Point<DIM> buffer_offset;
-    for (int d = 0; d < DIM; d++)
-      buffer_offset[d] =
-        centers[d] - (((extents[d] % 2) == 0) ? 1 : 0) +
+    size_t strides[DIM];
+    const auto* filter_ptr = filter.ptr(filter_rect, strides);
+    cache.filter_meta.update(fftsize, strides, filter_bounds);
+
+    auto* d_filter_meta = cache.filter_meta.device(stream);
+    forward_plan.set_callback(
+      ForwardPass<VAL>::callback_type(true), cache.zero_pad(), d_filter_meta);
+    ForwardPass<VAL>::execute(forward_plan.handle(), filter_ptr, buffer_ptr);
+
+    // FFT the input data
+    const auto* signal_ptr = in.ptr(input_bounds, strides);
+    cache.signal_meta.update(fftsize, strides, signal_bounds);
+
+    auto* d_signal_meta = cache.signal_meta.device(stream);
+    forward_plan.set_callback(
+      ForwardPass<VAL>::callback_type(true), cache.zero_pad(), d_signal_meta);
+    ForwardPass<VAL>::execute(forward_plan.handle(), signal_ptr, buffer_ptr + buffervolume);
+
+    // Inverse FFT for the output in-place in the temporary buffer
+    auto* output_ptr = out.ptr(subrect, strides);
+
+    Point<DIM> offsets;
+    for (int32_t d = 0; d < DIM; d++)
+      offsets[d] =
+        centers[d] - (1 - (extents[d] % 2)) +
         ((offset_bounds.lo[d] < root_rect.lo[d]) ? (subrect.lo[d] - root_rect.lo[d]) : centers[d]);
-    Point<DIM> output_bounds = subrect.hi - subrect.lo + one;
-    pitch = 1;
-    for (int d = DIM - 1; d >= 0; d--) {
-      copy_pitches[d] = FastDivmodU64(pitch);
-      pitch *= output_bounds[d];
-    }
-    blocks = (pitch + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    copy_from_buffer<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
-      filter_ptr, out, buffer_offset, subrect.lo, copy_pitches, fft_pitches, pitch, scaling_factor);
+    auto output_bounds = subrect.hi - subrect.lo + one;
+
+    cache.load_meta.update(buffervolume / 2);
+    cache.store_meta.update(fftsize, strides, offsets, output_bounds);
+
+    auto* d_load_meta  = cache.load_meta.device(stream);
+    auto* d_store_meta = cache.store_meta.device(stream);
+    backward_plan.set_callback(
+      BackwardPass<VAL>::callback_type(true), cache.multiply(), d_load_meta);
+    backward_plan.set_callback(
+      BackwardPass<VAL>::callback_type(false), cache.store(), d_store_meta);
+    BackwardPass<VAL>::execute(backward_plan.handle(), buffer_ptr, output_ptr);
 
-#if 0 // This is useful debugging code for finding the output
-    VAL *buffer = (VAL*)malloc(buffervolume*sizeof(VAL));
-    CHECK_CUDA( cudaMemcpyAsync(buffer, filter_ptr, buffervolume*sizeof(VAL), cudaMemcpyDeviceToHost, stream) );
-    CHECK_CUDA( cudaStreamSynchronize(stream) );
-    for (unsigned idx = 0; idx < buffervolume; idx++) {
-      if ((idx % fftsize[DIM-1]) == 0)
-        printf("\n");
-      printf("%.8g ", buffer[idx]*scaling_factor);
+#if 0
+    {
+      std::vector<VAL> vec_debug_buffer(buffervolume);
+      auto* debug_buffer = vec_debug_buffer.data();
+      CHECK_CUDA(cudaMemcpyAsync(debug_buffer,
                                  buffer_ptr + buffervolume,
                                  buffervolume * sizeof(VAL),
                                  cudaMemcpyDeviceToHost,
                                  stream));
+      CHECK_CUDA(cudaStreamSynchronize(stream));
+      for (size_t idx = 0; idx < buffervolume; idx++) {
+        if ((idx % fftsize[DIM - 1]) == 0) printf("\n");
+        printf("%.8g ", debug_buffer[idx] * cache.store_meta.host().scale_factor);
+      }
+      printf("\n");
     }
-    printf("\n");
-    free(buffer);
 #endif
   }
 }
diff --git a/src/cunumeric/convolution/convolve_callbacks.cu b/src/cunumeric/convolution/convolve_callbacks.cu
new file mode 100644
index 000000000..3fbec1a45
--- /dev/null
+++ b/src/cunumeric/convolution/convolve_callbacks.cu
@@ -0,0 +1,94 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/convolution/convolve_common.h"
+
+#include "cunumeric/cuda_help.h"
+#include <cufftXt.h>
+
+using namespace Legion;
+
+namespace cunumeric {
+
+template <typename T>
+static __device__ T cb_zero_pad(void* data, size_t offset, void* callerinfo, void* sharedptr)
+{
+  auto* info    = static_cast<ZeroPadLoadData*>(callerinfo);
+  size_t actual = 0;
+#pragma unroll 3
+  for (int32_t d = 0; d < info->dim; d++) {
+    coord_t coord = info->pitches[d].divmod(offset, offset);
+    if (coord >= info->bounds[d]) return 0.f;
+    actual += coord * info->strides[d];
+  }
+  auto* ptr = static_cast<T*>(data);
+  return ptr[actual];
+}
+
+template <typename Complex, Complex (*CTOR)(decltype(Complex::x), decltype(Complex::x))>
+static __device__ Complex cb_multiply(void* data, size_t offset, void* callerinfo, void* sharedptr)
+{
+  auto* info = static_cast<LoadComplexData*>(callerinfo);
+  auto* ptr  = static_cast<Complex*>(data);
+  auto lhs   = ptr[offset];
+  auto rhs   = ptr[offset + info->buffervolume];
+  return CTOR(lhs.x * rhs.x - lhs.y * rhs.y, lhs.x * rhs.y + lhs.y * rhs.x);
+}
+
+template <typename T>
+static __device__ void cb_store(
+  void* data, size_t offset, T value, void* callerinfo, void* sharedptr)
+{
+  auto* info    = static_cast<StoreOutputData<T>*>(callerinfo);
+  size_t actual = 0;
+#pragma unroll 3
+  for (int32_t d = 0; d < info->dim; d++) {
+    coord_t coord = info->pitches[d].divmod(offset, offset);
+    coord -= info->offsets[d];
+    if (coord < 0) return;
+    if (coord >= info->bounds[d]) return;
+    actual += coord * info->strides[d];
+  }
+  auto* ptr   = static_cast<T*>(data);
+  ptr[actual] = value * info->scale_factor;
+}
+
+static __device__ cufftCallbackLoadR d_zero_pad_float  = cb_zero_pad<float>;
+static __device__ cufftCallbackLoadD d_zero_pad_double = cb_zero_pad<double>;
+static __device__ cufftCallbackLoadC d_multiply_float =
+  cb_multiply<cufftComplex, make_cuComplex>;
+static __device__ cufftCallbackLoadZ d_multiply_double =
+  cb_multiply<cufftDoubleComplex, make_cuDoubleComplex>;
+static __device__ cufftCallbackStoreR d_store_float  = cb_store<float>;
+static __device__ cufftCallbackStoreD d_store_double = cb_store<double>;
+
+#define LOAD_SYMBOL(FUN, DEV_FUN)                                 \
+  __host__ void* FUN()                                            \
+  {                                                               \
+    void* ptr = nullptr;                                          \
+    CHECK_CUDA(cudaMemcpyFromSymbol(&ptr, DEV_FUN, sizeof(ptr))); \
+    assert(ptr != nullptr);                                       \
+    return ptr;                                                   \
+  }
+
+LOAD_SYMBOL(load_zero_pad_callback_float, d_zero_pad_float)
+LOAD_SYMBOL(load_zero_pad_callback_double, d_zero_pad_double)
+LOAD_SYMBOL(load_multiply_callback_float, d_multiply_float)
+LOAD_SYMBOL(load_multiply_callback_double, d_multiply_double)
+LOAD_SYMBOL(load_store_callback_float, d_store_float)
+LOAD_SYMBOL(load_store_callback_double, d_store_double)
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/convolution/convolve_common.h b/src/cunumeric/convolution/convolve_common.h
new file mode 100644
index 000000000..927e8176b
--- /dev/null
+++ b/src/cunumeric/convolution/convolve_common.h
@@ -0,0 +1,92 @@
+/* Copyright 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/divmod.h"
+#include "legion.h"
+
+namespace cunumeric {
+
+struct LoadComplexData {
+  size_t buffervolume{0};
+
+  __host__ inline bool update(size_t new_volume)
+  {
+    auto changed = buffervolume != new_volume;
+    buffervolume = new_volume;
+    return changed;
+  }
+};
+
+struct ZeroPadLoadData {
+  FastDivmodU64 pitches[3];
+  size_t strides[3];
+  size_t bounds[3];
+  int32_t dim{0};
+
+  template <int32_t DIM>
+  __host__ inline bool update(const Legion::Point<DIM>& fftsize,
+                              const size_t* new_strides,
+                              const Legion::Point<DIM>& new_bounds)
+  {
+    auto changed = dim != DIM;
+    changed      = true;
+    dim          = DIM;
+    size_t pitch = 1;
+    for (int32_t d = DIM - 1; d >= 0; --d) {
+      if (changed || (pitches[d].divisor != pitch)) {
+        pitches[d] = FastDivmodU64(pitch);
+        changed    = true;
+      }
+      pitch *= fftsize[d];
+      if (changed || (strides[d] != new_strides[d])) {
+        strides[d] = new_strides[d];
+        changed    = true;
+      }
+      if (changed || (bounds[d] != new_bounds[d])) {
+        bounds[d] = new_bounds[d];
+        changed   = true;
+      }
+    }
+    return changed;
+  }
+};
+
+template <typename T>
+struct StoreOutputData : public ZeroPadLoadData {
+  size_t offsets[3];
+  T scale_factor;
+
+  using ZeroPadLoadData::update;
+
+  template <int32_t DIM>
+  __host__ inline bool update(const Legion::Point<DIM>& fftsize,
+                              const size_t* new_strides,
+                              const Legion::Point<DIM>& new_offsets,
+                              const Legion::Point<DIM>& new_bounds)
+  {
+    auto changed = update(fftsize, new_strides, new_bounds);
+    for (int32_t d = DIM - 1; d >= 0; --d) {
+      if (changed || offsets[d] != new_offsets[d]) {
+        offsets[d] = new_offsets[d];
+        changed    = true;
+      }
+    }
+    if (changed) scale_factor = 1.0 / pitches[0].divisor;
+    return changed;
+  }
+};
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h
index a54e1ee2e..385c2e0ac 100644
--- a/src/cunumeric/cuda_help.h
+++ b/src/cunumeric/cuda_help.h
@@ -21,6 +21,7 @@
 #include <cublas_v2.h>
 #include <cusolverDn.h>
 #include <cufft.h>
+#include <cufftXt.h>
 #include <cutensor.h>
 
 #define THREADS_PER_BLOCK 128
@@ -70,7 +71,30 @@ namespace cunumeric {
 
 struct cufftPlan {
   cufftHandle handle;
-  size_t workarea_size;
+  size_t workarea;
+};
+
+class cufftContext {
+ public:
+  cufftContext(cufftPlan* plan);
+  ~cufftContext();
+
+ public:
+  cufftContext(const cufftContext&) = delete;
+  cufftContext& operator=(const cufftContext&) = delete;
+
+ public:
+  cufftContext(cufftContext&&) = default;
+  cufftContext& operator=(cufftContext&&) = default;
+
+ public:
+  cufftHandle handle();
+  size_t workarea_size();
+  void set_callback(cufftXtCallbackType type, void* callback, void* data);
+
+ private:
+  cufftPlan* plan_{nullptr};
+  std::set<cufftXtCallbackType> callback_types_{};
 };
 
 // Defined in cudalibs.cu
@@ -80,7 +104,7 @@ cudaStream_t get_cached_stream();
 cublasHandle_t get_cublas();
 cusolverDnHandle_t get_cusolver();
 cutensorHandle_t* get_cutensor();
-cufftPlan* get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
+cufftContext get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
 
 __host__ inline void check_cuda(cudaError_t error, const char* file, int line)
 {
diff --git a/src/cunumeric/cudalibs.cu b/src/cunumeric/cudalibs.cu
index 5b1d7bc32..3e79647fc 100644
--- a/src/cunumeric/cudalibs.cu
+++ b/src/cunumeric/cudalibs.cu
@@ -27,6 +27,29 @@ using namespace Legion;
 
 static Logger log_cudalibs("cunumeric.cudalibs");
 
+cufftContext::cufftContext(cufftPlan* plan) : plan_(plan) {}
+
+cufftContext::~cufftContext()
+{
+  auto hdl = handle();
+  for (auto type : callback_types_) CHECK_CUFFT(cufftXtClearCallback(hdl, type));
+}
+
+cufftHandle cufftContext::handle() { return plan_->handle; }
+
+size_t cufftContext::workarea_size() { return plan_->workarea; }
+
+void cufftContext::set_callback(cufftXtCallbackType type, void* callback, void* data)
+{
+  auto hdl = handle();
+  if (callback_types_.find(type) != callback_types_.end())
+    CHECK_CUFFT(cufftXtClearCallback(hdl, type));
+  void* callbacks[1] = {callback};
+  void* datas[1]     = {data};
+  CHECK_CUFFT(cufftXtSetCallback(hdl, callbacks, type, datas));
+  callback_types_.insert(type);
+}
+
 struct cufftPlanCache {
  private:
   // Maximum number of plans to keep per dimension
@@ -119,7 +142,7 @@ cufftPlan* cufftPlanCache::get_cufft_plan(const DomainPoint& size)
                               1,
                               type_,
                               1 /*batch*/,
-                              &result->workarea_size));
+                              &result->workarea));
   }
   // Otherwise, we return the cached plan and adjust the LRU count
   else {
@@ -206,7 +229,7 @@ cutensorHandle_t* CUDALibraries::get_cutensor()
   return cutensor_;
 }
 
-cufftPlan* CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& size)
+cufftContext CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& size)
 {
   auto finder = plan_caches_.find(type);
   cufftPlanCache* cache{nullptr};
@@ -216,7 +239,7 @@ cufftPlan* CUDALibraries::get_cufft_plan(cufftType type, const DomainPoint& size
     plan_caches_[type] = cache;
   } else
    cache = finder->second;
-  return cache->get_cufft_plan(size);
+  return cufftContext(cache->get_cufft_plan(size));
 }
 
 static CUDALibraries& get_cuda_libraries(Processor proc)
@@ -265,7 +288,7 @@ cutensorHandle_t* get_cutensor()
   return lib.get_cutensor();
 }
 
-cufftPlan* get_cufft_plan(cufftType type, const Legion::DomainPoint& size)
+cufftContext get_cufft_plan(cufftType type, const Legion::DomainPoint& size)
 {
   const auto proc = Processor::get_executing_processor();
   auto& lib       = get_cuda_libraries(proc);
diff --git a/src/cunumeric/cudalibs.h b/src/cunumeric/cudalibs.h
index 5d7514b73..3edd9e8f8 100644
--- a/src/cunumeric/cudalibs.h
+++ b/src/cunumeric/cudalibs.h
@@ -38,7 +38,7 @@ struct CUDALibraries {
   cublasHandle_t get_cublas();
   cusolverDnHandle_t get_cusolver();
   cutensorHandle_t* get_cutensor();
-  cufftPlan* get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
+  cufftContext get_cufft_plan(cufftType type, const Legion::DomainPoint& size);
 
  private:
  void finalize_cublas();
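
For readers unfamiliar with cuFFT callbacks, the following is a minimal, self-contained sketch (not part of the patch) of the registration pattern the patch relies on: a `__device__` function pointer is copied to the host with `cudaMemcpyFromSymbol` (what the `LOAD_SYMBOL` helpers do) and attached to a plan with `cufftXtSetCallback` (what `cufftContext::set_callback` wraps). The example assumes a 1-D R2C transform; the callback name `scale_on_load`, the symbol `d_scale_cb`, and the scale value are illustrative only. Callbacks require the static cuFFT library and relocatable device code, which is why the Makefile switches to `-lcufft_static -lculibos` and adds `GEN_DEVICE_SRC`/`DEVICE_LD_FLAGS`.

```cpp
// Build sketch: nvcc -std=c++17 -rdc=true callback_demo.cu -lcufft_static -lculibos -o callback_demo
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>
#include <cstdio>
#include <cstdlib>

#define CHECK(expr)                                                        \
  do {                                                                     \
    auto _e = (expr);                                                      \
    if ((int)_e != 0) {                                                    \
      fprintf(stderr, "error %d at %s:%d\n", (int)_e, __FILE__, __LINE__); \
      exit(1);                                                             \
    }                                                                      \
  } while (0)

// Load callback: cuFFT calls this for every real input element it reads,
// letting us transform data on the fly (the patch uses this for zero-padding).
static __device__ cufftReal scale_on_load(void* data, size_t offset, void* info, void* shared)
{
  const float scale = *static_cast<const float*>(info);
  return static_cast<cufftReal*>(data)[offset] * scale;
}
// Device-side function pointer: the host cannot take &scale_on_load directly.
static __device__ cufftCallbackLoadR d_scale_cb = scale_on_load;

int main()
{
  constexpr int N = 256;
  float* signal;
  cufftComplex* spectrum;
  float* d_scale;
  CHECK(cudaMalloc(&signal, N * sizeof(float)));
  CHECK(cudaMalloc(&spectrum, (N / 2 + 1) * sizeof(cufftComplex)));
  CHECK(cudaMalloc(&d_scale, sizeof(float)));
  float h_scale = 0.5f;
  CHECK(cudaMemcpy(d_scale, &h_scale, sizeof(float), cudaMemcpyHostToDevice));
  CHECK(cudaMemset(signal, 0, N * sizeof(float)));

  cufftHandle plan;
  CHECK(cufftPlan1d(&plan, N, CUFFT_R2C, 1 /*batch*/));

  // Fetch the device function pointer to the host, then register it on the
  // plan together with its caller-info blob (here just the scale value).
  void* h_cb = nullptr;
  CHECK(cudaMemcpyFromSymbol(&h_cb, d_scale_cb, sizeof(h_cb)));
  void* caller_info[1] = {d_scale};
  CHECK(cufftXtSetCallback(plan, &h_cb, CUFFT_CB_LD_REAL, caller_info));

  // The callback now runs inside the transform; no separate pre-processing kernel.
  CHECK(cufftExecR2C(plan, signal, spectrum));
  CHECK(cudaDeviceSynchronize());

  CHECK(cufftXtClearCallback(plan, CUFFT_CB_LD_REAL));
  CHECK(cufftDestroy(plan));
  cudaFree(signal);
  cudaFree(spectrum);
  cudaFree(d_scale);
  return 0;
}
```

The same design shows up in the patch: the load callbacks replace the old `copy_into_buffer` zero-padding kernels, the multiply callback replaces `complex_multiply`, and the store callback replaces `copy_from_buffer` plus the explicit inverse-FFT rescale.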