Use DynamicScratchpad in KernelManager. (NVIDIA#3670)

* Use DynamicScratchpad in KernelManager. * Use invalid stream as a default for KernelContext. * Lazily initialize upstream resources for dynamic scratchpad. * Do not initialize the underlying resource for DynamicScratchpad when allocating 0 bytes. Signed-off-by: Michal Zientkiewicz <[email protected]>
cyyever · Jun 7, 2022 · 5e1136b · 5e1136b
1 parent 83d2fca
commit 5e1136b
Show file tree

Hide file tree

Showing 27 changed files with 132 additions and 57 deletions.
diff --git a/dali/kernels/context.h b/dali/kernels/context.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <vector>
 #include <algorithm>
 #include <type_traits>
+#include "dali/core/access_order.h"
 #include "dali/core/tensor_view.h"
 #include "dali/core/mm/memory_resource.h"
 #include "dali/core/mm/memory_kind.h"
@@ -34,7 +35,7 @@ struct Context {};
 
 template <>
 struct Context<ComputeGPU> {
-  cudaStream_t stream = 0;
+  cudaStream_t stream = AccessOrder::null_stream();
 };
 
 class Scratchpad;

diff --git a/dali/kernels/dynamic_scratchpad.h b/dali/kernels/dynamic_scratchpad.h
@@ -60,6 +60,8 @@ class DynamicScratchpadImplT {
                              AccessOrder dealloc_order = {}) {
     static_assert(!std::is_same<Kind, mm::memory_kind::host>::value,
       "Cannot use a stream-ordered resource for plain host memory");
+    if (!dealloc_order.has_value())
+      dealloc_order = alloc_order;
     adapter<Kind>() = { rsrc, alloc_order, dealloc_order };
     set_upstream_resource<Kind>(&adapter<Kind>());
   }
@@ -135,41 +137,71 @@ class DynamicScratchpad
     initial_sizes_ = initial_sizes;
     for (auto &s : initial_sizes_) {
       if (s == 0)
-        s = 4096;
+        s = 0x10000;  // 64k
     }
     if (!pinned_dealloc_order.has_value())
       pinned_dealloc_order = device_order;
     if (!managed_dealloc_order.has_value())
       managed_dealloc_order = device_order;
 
+    device_order_ = device_order;
+    pinned_dealloc_order_ = pinned_dealloc_order;
+    managed_dealloc_order_ = managed_dealloc_order;
+  }
+
+  virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
+    void *ret = nullptr;
+    TYPE_SWITCH(kind_id, mm::kind2id, Kind,
+      (mm::memory_kind::host,
+       mm::memory_kind::pinned,
+       mm::memory_kind::device,
+       mm::memory_kind::managed),
+      (ret = AllocImpl<Kind>(bytes, alignment)),
+      (assert(!"Incorrect memory kind id");));
+    return ret;
+  }
+
+  template <typename T>
+  struct type_tag {};
+
+  void InitResource(type_tag<mm::memory_kind::host>) {
     set_upstream_resource<mm::memory_kind::host>(mm::GetDefaultResource<mm::memory_kind::host>());
+  }
 
+  void InitResource(type_tag<mm::memory_kind::pinned>) {
     set_upstream_resource<mm::memory_kind::pinned>(
         mm::GetDefaultResource<mm::memory_kind::pinned>(),
         AccessOrder::host(),
-        pinned_dealloc_order);
+        pinned_dealloc_order_);
+  }
 
+  void InitResource(type_tag<mm::memory_kind::device>) {
     set_upstream_resource<mm::memory_kind::device>(
         mm::GetDefaultResource<mm::memory_kind::device>(),
-        device_order);
+        device_order_);
+  }
 
+  void InitResource(type_tag<mm::memory_kind::managed>) {
     set_upstream_resource<mm::memory_kind::managed>(
         mm::GetDefaultResource<mm::memory_kind::managed>(),
         AccessOrder::host(),
-        managed_dealloc_order);
+        managed_dealloc_order_);
   }
 
-  virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
-    void *ret = nullptr;
-    TYPE_SWITCH(kind_id, mm::kind2id, Kind,
-      (mm::memory_kind::host,
-       mm::memory_kind::pinned,
-       mm::memory_kind::device,
-       mm::memory_kind::managed),
-      (ret = resource<Kind>().allocate(bytes, alignment)),
-      (assert(!"Incorrect memory kind id");));
-    return ret;
+  template <typename Kind>
+  void *AllocImpl(size_t bytes, size_t alignment) {
+    if (bytes == 0)
+      return nullptr;  // do not initialize the resource in case of 0-sized allocation
+
+    auto &r = resource<Kind>();
+    if (!r.get_upstream()) {
+      InitResource(type_tag<Kind>());
+      assert(r.get_upstream() != nullptr);
+    }
+    return r.allocate(bytes, alignment);
   }
+
+  AccessOrder device_order_, pinned_dealloc_order_, managed_dealloc_order_;
 };
 
 }  // namespace kernels

diff --git a/dali/kernels/imgproc/convolution/convolution_gpu_test.cu b/dali/kernels/imgproc/convolution/convolution_gpu_test.cu
@@ -158,6 +158,7 @@ struct ConvolutionGpuKernelTest : public ::testing::Test {
 
   void RunTest() {
     KernelContext ctx_cpu, ctx_gpu;
+    ctx_gpu.gpu.stream = 0;
     KernelCpu kernel_cpu;
     KernelGpu kernel_gpu;
 

diff --git a/dali/kernels/imgproc/convolution/laplacian_gpu_test.cu b/dali/kernels/imgproc/convolution/laplacian_gpu_test.cu
@@ -161,6 +161,7 @@ struct LaplacianGpuTest : public ::testing::Test {
 
   void RunTest() {
     KernelContext ctx_cpu = {}, ctx_gpu = {};
+    ctx_gpu.gpu.stream = 0;
     KernelCpu kernel_cpu;
     KernelGpu kernel_gpu;
     int nsamples = in_.shape.size();

diff --git a/dali/kernels/imgproc/flip_gpu_test.cu b/dali/kernels/imgproc/flip_gpu_test.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2019, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -51,6 +51,7 @@ class FlipGpuTest: public testing::TestWithParam<std::array<Index, sample_ndim>>
 
 TEST_P(FlipGpuTest, ImplTest) {
   KernelContext ctx;
+  ctx.gpu.stream = 0;
   FlipGPU<float> kernel;
   auto in_view = ttl_in_.gpu(nullptr);
   ttl_in_.invalidate_cpu();
@@ -75,6 +76,7 @@ TEST_P(FlipGpuTest, ImplTest) {
 
 TEST_P(FlipGpuTest, KernelTest) {
   KernelContext ctx;
+  ctx.gpu.stream = 0;
   FlipGPU<float> kernel;
   auto in_view = ttl_in_.gpu(nullptr);
   ttl_in_.invalidate_cpu();

diff --git a/dali/kernels/imgproc/pointwise/linear_transformation_gpu_test.cu b/dali/kernels/imgproc/pointwise/linear_transformation_gpu_test.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -126,6 +126,7 @@ TYPED_TEST(LinearTransformationGpuTest, check_kernel) {
 TYPED_TEST(LinearTransformationGpuTest, setup_test) {
   TheKernel<TypeParam> kernel;
   KernelContext ctx;
+  ctx.gpu.stream = 0;
   InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
   auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
   ASSERT_EQ(this->out_shapes_.size(), static_cast<size_t>(reqs.output_shapes[0].num_samples()))
@@ -140,6 +141,7 @@ TYPED_TEST(LinearTransformationGpuTest, setup_test) {
 TYPED_TEST(LinearTransformationGpuTest, setup_test_with_roi) {
   TheKernel<TypeParam> kernel;
   KernelContext ctx;
+  ctx.gpu.stream = 0;
   InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
   auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_),
                            make_cspan(this->rois_));
@@ -150,20 +152,21 @@ TYPED_TEST(LinearTransformationGpuTest, setup_test_with_roi) {
 
 TYPED_TEST(LinearTransformationGpuTest, run_test) {
   TheKernel<TypeParam> kernel;
-  KernelContext c;
+  KernelContext ctx;
+  ctx.gpu.stream = 0;
   InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
 
-  auto reqs = kernel.Setup(c, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
+  auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
 
   ScratchpadAllocator sa;
   sa.Reserve(reqs.scratch_sizes);
   auto scratchpad = sa.GetScratchpad();
-  c.scratchpad = &scratchpad;
+  ctx.scratchpad = &scratchpad;
 
   OutListGPU<typename TypeParam::Out, kNDims> out(
           this->output_, reqs.output_shapes[0].template to_static<kNDims>());
 
-  kernel.Run(c, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
+  kernel.Run(ctx, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
   CUDA_CALL(cudaDeviceSynchronize());
 
   auto res = copy<mm::memory_kind::host>(out[0]);
@@ -175,22 +178,24 @@ TYPED_TEST(LinearTransformationGpuTest, run_test) {
 
 TYPED_TEST(LinearTransformationGpuTest, run_test_with_roi) {
   TheKernel<TypeParam> kernel;
-  KernelContext c;
+  KernelContext ctx;
+  ctx.gpu.stream = 0;
   InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
 
-  auto reqs = kernel.Setup(c, in,
+  auto reqs = kernel.Setup(ctx, in,
                            make_cspan(this->vmat_), make_cspan(this->vvec_),
                            make_cspan(this->rois_));
 
   ScratchpadAllocator sa;
   sa.Reserve(reqs.scratch_sizes);
   auto scratchpad = sa.GetScratchpad();
-  c.scratchpad = &scratchpad;
+  ctx.scratchpad = &scratchpad;
 
   OutListGPU<typename TypeParam::Out, kNDims> out(
           this->output_, reqs.output_shapes[0].template to_static<kNDims>());
 
-  kernel.Run(c, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_), make_cspan(this->rois_));
+  kernel.Run(ctx, out, in,
+             make_cspan(this->vmat_), make_cspan(this->vvec_), make_cspan(this->rois_));
   CUDA_CALL(cudaDeviceSynchronize());
 
   auto res = copy<mm::memory_kind::host>(out[0]);

diff --git a/dali/kernels/imgproc/pointwise/multiply_add_gpu_test.cu b/dali/kernels/imgproc/pointwise/multiply_add_gpu_test.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -130,6 +130,7 @@ TYPED_TEST(MultiplyAddGpuTest, check_kernel) {
 TYPED_TEST(MultiplyAddGpuTest, setup_test) {
   TheKernel<TypeParam> kernel;
   KernelContext ctx;
+  ctx.gpu.stream = 0;
   InListGPU<typename TypeParam::In, kNdims> in(this->input_device_, this->shapes_);
   auto reqs = kernel.Setup(ctx, in, this->addends_, this->multipliers_);
   ASSERT_EQ(this->shapes_.size(), static_cast<size_t>(reqs.output_shapes[0].num_samples()))
@@ -143,18 +144,19 @@ TYPED_TEST(MultiplyAddGpuTest, setup_test) {
 
 TYPED_TEST(MultiplyAddGpuTest, run_test) {
   TheKernel<TypeParam> kernel;
-  KernelContext c;
+  KernelContext ctx;
+  ctx.gpu.stream = 0;
   InListGPU<typename TypeParam::In, kNdims> in(this->input_device_, this->shapes_);
   OutListGPU<typename TypeParam::Out, kNdims> out(this->output_,
                                                   TensorListShape<kNdims>(this->shapes_));
 
-  auto reqs = kernel.Setup(c, in, this->addends_, this->multipliers_);
+  auto reqs = kernel.Setup(ctx, in, this->addends_, this->multipliers_);
 
   ScratchpadAllocator sa;
   sa.Reserve(reqs.scratch_sizes);
   auto scratchpad = sa.GetScratchpad();
-  c.scratchpad = &scratchpad;
-  kernel.Run(c, out, in, this->addends_, this->multipliers_);
+  ctx.scratchpad = &scratchpad;
+  kernel.Run(ctx, out, in, this->addends_, this->multipliers_);
   CUDA_CALL(cudaDeviceSynchronize());
 
   auto res = copy<mm::memory_kind::host>(out[0]);

diff --git a/dali/kernels/kernel_manager.h b/dali/kernels/kernel_manager.h
@@ -22,6 +22,7 @@
 #include "dali/kernels/scratch.h"
 #include "dali/kernels/context.h"
 #include "dali/kernels/kernel_req.h"
+#include "dali/kernels/dynamic_scratchpad.h"
 #include "dali/core/small_vector.h"
 #include "dali/core/mm/memory_kind.h"
 
@@ -218,9 +219,15 @@ class DLL_PUBLIC KernelManager {
    */
   template <typename Kernel, typename... OutInArgs>
   void Run(int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
-    assert(static_cast<size_t>(thread_idx) < scratchpads.size());
-    auto &sa = GetScratchpadAllocator(thread_idx);
-    Run<Kernel>(sa, instance_idx, context, std::forward<OutInArgs>(out_in_args)...);
+    assert(instance_idx >= 0 &&
+           static_cast<size_t>(instance_idx) < NumInstances() &&
+           "Kernel instance index (instance_idx) out of range");
+    auto &inst = instances[instance_idx];
+    DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
+    auto *old_scratchpad = context.scratchpad;
+    context.scratchpad = &scratchpad;
+    inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
+    context.scratchpad = old_scratchpad;
   }
 
   /**

diff --git a/dali/kernels/math/transform_points_test.cu b/dali/kernels/math/transform_points_test.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -70,6 +70,7 @@ struct TransformPointsTest : ::testing::Test {
       const auto *in_points = reinterpret_cast<const vec<in_dim, In> *>(in_tensor.data);
       const auto *out_points = reinterpret_cast<const vec<out_dim, Out> *>(out_tensor.data);
       KernelContext ctx;
+      ctx.gpu.stream = 0;
       auto &req = kmgr_.Setup<Kernel>(0, ctx, in_tensor.shape);
       ASSERT_EQ(req.output_shapes[0][0], out_tensor.shape);
       kmgr_.Run<Kernel>(0, 0, ctx, out_tensor, in_tensor, M, T);
@@ -103,6 +104,7 @@ struct TransformPointsTest : ::testing::Test {
 
     kmgr_.Resize<Kernel>(1, 1);
     KernelContext ctx;
+    ctx.gpu.stream = 0;
     auto &req = kmgr_.Setup<Kernel>(0, ctx, in_gpu.shape);
     ASSERT_EQ(req.output_shapes[0], out_gpu.shape);
     kmgr_.Run<Kernel>(0, 0, ctx, out_gpu, in_gpu, make_span(M), make_span(T));

diff --git a/dali/kernels/normalize/normalize_gpu_test.cu b/dali/kernels/normalize/normalize_gpu_test.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -246,6 +246,7 @@ class NormalizeImplGPUTest<std::pair<Out, In>> : public ::testing::Test {
   void RunTest() {
     kmgr_.Resize<Kernel>(1, 1);
     KernelContext ctx;
+    ctx.gpu.stream = 0;
     for (int iter = 0; iter < 3; iter++) {
       auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
                                      use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
@@ -276,6 +277,7 @@ class NormalizeImplGPUTest<std::pair<Out, In>> : public ::testing::Test {
   void RunPerf() {
     kmgr_.Resize<Kernel>(1, 1);
     KernelContext ctx;
+    ctx.gpu.stream = 0;
     auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
                                    use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
     ASSERT_EQ(req.output_shapes.size(), 1u);

diff --git a/dali/kernels/reduce/reduce_gpu_test.h b/dali/kernels/reduce/reduce_gpu_test.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -49,6 +49,7 @@ struct ReductionKernelTest {
       Args &&...args) {
     in.reshape(in_shape);
     ref.reshape(ref_out_shape);
+    ctx.gpu.stream = 0;
     auto req = kernel.Setup(ctx, in_shape, axes, keep_dims, batch, std::forward<Args>(args)...);
     ASSERT_EQ(req.output_shapes.size(), 1), req;
     ASSERT_EQ(req.output_shapes[0], ref_out_shape), req;
@@ -66,6 +67,7 @@ struct ReductionKernelTest {
   template <typename... Args>
   void Run(Args &&...args) {
     auto scratchpad = sa.GetScratchpad();
+    ctx.gpu.stream = 0;
     ctx.scratchpad = &scratchpad;
     kernel.Run(ctx, out.gpu(stream()), in.gpu(stream()), std::forward<Args>(args)...);
   }

diff --git a/dali/kernels/signal/fft/fft_postprocess_test.cu b/dali/kernels/signal/fft/fft_postprocess_test.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -108,6 +108,7 @@ class FFTPostprocessTest<FFTPostprocessArgs<Out, In, Convert>> : public ::testin
 
     ToFreqMajorSpectrum<Out, In, Convert> tr;
     KernelContext ctx;
+    ctx.gpu.stream = 0;
     ScratchpadAllocator sa;
     KernelRequirements req = tr.Setup(ctx, in_shape);
     ASSERT_EQ(req.output_shapes.size(), 1u);
@@ -174,6 +175,7 @@ class FFTPostprocessTest<FFTPostprocessArgs<Out, In, Convert>> : public ::testin
 
     ConvertTimeMajorSpectrum<Out, In, Convert> tr;
     KernelContext ctx;
+    ctx.gpu.stream = 0;
     tr.Setup(ctx, in_shape);
     tr.Run(ctx, out_gpu, in.gpu());
     CUDA_CALL(cudaGetLastError());