Skip to content

Commit

Permalink
Use DynamicScratchpad in KernelManager. (NVIDIA#3670)
Browse files Browse the repository at this point in the history
* Use DynamicScratchpad in KernelManager.
* Use invalid stream as a default for KernelContext.
* Lazily initialize upstream resources for dynamic scratchpad.
* Do not initialize the underlying resource for DynamicScratchpad when allocating 0 bytes.

Signed-off-by: Michal Zientkiewicz <[email protected]>
  • Loading branch information
mzient authored and cyyever committed Jun 7, 2022
1 parent 83d2fca commit 5e1136b
Show file tree
Hide file tree
Showing 27 changed files with 132 additions and 57 deletions.
5 changes: 3 additions & 2 deletions dali/kernels/context.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,7 @@
#include <vector>
#include <algorithm>
#include <type_traits>
#include "dali/core/access_order.h"
#include "dali/core/tensor_view.h"
#include "dali/core/mm/memory_resource.h"
#include "dali/core/mm/memory_kind.h"
Expand All @@ -34,7 +35,7 @@ struct Context {};

template <>
struct Context<ComputeGPU> {
cudaStream_t stream = 0;
cudaStream_t stream = AccessOrder::null_stream();
};

class Scratchpad;
Expand Down
60 changes: 46 additions & 14 deletions dali/kernels/dynamic_scratchpad.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ class DynamicScratchpadImplT {
AccessOrder dealloc_order = {}) {
static_assert(!std::is_same<Kind, mm::memory_kind::host>::value,
"Cannot use a stream-ordered resource for plain host memory");
if (!dealloc_order.has_value())
dealloc_order = alloc_order;
adapter<Kind>() = { rsrc, alloc_order, dealloc_order };
set_upstream_resource<Kind>(&adapter<Kind>());
}
Expand Down Expand Up @@ -135,41 +137,71 @@ class DynamicScratchpad
initial_sizes_ = initial_sizes;
for (auto &s : initial_sizes_) {
if (s == 0)
s = 4096;
s = 0x10000; // 64k
}
if (!pinned_dealloc_order.has_value())
pinned_dealloc_order = device_order;
if (!managed_dealloc_order.has_value())
managed_dealloc_order = device_order;

device_order_ = device_order;
pinned_dealloc_order_ = pinned_dealloc_order;
managed_dealloc_order_ = managed_dealloc_order;
}

virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
void *ret = nullptr;
TYPE_SWITCH(kind_id, mm::kind2id, Kind,
(mm::memory_kind::host,
mm::memory_kind::pinned,
mm::memory_kind::device,
mm::memory_kind::managed),
(ret = AllocImpl<Kind>(bytes, alignment)),
(assert(!"Incorrect memory kind id");));
return ret;
}

template <typename T>
struct type_tag {};

void InitResource(type_tag<mm::memory_kind::host>) {
set_upstream_resource<mm::memory_kind::host>(mm::GetDefaultResource<mm::memory_kind::host>());
}

void InitResource(type_tag<mm::memory_kind::pinned>) {
set_upstream_resource<mm::memory_kind::pinned>(
mm::GetDefaultResource<mm::memory_kind::pinned>(),
AccessOrder::host(),
pinned_dealloc_order);
pinned_dealloc_order_);
}

void InitResource(type_tag<mm::memory_kind::device>) {
set_upstream_resource<mm::memory_kind::device>(
mm::GetDefaultResource<mm::memory_kind::device>(),
device_order);
device_order_);
}

void InitResource(type_tag<mm::memory_kind::managed>) {
set_upstream_resource<mm::memory_kind::managed>(
mm::GetDefaultResource<mm::memory_kind::managed>(),
AccessOrder::host(),
managed_dealloc_order);
managed_dealloc_order_);
}

virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
void *ret = nullptr;
TYPE_SWITCH(kind_id, mm::kind2id, Kind,
(mm::memory_kind::host,
mm::memory_kind::pinned,
mm::memory_kind::device,
mm::memory_kind::managed),
(ret = resource<Kind>().allocate(bytes, alignment)),
(assert(!"Incorrect memory kind id");));
return ret;
template <typename Kind>
void *AllocImpl(size_t bytes, size_t alignment) {
if (bytes == 0)
return nullptr; // do not initialize the resource in case of 0-sized allocation

auto &r = resource<Kind>();
if (!r.get_upstream()) {
InitResource(type_tag<Kind>());
assert(r.get_upstream() != nullptr);
}
return r.allocate(bytes, alignment);
}

AccessOrder device_order_, pinned_dealloc_order_, managed_dealloc_order_;
};

} // namespace kernels
Expand Down
1 change: 1 addition & 0 deletions dali/kernels/imgproc/convolution/convolution_gpu_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ struct ConvolutionGpuKernelTest : public ::testing::Test {

void RunTest() {
KernelContext ctx_cpu, ctx_gpu;
ctx_gpu.gpu.stream = 0;
KernelCpu kernel_cpu;
KernelGpu kernel_gpu;

Expand Down
1 change: 1 addition & 0 deletions dali/kernels/imgproc/convolution/laplacian_gpu_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ struct LaplacianGpuTest : public ::testing::Test {

void RunTest() {
KernelContext ctx_cpu = {}, ctx_gpu = {};
ctx_gpu.gpu.stream = 0;
KernelCpu kernel_cpu;
KernelGpu kernel_gpu;
int nsamples = in_.shape.size();
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/imgproc/flip_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2019, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,6 +51,7 @@ class FlipGpuTest: public testing::TestWithParam<std::array<Index, sample_ndim>>

TEST_P(FlipGpuTest, ImplTest) {
KernelContext ctx;
ctx.gpu.stream = 0;
FlipGPU<float> kernel;
auto in_view = ttl_in_.gpu(nullptr);
ttl_in_.invalidate_cpu();
Expand All @@ -75,6 +76,7 @@ TEST_P(FlipGpuTest, ImplTest) {

TEST_P(FlipGpuTest, KernelTest) {
KernelContext ctx;
ctx.gpu.stream = 0;
FlipGPU<float> kernel;
auto in_view = ttl_in_.gpu(nullptr);
ttl_in_.invalidate_cpu();
Expand Down
23 changes: 14 additions & 9 deletions dali/kernels/imgproc/pointwise/linear_transformation_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -126,6 +126,7 @@ TYPED_TEST(LinearTransformationGpuTest, check_kernel) {
TYPED_TEST(LinearTransformationGpuTest, setup_test) {
TheKernel<TypeParam> kernel;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
ASSERT_EQ(this->out_shapes_.size(), static_cast<size_t>(reqs.output_shapes[0].num_samples()))
Expand All @@ -140,6 +141,7 @@ TYPED_TEST(LinearTransformationGpuTest, setup_test) {
TYPED_TEST(LinearTransformationGpuTest, setup_test_with_roi) {
TheKernel<TypeParam> kernel;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_),
make_cspan(this->rois_));
Expand All @@ -150,20 +152,21 @@ TYPED_TEST(LinearTransformationGpuTest, setup_test_with_roi) {

TYPED_TEST(LinearTransformationGpuTest, run_test) {
TheKernel<TypeParam> kernel;
KernelContext c;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);

auto reqs = kernel.Setup(c, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_));

ScratchpadAllocator sa;
sa.Reserve(reqs.scratch_sizes);
auto scratchpad = sa.GetScratchpad();
c.scratchpad = &scratchpad;
ctx.scratchpad = &scratchpad;

OutListGPU<typename TypeParam::Out, kNDims> out(
this->output_, reqs.output_shapes[0].template to_static<kNDims>());

kernel.Run(c, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
kernel.Run(ctx, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
CUDA_CALL(cudaDeviceSynchronize());

auto res = copy<mm::memory_kind::host>(out[0]);
Expand All @@ -175,22 +178,24 @@ TYPED_TEST(LinearTransformationGpuTest, run_test) {

TYPED_TEST(LinearTransformationGpuTest, run_test_with_roi) {
TheKernel<TypeParam> kernel;
KernelContext c;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);

auto reqs = kernel.Setup(c, in,
auto reqs = kernel.Setup(ctx, in,
make_cspan(this->vmat_), make_cspan(this->vvec_),
make_cspan(this->rois_));

ScratchpadAllocator sa;
sa.Reserve(reqs.scratch_sizes);
auto scratchpad = sa.GetScratchpad();
c.scratchpad = &scratchpad;
ctx.scratchpad = &scratchpad;

OutListGPU<typename TypeParam::Out, kNDims> out(
this->output_, reqs.output_shapes[0].template to_static<kNDims>());

kernel.Run(c, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_), make_cspan(this->rois_));
kernel.Run(ctx, out, in,
make_cspan(this->vmat_), make_cspan(this->vvec_), make_cspan(this->rois_));
CUDA_CALL(cudaDeviceSynchronize());

auto res = copy<mm::memory_kind::host>(out[0]);
Expand Down
12 changes: 7 additions & 5 deletions dali/kernels/imgproc/pointwise/multiply_add_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -130,6 +130,7 @@ TYPED_TEST(MultiplyAddGpuTest, check_kernel) {
TYPED_TEST(MultiplyAddGpuTest, setup_test) {
TheKernel<TypeParam> kernel;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNdims> in(this->input_device_, this->shapes_);
auto reqs = kernel.Setup(ctx, in, this->addends_, this->multipliers_);
ASSERT_EQ(this->shapes_.size(), static_cast<size_t>(reqs.output_shapes[0].num_samples()))
Expand All @@ -143,18 +144,19 @@ TYPED_TEST(MultiplyAddGpuTest, setup_test) {

TYPED_TEST(MultiplyAddGpuTest, run_test) {
TheKernel<TypeParam> kernel;
KernelContext c;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNdims> in(this->input_device_, this->shapes_);
OutListGPU<typename TypeParam::Out, kNdims> out(this->output_,
TensorListShape<kNdims>(this->shapes_));

auto reqs = kernel.Setup(c, in, this->addends_, this->multipliers_);
auto reqs = kernel.Setup(ctx, in, this->addends_, this->multipliers_);

ScratchpadAllocator sa;
sa.Reserve(reqs.scratch_sizes);
auto scratchpad = sa.GetScratchpad();
c.scratchpad = &scratchpad;
kernel.Run(c, out, in, this->addends_, this->multipliers_);
ctx.scratchpad = &scratchpad;
kernel.Run(ctx, out, in, this->addends_, this->multipliers_);
CUDA_CALL(cudaDeviceSynchronize());

auto res = copy<mm::memory_kind::host>(out[0]);
Expand Down
13 changes: 10 additions & 3 deletions dali/kernels/kernel_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "dali/kernels/scratch.h"
#include "dali/kernels/context.h"
#include "dali/kernels/kernel_req.h"
#include "dali/kernels/dynamic_scratchpad.h"
#include "dali/core/small_vector.h"
#include "dali/core/mm/memory_kind.h"

Expand Down Expand Up @@ -218,9 +219,15 @@ class DLL_PUBLIC KernelManager {
*/
template <typename Kernel, typename... OutInArgs>
void Run(int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
assert(static_cast<size_t>(thread_idx) < scratchpads.size());
auto &sa = GetScratchpadAllocator(thread_idx);
Run<Kernel>(sa, instance_idx, context, std::forward<OutInArgs>(out_in_args)...);
assert(instance_idx >= 0 &&
static_cast<size_t>(instance_idx) < NumInstances() &&
"Kernel instance index (instance_idx) out of range");
auto &inst = instances[instance_idx];
DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
auto *old_scratchpad = context.scratchpad;
context.scratchpad = &scratchpad;
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
context.scratchpad = old_scratchpad;
}

/**
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/math/transform_points_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -70,6 +70,7 @@ struct TransformPointsTest : ::testing::Test {
const auto *in_points = reinterpret_cast<const vec<in_dim, In> *>(in_tensor.data);
const auto *out_points = reinterpret_cast<const vec<out_dim, Out> *>(out_tensor.data);
KernelContext ctx;
ctx.gpu.stream = 0;
auto &req = kmgr_.Setup<Kernel>(0, ctx, in_tensor.shape);
ASSERT_EQ(req.output_shapes[0][0], out_tensor.shape);
kmgr_.Run<Kernel>(0, 0, ctx, out_tensor, in_tensor, M, T);
Expand Down Expand Up @@ -103,6 +104,7 @@ struct TransformPointsTest : ::testing::Test {

kmgr_.Resize<Kernel>(1, 1);
KernelContext ctx;
ctx.gpu.stream = 0;
auto &req = kmgr_.Setup<Kernel>(0, ctx, in_gpu.shape);
ASSERT_EQ(req.output_shapes[0], out_gpu.shape);
kmgr_.Run<Kernel>(0, 0, ctx, out_gpu, in_gpu, make_span(M), make_span(T));
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/normalize/normalize_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -246,6 +246,7 @@ class NormalizeImplGPUTest<std::pair<Out, In>> : public ::testing::Test {
void RunTest() {
kmgr_.Resize<Kernel>(1, 1);
KernelContext ctx;
ctx.gpu.stream = 0;
for (int iter = 0; iter < 3; iter++) {
auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
Expand Down Expand Up @@ -276,6 +277,7 @@ class NormalizeImplGPUTest<std::pair<Out, In>> : public ::testing::Test {
void RunPerf() {
kmgr_.Resize<Kernel>(1, 1);
KernelContext ctx;
ctx.gpu.stream = 0;
auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
ASSERT_EQ(req.output_shapes.size(), 1u);
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/reduce/reduce_gpu_test.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,6 +49,7 @@ struct ReductionKernelTest {
Args &&...args) {
in.reshape(in_shape);
ref.reshape(ref_out_shape);
ctx.gpu.stream = 0;
auto req = kernel.Setup(ctx, in_shape, axes, keep_dims, batch, std::forward<Args>(args)...);
ASSERT_EQ(req.output_shapes.size(), 1), req;
ASSERT_EQ(req.output_shapes[0], ref_out_shape), req;
Expand All @@ -66,6 +67,7 @@ struct ReductionKernelTest {
template <typename... Args>
void Run(Args &&...args) {
auto scratchpad = sa.GetScratchpad();
ctx.gpu.stream = 0;
ctx.scratchpad = &scratchpad;
kernel.Run(ctx, out.gpu(stream()), in.gpu(stream()), std::forward<Args>(args)...);
}
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/signal/fft/fft_postprocess_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -108,6 +108,7 @@ class FFTPostprocessTest<FFTPostprocessArgs<Out, In, Convert>> : public ::testin

ToFreqMajorSpectrum<Out, In, Convert> tr;
KernelContext ctx;
ctx.gpu.stream = 0;
ScratchpadAllocator sa;
KernelRequirements req = tr.Setup(ctx, in_shape);
ASSERT_EQ(req.output_shapes.size(), 1u);
Expand Down Expand Up @@ -174,6 +175,7 @@ class FFTPostprocessTest<FFTPostprocessArgs<Out, In, Convert>> : public ::testin

ConvertTimeMajorSpectrum<Out, In, Convert> tr;
KernelContext ctx;
ctx.gpu.stream = 0;
tr.Setup(ctx, in_shape);
tr.Run(ctx, out_gpu, in.gpu());
CUDA_CALL(cudaGetLastError());
Expand Down
Loading

0 comments on commit 5e1136b

Please sign in to comment.