Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use DynamicScratchpad in KernelManager. #3670

Merged
merged 7 commits into from
Feb 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dali/kernels/context.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2018-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,7 @@
#include <vector>
#include <algorithm>
#include <type_traits>
#include "dali/core/access_order.h"
#include "dali/core/tensor_view.h"
#include "dali/core/mm/memory_resource.h"
#include "dali/core/mm/memory_kind.h"
Expand All @@ -34,7 +35,7 @@ struct Context {};

template <>
struct Context<ComputeGPU> {
cudaStream_t stream = 0;
cudaStream_t stream = AccessOrder::null_stream();
};

class Scratchpad;
Expand Down
60 changes: 46 additions & 14 deletions dali/kernels/dynamic_scratchpad.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ class DynamicScratchpadImplT {
AccessOrder dealloc_order = {}) {
static_assert(!std::is_same<Kind, mm::memory_kind::host>::value,
"Cannot use a stream-ordered resource for plain host memory");
if (!dealloc_order.has_value())
dealloc_order = alloc_order;
adapter<Kind>() = { rsrc, alloc_order, dealloc_order };
set_upstream_resource<Kind>(&adapter<Kind>());
}
Expand Down Expand Up @@ -135,41 +137,71 @@ class DynamicScratchpad
initial_sizes_ = initial_sizes;
for (auto &s : initial_sizes_) {
if (s == 0)
s = 4096;
s = 0x10000; // 64k
}
if (!pinned_dealloc_order.has_value())
pinned_dealloc_order = device_order;
if (!managed_dealloc_order.has_value())
managed_dealloc_order = device_order;

device_order_ = device_order;
pinned_dealloc_order_ = pinned_dealloc_order;
managed_dealloc_order_ = managed_dealloc_order;
}

virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
void *ret = nullptr;
TYPE_SWITCH(kind_id, mm::kind2id, Kind,
(mm::memory_kind::host,
mm::memory_kind::pinned,
mm::memory_kind::device,
mm::memory_kind::managed),
(ret = AllocImpl<Kind>(bytes, alignment)),
(assert(!"Incorrect memory kind id");));
return ret;
}

template <typename T>
struct type_tag {};

void InitResource(type_tag<mm::memory_kind::host>) {
set_upstream_resource<mm::memory_kind::host>(mm::GetDefaultResource<mm::memory_kind::host>());
}

void InitResource(type_tag<mm::memory_kind::pinned>) {
set_upstream_resource<mm::memory_kind::pinned>(
mm::GetDefaultResource<mm::memory_kind::pinned>(),
AccessOrder::host(),
pinned_dealloc_order);
pinned_dealloc_order_);
}

void InitResource(type_tag<mm::memory_kind::device>) {
set_upstream_resource<mm::memory_kind::device>(
mm::GetDefaultResource<mm::memory_kind::device>(),
device_order);
device_order_);
}

void InitResource(type_tag<mm::memory_kind::managed>) {
set_upstream_resource<mm::memory_kind::managed>(
mm::GetDefaultResource<mm::memory_kind::managed>(),
AccessOrder::host(),
managed_dealloc_order);
managed_dealloc_order_);
}

virtual void *Alloc(mm::memory_kind_id kind_id, size_t bytes, size_t alignment) {
void *ret = nullptr;
TYPE_SWITCH(kind_id, mm::kind2id, Kind,
(mm::memory_kind::host,
mm::memory_kind::pinned,
mm::memory_kind::device,
mm::memory_kind::managed),
(ret = resource<Kind>().allocate(bytes, alignment)),
(assert(!"Incorrect memory kind id");));
return ret;
template <typename Kind>
void *AllocImpl(size_t bytes, size_t alignment) {
if (bytes == 0)
return nullptr; // do not initialize the resource in case of 0-sized allocation
JanuszL marked this conversation as resolved.
Show resolved Hide resolved

auto &r = resource<Kind>();
if (!r.get_upstream()) {
InitResource(type_tag<Kind>());
assert(r.get_upstream() != nullptr);
}
return r.allocate(bytes, alignment);
}

AccessOrder device_order_, pinned_dealloc_order_, managed_dealloc_order_;
};

} // namespace kernels
Expand Down
1 change: 1 addition & 0 deletions dali/kernels/imgproc/convolution/convolution_gpu_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ struct ConvolutionGpuKernelTest : public ::testing::Test {

void RunTest() {
KernelContext ctx_cpu, ctx_gpu;
ctx_gpu.gpu.stream = 0;
KernelCpu kernel_cpu;
KernelGpu kernel_gpu;

Expand Down
1 change: 1 addition & 0 deletions dali/kernels/imgproc/convolution/laplacian_gpu_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ struct LaplacianGpuTest : public ::testing::Test {

void RunTest() {
KernelContext ctx_cpu = {}, ctx_gpu = {};
ctx_gpu.gpu.stream = 0;
KernelCpu kernel_cpu;
KernelGpu kernel_gpu;
int nsamples = in_.shape.size();
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/imgproc/flip_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2019, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,6 +51,7 @@ class FlipGpuTest: public testing::TestWithParam<std::array<Index, sample_ndim>>

TEST_P(FlipGpuTest, ImplTest) {
KernelContext ctx;
ctx.gpu.stream = 0;
FlipGPU<float> kernel;
auto in_view = ttl_in_.gpu(nullptr);
ttl_in_.invalidate_cpu();
Expand All @@ -75,6 +76,7 @@ TEST_P(FlipGpuTest, ImplTest) {

TEST_P(FlipGpuTest, KernelTest) {
KernelContext ctx;
ctx.gpu.stream = 0;
FlipGPU<float> kernel;
auto in_view = ttl_in_.gpu(nullptr);
ttl_in_.invalidate_cpu();
Expand Down
23 changes: 14 additions & 9 deletions dali/kernels/imgproc/pointwise/linear_transformation_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -126,6 +126,7 @@ TYPED_TEST(LinearTransformationGpuTest, check_kernel) {
TYPED_TEST(LinearTransformationGpuTest, setup_test) {
TheKernel<TypeParam> kernel;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
ASSERT_EQ(this->out_shapes_.size(), static_cast<size_t>(reqs.output_shapes[0].num_samples()))
Expand All @@ -140,6 +141,7 @@ TYPED_TEST(LinearTransformationGpuTest, setup_test) {
TYPED_TEST(LinearTransformationGpuTest, setup_test_with_roi) {
TheKernel<TypeParam> kernel;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);
auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_),
make_cspan(this->rois_));
Expand All @@ -150,20 +152,21 @@ TYPED_TEST(LinearTransformationGpuTest, setup_test_with_roi) {

TYPED_TEST(LinearTransformationGpuTest, run_test) {
TheKernel<TypeParam> kernel;
KernelContext c;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);

auto reqs = kernel.Setup(c, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
auto reqs = kernel.Setup(ctx, in, make_cspan(this->vmat_), make_cspan(this->vvec_));

ScratchpadAllocator sa;
sa.Reserve(reqs.scratch_sizes);
auto scratchpad = sa.GetScratchpad();
c.scratchpad = &scratchpad;
ctx.scratchpad = &scratchpad;

OutListGPU<typename TypeParam::Out, kNDims> out(
this->output_, reqs.output_shapes[0].template to_static<kNDims>());

kernel.Run(c, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
kernel.Run(ctx, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_));
CUDA_CALL(cudaDeviceSynchronize());

auto res = copy<mm::memory_kind::host>(out[0]);
Expand All @@ -175,22 +178,24 @@ TYPED_TEST(LinearTransformationGpuTest, run_test) {

TYPED_TEST(LinearTransformationGpuTest, run_test_with_roi) {
TheKernel<TypeParam> kernel;
KernelContext c;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNDims> in(this->input_device_, this->in_shapes_);

auto reqs = kernel.Setup(c, in,
auto reqs = kernel.Setup(ctx, in,
make_cspan(this->vmat_), make_cspan(this->vvec_),
make_cspan(this->rois_));

ScratchpadAllocator sa;
sa.Reserve(reqs.scratch_sizes);
auto scratchpad = sa.GetScratchpad();
c.scratchpad = &scratchpad;
ctx.scratchpad = &scratchpad;

OutListGPU<typename TypeParam::Out, kNDims> out(
this->output_, reqs.output_shapes[0].template to_static<kNDims>());

kernel.Run(c, out, in, make_cspan(this->vmat_), make_cspan(this->vvec_), make_cspan(this->rois_));
kernel.Run(ctx, out, in,
make_cspan(this->vmat_), make_cspan(this->vvec_), make_cspan(this->rois_));
CUDA_CALL(cudaDeviceSynchronize());

auto res = copy<mm::memory_kind::host>(out[0]);
Expand Down
12 changes: 7 additions & 5 deletions dali/kernels/imgproc/pointwise/multiply_add_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -130,6 +130,7 @@ TYPED_TEST(MultiplyAddGpuTest, check_kernel) {
TYPED_TEST(MultiplyAddGpuTest, setup_test) {
TheKernel<TypeParam> kernel;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNdims> in(this->input_device_, this->shapes_);
auto reqs = kernel.Setup(ctx, in, this->addends_, this->multipliers_);
ASSERT_EQ(this->shapes_.size(), static_cast<size_t>(reqs.output_shapes[0].num_samples()))
Expand All @@ -143,18 +144,19 @@ TYPED_TEST(MultiplyAddGpuTest, setup_test) {

TYPED_TEST(MultiplyAddGpuTest, run_test) {
TheKernel<TypeParam> kernel;
KernelContext c;
KernelContext ctx;
ctx.gpu.stream = 0;
InListGPU<typename TypeParam::In, kNdims> in(this->input_device_, this->shapes_);
OutListGPU<typename TypeParam::Out, kNdims> out(this->output_,
TensorListShape<kNdims>(this->shapes_));

auto reqs = kernel.Setup(c, in, this->addends_, this->multipliers_);
auto reqs = kernel.Setup(ctx, in, this->addends_, this->multipliers_);

ScratchpadAllocator sa;
sa.Reserve(reqs.scratch_sizes);
auto scratchpad = sa.GetScratchpad();
c.scratchpad = &scratchpad;
kernel.Run(c, out, in, this->addends_, this->multipliers_);
ctx.scratchpad = &scratchpad;
kernel.Run(ctx, out, in, this->addends_, this->multipliers_);
CUDA_CALL(cudaDeviceSynchronize());

auto res = copy<mm::memory_kind::host>(out[0]);
Expand Down
13 changes: 10 additions & 3 deletions dali/kernels/kernel_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "dali/kernels/scratch.h"
#include "dali/kernels/context.h"
#include "dali/kernels/kernel_req.h"
#include "dali/kernels/dynamic_scratchpad.h"
#include "dali/core/small_vector.h"
#include "dali/core/mm/memory_kind.h"

Expand Down Expand Up @@ -218,9 +219,15 @@ class DLL_PUBLIC KernelManager {
*/
template <typename Kernel, typename... OutInArgs>
void Run(int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) {
assert(static_cast<size_t>(thread_idx) < scratchpads.size());
auto &sa = GetScratchpadAllocator(thread_idx);
Run<Kernel>(sa, instance_idx, context, std::forward<OutInArgs>(out_in_args)...);
assert(instance_idx >= 0 &&
static_cast<size_t>(instance_idx) < NumInstances() &&
"Kernel instance index (instance_idx) out of range");
auto &inst = instances[instance_idx];
DynamicScratchpad scratchpad({}, AccessOrder(context.gpu.stream));
auto *old_scratchpad = context.scratchpad;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need context.scratchpad for if we are not using it anyway here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used by the Run method invoked afterwards.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was rather asking about the ideas of replacing the old one with one created here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, now the old one is always NULL, so we don't need to store it. I'll simplify it (again, possibly in a follow-up).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, let do that in a follow-up.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So kernel manager scratchpads are also to be removed in a follow-up, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's what I brought up in open floor. Since this involves an API change (we no longer need thread_idx), the change affects a large number of files - hence the decision to make the more important change that potentially affects performance (this one) first and then clean the rest.

context.scratchpad = &scratchpad;
inst.get<Kernel>().Run(context, std::forward<OutInArgs>(out_in_args)...);
context.scratchpad = old_scratchpad;
}

/**
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/math/transform_points_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -70,6 +70,7 @@ struct TransformPointsTest : ::testing::Test {
const auto *in_points = reinterpret_cast<const vec<in_dim, In> *>(in_tensor.data);
const auto *out_points = reinterpret_cast<const vec<out_dim, Out> *>(out_tensor.data);
KernelContext ctx;
ctx.gpu.stream = 0;
auto &req = kmgr_.Setup<Kernel>(0, ctx, in_tensor.shape);
ASSERT_EQ(req.output_shapes[0][0], out_tensor.shape);
kmgr_.Run<Kernel>(0, 0, ctx, out_tensor, in_tensor, M, T);
Expand Down Expand Up @@ -103,6 +104,7 @@ struct TransformPointsTest : ::testing::Test {

kmgr_.Resize<Kernel>(1, 1);
KernelContext ctx;
ctx.gpu.stream = 0;
auto &req = kmgr_.Setup<Kernel>(0, ctx, in_gpu.shape);
ASSERT_EQ(req.output_shapes[0], out_gpu.shape);
kmgr_.Run<Kernel>(0, 0, ctx, out_gpu, in_gpu, make_span(M), make_span(T));
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/normalize/normalize_gpu_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -246,6 +246,7 @@ class NormalizeImplGPUTest<std::pair<Out, In>> : public ::testing::Test {
void RunTest() {
kmgr_.Resize<Kernel>(1, 1);
KernelContext ctx;
ctx.gpu.stream = 0;
for (int iter = 0; iter < 3; iter++) {
auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
Expand Down Expand Up @@ -276,6 +277,7 @@ class NormalizeImplGPUTest<std::pair<Out, In>> : public ::testing::Test {
void RunPerf() {
kmgr_.Resize<Kernel>(1, 1);
KernelContext ctx;
ctx.gpu.stream = 0;
auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
ASSERT_EQ(req.output_shapes.size(), 1u);
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/reduce/reduce_gpu_test.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,6 +49,7 @@ struct ReductionKernelTest {
Args &&...args) {
in.reshape(in_shape);
ref.reshape(ref_out_shape);
ctx.gpu.stream = 0;
auto req = kernel.Setup(ctx, in_shape, axes, keep_dims, batch, std::forward<Args>(args)...);
ASSERT_EQ(req.output_shapes.size(), 1), req;
ASSERT_EQ(req.output_shapes[0], ref_out_shape), req;
Expand All @@ -66,6 +67,7 @@ struct ReductionKernelTest {
template <typename... Args>
void Run(Args &&...args) {
auto scratchpad = sa.GetScratchpad();
ctx.gpu.stream = 0;
ctx.scratchpad = &scratchpad;
kernel.Run(ctx, out.gpu(stream()), in.gpu(stream()), std::forward<Args>(args)...);
}
Expand Down
4 changes: 3 additions & 1 deletion dali/kernels/signal/fft/fft_postprocess_test.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
// Copyright (c) 2020, 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -108,6 +108,7 @@ class FFTPostprocessTest<FFTPostprocessArgs<Out, In, Convert>> : public ::testin

ToFreqMajorSpectrum<Out, In, Convert> tr;
KernelContext ctx;
ctx.gpu.stream = 0;
ScratchpadAllocator sa;
KernelRequirements req = tr.Setup(ctx, in_shape);
ASSERT_EQ(req.output_shapes.size(), 1u);
Expand Down Expand Up @@ -174,6 +175,7 @@ class FFTPostprocessTest<FFTPostprocessArgs<Out, In, Convert>> : public ::testin

ConvertTimeMajorSpectrum<Out, In, Convert> tr;
KernelContext ctx;
ctx.gpu.stream = 0;
tr.Setup(ctx, in_shape);
tr.Run(ctx, out_gpu, in.gpu());
CUDA_CALL(cudaGetLastError());
Expand Down
Loading