This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-105] Fix CuDNN performance after code refactor #10116

Merged: 25 commits merged on Mar 22, 2018
Changes from 5 commits
26 changes: 14 additions & 12 deletions src/operator/nn/batch_norm-inl.h
@@ -261,19 +261,21 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 11U);
CHECK_EQ(inputs.size(), 8U);
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
int num_out_grads = param.output_mean_var ? 3U : 1U;
int in_data_start = 3;
int aux_states_start = in_data_start + batchnorm::kInMovingMean;
int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
std::vector<TBlob> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
std::vector<TBlob> in_data(inputs.begin() + in_data_start,
inputs.begin() + aux_states_start);
std::vector<TBlob> aux_states(inputs.begin() + aux_states_start,
inputs.begin() + out_data_start);
std::vector<TBlob> out_data(inputs.begin() + out_data_start, inputs.end());
std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
static thread_local std::vector<TBlob> out_grad(1);
@piiswrong (Contributor), Mar 16, 2018:
This is probably too many thread_local. Why are we copying the vectors in the first place? Why not change the interface of operator Forward/Backward?

Contributor Author:
Good question. I'll do that instead.
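To illustrate the pattern under discussion, here is a minimal standalone sketch (not part of this diff; `Blob` is a hypothetical stand-in for `TBlob`, which is a non-owning view): constructing the argument vectors from iterators heap-allocates on every call, while `static thread_local` vectors are allocated once per thread and then reused.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stand-in for TBlob (a non-owning view of a tensor's memory).
struct Blob { const void *dptr = nullptr; };

// Per-call vectors: each invocation heap-allocates and frees the buffers.
void BackwardPerCall(const std::vector<Blob> &inputs) {
  std::vector<Blob> out_grad(inputs.begin(), inputs.begin() + 1);
  std::vector<Blob> in_data(inputs.begin() + 1, inputs.begin() + 4);
  // ... dispatch to the real kernel here ...
}

// thread_local reuse: the vectors are built once per thread and keep their
// capacity, so later calls only copy the small Blob handles.
void BackwardReused(const std::vector<Blob> &inputs) {
  static thread_local std::vector<Blob> out_grad(1);
  static thread_local std::vector<Blob> in_data(3);
  out_grad[0] = inputs[0];
  in_data[0] = inputs[1];
  in_data[1] = inputs[2];
  in_data[2] = inputs[3];
  // ... dispatch to the real kernel here ...
}

int main() {
  std::vector<Blob> inputs(8);
  for (int i = 0; i < 1000; ++i) {
    BackwardPerCall(inputs);  // roughly two heap allocations per call
    BackwardReused(inputs);   // no per-call allocation after the first call
  }
  std::printf("done\n");
  return 0;
}
```

Because each thread gets its own copy, the reuse is safe across worker threads; the caveat is that the cached elements outlive the call, which only matters once they own memory (see the NDArray discussion in batch_norm.cc below).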

static thread_local std::vector<TBlob> out_data(3);
static thread_local std::vector<TBlob> in_data(3);
static thread_local std::vector<TBlob> aux_states(2);
out_grad[0] = inputs[0];
out_data[batchnorm::kMean] = inputs[1];
out_data[batchnorm::kVar] = inputs[2];
in_data[batchnorm::kData] = inputs[3];
in_data[batchnorm::kGamma] = inputs[4];
in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
const std::vector<TBlob> &in_grad = outputs;

MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
BatchNormBackward<xpu, DType, AccReal>(ctx, param, out_grad, in_data, out_data, req,
68 changes: 57 additions & 11 deletions src/operator/nn/batch_norm.cc
@@ -424,13 +424,19 @@ void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
// MKLDNN batchnorm only works well on the special MKLDNN layout.
if (SupportMKLDNNBN(inputs[0], param)
&& (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
std::vector<NDArray> in_data(inputs.begin() + in_data_start,
inputs.begin() + aux_states_start);
std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
inputs.begin() + out_data_start);
std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
static thread_local std::vector<NDArray> out_grad(1);
Member:
How does thread_local help here?

@cjolivier01 (Member), Mar 14, 2018:
Won't these hold a reference to the NDArray's Chunk data indefinitely?

Contributor Author:
Here I'm trying to avoid memory allocation for std::vector. But you are right, it potentially causes a memory leak.
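A minimal standalone sketch of the lifetime concern raised above (not MXNet code; `Array` is a hypothetical stand-in that assumes, as the comment does, that an NDArray keeps its data alive through a reference-counted Chunk): the thread_local cache extends the lifetime of the previous call's arrays until the next call overwrites them or the thread exits.

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// Hypothetical stand-in for an NDArray whose data lives in a shared,
// reference-counted Chunk.
struct Array { std::shared_ptr<std::vector<float>> chunk; };

// The thread_local cache keeps whatever Arrays the *previous* call stored
// alive until they are overwritten (or the thread exits).
void Backward(const Array &grad) {
  static thread_local std::vector<Array> cached(1);
  cached[0] = grad;  // the previous call's chunk is released only here
  // ... use cached[0] ...
}

int main() {
  {
    Array big{std::make_shared<std::vector<float>>(1 << 20)};  // ~4 MB of floats
    Backward(big);
  }  // `big` is out of scope, but its chunk is still referenced from `cached`
  std::printf("the 4 MB chunk from the first call is still alive here\n");

  Backward(Array{std::make_shared<std::vector<float>>(1)});
  std::printf("only now has the first chunk been released\n");
  return 0;
}
```

Strictly this is lifetime extension rather than an unbounded leak, but it can pin large buffers well past their last use, which is the risk the reviewer points out.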

static thread_local std::vector<NDArray> out_data(3);
static thread_local std::vector<NDArray> in_data(3);
static thread_local std::vector<NDArray> aux_states(2);
out_grad[0] = inputs[0];
out_data[batchnorm::kMean] = inputs[1];
out_data[batchnorm::kVar] = inputs[2];
in_data[batchnorm::kData] = inputs[3];
in_data[batchnorm::kGamma] = inputs[4];
in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
const std::vector<NDArray> &in_grad = outputs;

if (inputs[0].dtype() == mshadow::kFloat32) {
MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
@@ -470,8 +476,6 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
DispatchMode *dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 11);
CHECK_EQ(out_attrs->size(), 5);
DispatchMode wanted_mode;
#if MXNET_USE_MKLDNN == 1
if (dev_mask == mshadow::cpu::kDevMask)
@@ -486,6 +490,48 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
dispatch_mode, wanted_mode);
}

std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::NodePtr& n,
const std::vector<nnvm::NodeEntry>& ograds) {
std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
for (uint32_t i = 0; i < out_data.size(); ++i) {
out_data[i] = nnvm::NodeEntry{n, i, 0};
}
std::vector<nnvm::NodeEntry> heads;
Member:
Please use reserve().

Contributor Author:
This code runs to build the computation graph. It only runs once. Do we still need to call reserve()?

Member:
Yes, please.
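Sketched below is what the requested reserve() amounts to (a self-contained toy rather than the graph-building code itself; `Entry` is a stand-in for nnvm::NodeEntry): reserving the known final size up front gives a single allocation no matter how many push_back calls follow, and the count 8 mirrors the eight heads pushed immediately below.

```cpp
#include <vector>

// Stand-in for nnvm::NodeEntry, only so the sketch compiles on its own.
struct Entry { int node_id; int index; int version; };

int main() {
  std::vector<Entry> heads;
  heads.reserve(8);  // ograd, mean, var, data, gamma, beta, moving_mean, moving_var
  for (int i = 0; i < 8; ++i)
    heads.push_back(Entry{i, 0, 0});  // no reallocation happens inside this loop
  return 0;
}
```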

heads.push_back(ograds[0]);
heads.push_back(out_data[batchnorm::kMean]);
heads.push_back(out_data[batchnorm::kVar]);
heads.push_back(n->inputs[batchnorm::kData]);
heads.push_back(n->inputs[batchnorm::kGamma]);
heads.push_back(n->inputs[batchnorm::kBeta]);
heads.push_back(n->inputs[batchnorm::kInMovingMean]);
heads.push_back(n->inputs[batchnorm::kInMovingVar]);

// add all the auxiliary data
//for (uint32_t i = 0; i < prop.aux_states.size(); ++i) {
// inputs.emplace_back(ptr->inputs[i + prop.arguments.size()]);
//}
Member:
? (questioning the commented-out block above)

nnvm::NodePtr gnode = nnvm::Node::Create();
gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
gnode->attrs = n->attrs;
gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
gnode->attrs.name = n->attrs.name + "_backward";
// The input of batchnorm
std::vector<nnvm::NodeEntry> in_grad(5);
for (uint32_t i = 0; i < 3; ++i) {
in_grad[i] = nnvm::NodeEntry{gnode, i, 0};
}
// attach no gradient node to forbid gradient on aux_state
nnvm::NodePtr ng = nnvm::Node::Create();
ng->attrs.op = Op::Get("_NoGradient");
ng->attrs.name = "NoGradient";
// the aux state of batchnorm
for (uint32_t i = 0; i < 2; ++i) {
in_grad[i + 3] = nnvm::NodeEntry{ng, 0, 0};
}
return in_grad;
}

NNVM_REGISTER_OP(BatchNorm)
.describe(R"code(Batch normalization.

@@ -559,7 +605,7 @@ then set ``gamma`` to 1 and its gradient to 0.
#if MXNET_USE_MKLDNN == 1
.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
#endif
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
#if MXNET_USE_MKLDNN == 1
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
@@ -583,7 +629,7 @@ then set ``gamma`` to 1 and its gradient to 0.
});

NNVM_REGISTER_OP(_backward_BatchNorm)
.set_num_outputs(5)
.set_num_outputs(3)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
#if MXNET_USE_MKLDNN == 1
21 changes: 15 additions & 6 deletions src/operator/nn/batch_norm.cu
@@ -690,13 +690,18 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 11U);
CHECK_EQ(inputs.size(), 8U);
BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<TBlob> out_grad(1, inputs[0]);
std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
static thread_local std::vector<TBlob> out_grad(1);
static thread_local std::vector<TBlob> out_data(3);
static thread_local std::vector<TBlob> in_data(3);
static thread_local std::vector<TBlob> aux_states(2);
out_grad[0] = inputs[0];
out_data[batchnorm::kMean] = inputs[1];
out_data[batchnorm::kVar] = inputs[2];
in_data[batchnorm::kData] = inputs[3];
in_data[batchnorm::kGamma] = inputs[4];
const std::vector<TBlob> &in_grad = outputs;
int dtype = inputs[0].type_flag_;
TShape shape = inputs[0].shape_;

@@ -709,12 +714,16 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
req, in_grad, aux_states);
})
} else {
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
in_data, out_data, req, in_grad, aux_states);
})
}
#else
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
Member:
@zheng-da aux_states is not defined if USE_CUDNN is not enabled. @marcoabreu it seems there is no pure CUDA CI environment that is not built with cuDNN.

Contributor Author:
I see. I'll update it.

Contributor Author:
Agreed. @marcoabreu could you add a CI job with CUDA only?

@marcoabreu (Contributor), Mar 27, 2018:
Sure, no problem at all! Compilation only, or do we need tests as well?

Contributor Author:
I think it's better to run the code at least once. We probably don't need to try both Python 2 and Python 3, something like that.

Contributor:
Done: #10281

MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
in_data, out_data, req, in_grad, aux_states);
2 changes: 1 addition & 1 deletion src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -302,7 +302,7 @@ void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam &param,
const std::vector<NDArray> &in_grad,
const std::vector<NDArray> &aux_states) {
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
CHECK_EQ(out_grad.size(), 1U);
CHECK_EQ(in_data.size(), 3U);
CHECK_EQ(out_data.size(), 3U);
CHECK_EQ(in_grad.size(), 3U);