[MXNET-105] Fix CuDNN performance after code refactor #10116
First changed file:
@@ -424,13 +424,19 @@ void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
   // MKLDNN batchnorm only works well on the special MKLDNN layout.
   if (SupportMKLDNNBN(inputs[0], param)
       && (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
-    std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
-    std::vector<NDArray> in_data(inputs.begin() + in_data_start,
-                                 inputs.begin() + aux_states_start);
-    std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
-                                    inputs.begin() + out_data_start);
-    std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
-    std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
+    static thread_local std::vector<NDArray> out_grad(1);
Review comments on this line:
- How does thread_local help here?
- Won't these hold a reference to the NDArray's Chunk data indefinitely?
- Author: Here I'm trying to avoid the memory allocation for std::vector. But you are right, it potentially causes a memory leak.
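To make the concern concrete, here is a minimal standalone sketch (an assumption-laden illustration, not MXNet code; `Handle` merely stands in for NDArray, which holds a shared_ptr to its Chunk). A static thread_local cache of such handles pins the previous call's buffer until the next call overwrites the slot or the thread exits:

```cpp
#include <iostream>
#include <memory>
#include <vector>

// Stand-in for NDArray: a cheap handle that shares ownership of a large buffer,
// much like NDArray holds a shared_ptr to its Chunk.
struct Handle {
  std::shared_ptr<std::vector<float>> chunk;
};

void backward_like(const Handle &input) {
  // Reused across calls on this thread. The copy stored here keeps a reference
  // to the buffer after the function returns, so the buffer cannot be freed
  // until the next call overwrites the slot or the thread terminates.
  static thread_local std::vector<Handle> out_grad(1);
  out_grad[0] = input;
  // ... run the backward pass with out_grad ...
}

int main() {
  auto buf = std::make_shared<std::vector<float>>(1 << 20);  // "large" tensor storage
  std::weak_ptr<std::vector<float>> watch = buf;

  backward_like(Handle{buf});
  buf.reset();  // the caller drops its last reference

  // The buffer is still alive: the thread_local vector pins it.
  std::cout << std::boolalpha
            << "buffer still alive after caller released it: " << !watch.expired() << "\n";
  return 0;
}
```

Avoiding the per-call std::vector allocation this way trades a tiny allocation for state that outlives the call, which is exactly the leak the reviewers point out.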
+    static thread_local std::vector<NDArray> out_data(3);
+    static thread_local std::vector<NDArray> in_data(3);
+    static thread_local std::vector<NDArray> aux_states(2);
+    out_grad[0] = inputs[0];
+    out_data[batchnorm::kMean] = inputs[1];
+    out_data[batchnorm::kVar] = inputs[2];
+    in_data[batchnorm::kData] = inputs[3];
+    in_data[batchnorm::kGamma] = inputs[4];
+    in_data[batchnorm::kBeta] = inputs[5];
+    aux_states[batchnorm::kMovingMean] = inputs[6];
+    aux_states[batchnorm::kMovingVar] = inputs[7];
+    const std::vector<NDArray> &in_grad = outputs;

     if (inputs[0].dtype() == mshadow::kFloat32) {
       MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
@@ -470,8 +476,6 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
                                                  DispatchMode *dispatch_mode,
                                                  std::vector<int> *in_attrs,
                                                  std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 11);
-  CHECK_EQ(out_attrs->size(), 5);
   DispatchMode wanted_mode;
 #if MXNET_USE_MKLDNN == 1
   if (dev_mask == mshadow::cpu::kDevMask)

@@ -486,6 +490,48 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
                                  dispatch_mode, wanted_mode);
 }

+std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::NodePtr& n,
+                                           const std::vector<nnvm::NodeEntry>& ograds) {
+  std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
+  for (uint32_t i = 0; i < out_data.size(); ++i) {
+    out_data[i] = nnvm::NodeEntry{n, i, 0};
+  }
+  std::vector<nnvm::NodeEntry> heads;
Review comments on this line:
- Please use reserve().
- Author: This code runs to build the computation graph, so it only runs once. Do we still need to call reserve()?
- Yes, please.
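For reference, a small standalone illustration (plain std::vector, not the nnvm types used above) of what reserve() buys: with the capacity set up front, the eight push_back calls never reallocate or move the entries already inserted.

```cpp
#include <iostream>
#include <vector>

int main() {
  std::vector<int> heads;          // stands in for the vector of NodeEntry above
  heads.reserve(8);                // ograd, mean, var, data, gamma, beta, moving_mean, moving_var
  const int *before = heads.data();
  for (int i = 0; i < 8; ++i) {
    heads.push_back(i);            // never triggers a reallocation: capacity is already 8
  }
  std::cout << std::boolalpha
            << "reallocated: " << (heads.data() != before) << "\n";  // prints false
  return 0;
}
```

Even for code that runs only once per graph construction, reserving the known final size is essentially free and avoids a handful of reallocations during the push_back sequence.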
+  heads.push_back(ograds[0]);
+  heads.push_back(out_data[batchnorm::kMean]);
+  heads.push_back(out_data[batchnorm::kVar]);
+  heads.push_back(n->inputs[batchnorm::kData]);
+  heads.push_back(n->inputs[batchnorm::kGamma]);
+  heads.push_back(n->inputs[batchnorm::kBeta]);
+  heads.push_back(n->inputs[batchnorm::kInMovingMean]);
+  heads.push_back(n->inputs[batchnorm::kInMovingVar]);
+
+  // add all the auxiliary data
+  //for (uint32_t i = 0; i < prop.aux_states.size(); ++i) {
+  //  inputs.emplace_back(ptr->inputs[i + prop.arguments.size()]);
+  //}
Review comment on this line:
- ? (presumably asking why the commented-out block above is kept)
+  nnvm::NodePtr gnode = nnvm::Node::Create();
+  gnode->inputs = std::move(heads);
+  gnode->control_deps.emplace_back(n);
+  gnode->attrs = n->attrs;
+  gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
+  gnode->attrs.name = n->attrs.name + "_backward";
+  // The input of batchnorm
+  std::vector<nnvm::NodeEntry> in_grad(5);
+  for (uint32_t i = 0; i < 3; ++i) {
+    in_grad[i] = nnvm::NodeEntry{gnode, i, 0};
+  }
+  // attach no gradient node to forbid gradient on aux_state
+  nnvm::NodePtr ng = nnvm::Node::Create();
+  ng->attrs.op = Op::Get("_NoGradient");
+  ng->attrs.name = "NoGradient";
+  // the aux state of batchnorm
+  for (uint32_t i = 0; i < 2; ++i) {
+    in_grad[i + 3] = nnvm::NodeEntry{ng, 0, 0};
+  }
+  return in_grad;
+}
+
 NNVM_REGISTER_OP(BatchNorm)
 .describe(R"code(Batch normalization.

@@ -559,7 +605,7 @@ then set ``gamma`` to 1 and its gradient to 0.
 #if MXNET_USE_MKLDNN == 1
 .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
 #endif
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
+.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
 #if MXNET_USE_MKLDNN == 1
 .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
   return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};

@@ -583,7 +629,7 @@ then set ``gamma`` to 1 and its gradient to 0.
 });

 NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_num_outputs(5)
+.set_num_outputs(3)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 .set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
 #if MXNET_USE_MKLDNN == 1
Second changed file:
@@ -690,13 +690,18 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
                                const OpContext& ctx, const std::vector<TBlob>& inputs,
                                const std::vector<OpReqType>& req,
                                const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 11U);
+  CHECK_EQ(inputs.size(), 8U);
   BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
-  std::vector<TBlob> out_grad(1, inputs[0]);
-  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
-  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
-  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
-  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
+  static thread_local std::vector<TBlob> out_grad(1);
+  static thread_local std::vector<TBlob> out_data(3);
+  static thread_local std::vector<TBlob> in_data(3);
+  static thread_local std::vector<TBlob> aux_states(2);
+  out_grad[0] = inputs[0];
+  out_data[batchnorm::kMean] = inputs[1];
+  out_data[batchnorm::kVar] = inputs[2];
+  in_data[batchnorm::kData] = inputs[3];
+  in_data[batchnorm::kGamma] = inputs[4];
+  const std::vector<TBlob> &in_grad = outputs;
   int dtype = inputs[0].type_flag_;
   TShape shape = inputs[0].shape_;

@@ -709,12 +714,16 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
                                              req, in_grad, aux_states);
       })
     } else {
+      aux_states[batchnorm::kMovingMean] = inputs[6];
+      aux_states[batchnorm::kMovingVar] = inputs[7];
       MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
         BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
                                                in_data, out_data, req, in_grad, aux_states);
       })
     }
 #else
+  aux_states[batchnorm::kMovingMean] = inputs[6];
+  aux_states[batchnorm::kMovingVar] = inputs[7];
Review comments on this line:
- @zheng-da, aux_states is not defined if USE_CUDNN is not enabled. @marcoabreu, it seems there is no pure-CUDA CI environment that is not built with cuDNN.
- Author: I see, I'll update it.
- Agreed. @marcoabreu, could you add a CI job with CUDA only?
- Sure, no problem at all! Compilation only, or do we need tests as well?
- I think it's better to run the code at least once. We probably don't need to try both Python 2 and Python 3, something like that.
- Done: #10281
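The build-configuration issue above can be illustrated with a standalone sketch (toy code, not the MXNet sources; `USE_FUSED_PATH` is a made-up flag standing in for MXNET_USE_CUDNN). If the assignments exist under only one preprocessor branch, a build without that flag silently runs with default-constructed values, which is why the diff also assigns aux_states in the #else branch:

```cpp
#include <iostream>
#include <vector>

// Compile with -DUSE_FUSED_PATH to emulate the cuDNN-enabled build; without the
// flag the assignments are compiled out and aux_states stays value-initialized.
std::vector<int> gather_aux(const std::vector<int> &inputs) {
  std::vector<int> aux_states(2);
#if defined(USE_FUSED_PATH)
  aux_states[0] = inputs[6];  // moving mean
  aux_states[1] = inputs[7];  // moving variance
#endif
  (void)inputs;               // silence the unused-parameter warning in the no-flag build
  return aux_states;
}

int main() {
  const std::vector<int> inputs{0, 1, 2, 3, 4, 5, 60, 70};
  const std::vector<int> aux = gather_aux(inputs);
  std::cout << aux[0] << " " << aux[1] << "\n";  // "60 70" with the flag, "0 0" without
  return 0;
}
```

A CUDA-only CI job catches this class of gap because it is the only configuration that compiles and runs the #else path.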
   MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
     BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
                                            in_data, out_data, req, in_grad, aux_states);
Review comments on this change:
- This is probably too many thread_locals. Why are we copying the vectors in the first place? Why not change the interface of the operator's Forward/Backward instead?
- Author: Good question. I'll do that instead.
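A hypothetical sketch of the direction suggested above (the `Span` type, the `slice` helper, and `BatchNormBackwardLike` are illustrative names, not the actual MXNet interface): instead of copying slices of `inputs` into fresh or thread_local vectors, the backward implementation could accept lightweight non-owning views of the caller's storage, so nothing is copied and no state outlives the call.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// A minimal non-owning view over a contiguous range (what std::span provides in C++20).
template <typename T>
struct Span {
  const T *ptr = nullptr;
  std::size_t len = 0;
  const T &operator[](std::size_t i) const { return ptr[i]; }
  std::size_t size() const { return len; }
};

template <typename T>
Span<T> slice(const std::vector<T> &v, std::size_t begin, std::size_t end) {
  return Span<T>{v.data() + begin, end - begin};
}

// A backward signature taking views instead of freshly built vectors:
// no copies, no shared thread_local state, nothing pinned after the call.
template <typename T>
void BatchNormBackwardLike(Span<T> out_grad, Span<T> out_data,
                           Span<T> in_data, Span<T> aux_states) {
  std::cout << out_grad.size() << " " << out_data.size() << " "
            << in_data.size() << " " << aux_states.size() << "\n";  // prints "1 2 3 2"
}

int main() {
  // Stand-in for the 8 backward inputs: ograd, mean, var, data, gamma, beta,
  // moving_mean, moving_var.
  std::vector<int> inputs{0, 1, 2, 3, 4, 5, 6, 7};
  BatchNormBackwardLike(slice(inputs, 0, 1),   // out_grad
                        slice(inputs, 1, 3),   // mean, var
                        slice(inputs, 3, 6),   // data, gamma, beta
                        slice(inputs, 6, 8));  // moving_mean, moving_var
  return 0;
}
```

Whether this is done with a dedicated struct, std::span, or by having BatchNormBackward read directly from `inputs` at fixed offsets is an implementation choice; the point is that both the per-call copies and the thread_local workaround disappear.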