Skip to content

Commit

Permalink
[MXNET-105] Fix CuDNN performance after code refactor (apache#10116)
Browse files Browse the repository at this point in the history
* Reduce #inputs/outputs of batchnorm backward.

* Pass more arrays to BN.

* Make std::vector thread local.

* Set inputs of BN backward for other cases.

* Fix for other cases.

* Remove commented code.

* Fix a potential memory leak.

* Fix a compile error in mkldnn.

* Fix an error.

* reserve space for std::vector.

* Fix alignment.

* Fix cpp unit test.

* Fix BN CPP unit tests.

* Fix a compile error.

* Fix compilation error.

* Move Op signature.

* Cache CuDNN conv op.

* Fix compile error.

* Fix compile error.

* Remove thread_local.

* Reduce mem alloc when caching cudnn conv.

* Fix a lint error.

* Cache CuDNN deconv.

* Fix lint error.
  • Loading branch information
zheng-da committed Jun 28, 2018
1 parent 78ff0e9 commit 257ab66
Show file tree
Hide file tree
Showing 18 changed files with 319 additions and 217 deletions.
47 changes: 22 additions & 25 deletions src/operator/nn/batch_norm-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,16 +224,25 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
*/
template <typename xpu, typename DType, typename AccReal>
void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &inputs,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_states) {
CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
CHECK_EQ(in_data.size(), 3U);
CHECK_EQ(out_data.size(), 3U);
CHECK_EQ(in_grad.size(), 3U);
const std::vector<TBlob> &outputs) {
CHECK_EQ(inputs.size(), 8U);
CHECK_EQ(outputs.size(), 3U);
std::vector<TBlob> out_grad(1);
std::vector<TBlob> out_data(3);
std::vector<TBlob> in_data(3);
std::vector<TBlob> aux_states(2);

out_grad[0] = inputs[0];
out_data[batchnorm::kMean] = inputs[1];
out_data[batchnorm::kVar] = inputs[2];
in_data[batchnorm::kData] = inputs[3];
in_data[batchnorm::kGamma] = inputs[4];
in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
const std::vector<TBlob> &in_grad = outputs;
mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
out_data, req, in_grad, aux_states);
Expand Down Expand Up @@ -261,23 +270,11 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 11U);
CHECK_EQ(inputs.size(), 8U);
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
int num_out_grads = param.output_mean_var ? 3U : 1U;
int in_data_start = 3;
int aux_states_start = in_data_start + batchnorm::kInMovingMean;
int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
std::vector<TBlob> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
std::vector<TBlob> in_data(inputs.begin() + in_data_start,
inputs.begin() + aux_states_start);
std::vector<TBlob> aux_states(inputs.begin() + aux_states_start,
inputs.begin() + out_data_start);
std::vector<TBlob> out_data(inputs.begin() + out_data_start, inputs.end());
std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);

MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
BatchNormBackward<xpu, DType, AccReal>(ctx, param, out_grad, in_data, out_data, req,
in_grad, aux_states);

MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
BatchNormBackward<xpu, DType, AccReal>(ctx, param, inputs, req, outputs);
});
}

Expand Down
74 changes: 57 additions & 17 deletions src/operator/nn/batch_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -413,24 +413,26 @@ void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
const std::vector<NDArray> &inputs,
const std::vector<OpReqType> &req,
const std::vector<NDArray> &outputs) {
CHECK_EQ(inputs.size(), 11U);
CHECK_EQ(inputs.size(), 8U);
const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
int num_out_grads = param.output_mean_var ? 3U : 1U;
int in_data_start = 3;
int aux_states_start = in_data_start + batchnorm::kInMovingMean;
int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;

TShape shape = inputs[0].shape();
// MKLDNN batchnorm only works well on the special MKLDNN layout.
if (SupportMKLDNNBN(inputs[0], param)
&& (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
std::vector<NDArray> in_data(inputs.begin() + in_data_start,
inputs.begin() + aux_states_start);
std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
inputs.begin() + out_data_start);
std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
&& (inputs[3].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
std::vector<NDArray> out_grad(1);
std::vector<NDArray> out_data(3);
std::vector<NDArray> in_data(3);
std::vector<NDArray> aux_states(2);
out_grad[0] = inputs[0];
out_data[batchnorm::kMean] = inputs[1];
out_data[batchnorm::kVar] = inputs[2];
in_data[batchnorm::kData] = inputs[3];
in_data[batchnorm::kGamma] = inputs[4];
in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
const std::vector<NDArray> &in_grad = outputs;

if (inputs[0].dtype() == mshadow::kFloat32) {
MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
Expand Down Expand Up @@ -470,8 +472,6 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
DispatchMode *dispatch_mode,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
CHECK_EQ(in_attrs->size(), 11);
CHECK_EQ(out_attrs->size(), 5);
DispatchMode wanted_mode;
#if MXNET_USE_MKLDNN == 1
if (dev_mask == mshadow::cpu::kDevMask)
Expand All @@ -486,6 +486,46 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
dispatch_mode, wanted_mode);
}

/*!
 * \brief Build the gradient node for a BatchNorm forward node.
 *
 * Creates a "_backward_BatchNorm" node fed with exactly eight entries, in order:
 * the first output gradient, the forward node's kMean and kVar outputs, and the
 * forward node's kData, kGamma, kBeta, kInMovingMean and kInMovingVar inputs.
 *
 * \param n the forward BatchNorm node
 * \param ograds gradients w.r.t. the forward outputs; only ograds[0] is used
 * \return five entries: outputs 0..2 of the backward node, followed by two
 *         entries that both refer to a single "_NoGradient" node
 */
std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::NodePtr& n,
                                           const std::vector<nnvm::NodeEntry>& ograds) {
  // One entry per forward output, so the saved statistics can be picked by enum index.
  const uint32_t num_fwd_outputs = n->num_outputs();
  std::vector<nnvm::NodeEntry> fwd_outputs;
  fwd_outputs.reserve(num_fwd_outputs);
  for (uint32_t idx = 0; idx < num_fwd_outputs; ++idx) {
    fwd_outputs.push_back(nnvm::NodeEntry{n, idx, 0});
  }

  // The backward node inherits the forward attributes; only op and name are overridden.
  nnvm::NodePtr backward = nnvm::Node::Create();
  backward->attrs = n->attrs;
  backward->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
  backward->attrs.name = n->attrs.name + "_backward";
  backward->control_deps.emplace_back(n);

  // Wire up the eight inputs of the backward node.
  std::vector<nnvm::NodeEntry>& bwd_inputs = backward->inputs;
  bwd_inputs.reserve(8);
  bwd_inputs.push_back(ograds[0]);
  bwd_inputs.push_back(fwd_outputs[batchnorm::kMean]);
  bwd_inputs.push_back(fwd_outputs[batchnorm::kVar]);
  bwd_inputs.push_back(n->inputs[batchnorm::kData]);
  bwd_inputs.push_back(n->inputs[batchnorm::kGamma]);
  bwd_inputs.push_back(n->inputs[batchnorm::kBeta]);
  bwd_inputs.push_back(n->inputs[batchnorm::kInMovingMean]);
  bwd_inputs.push_back(n->inputs[batchnorm::kInMovingVar]);

  std::vector<nnvm::NodeEntry> result;
  result.reserve(5);

  // Gradients for the inputs of batchnorm come from the backward node.
  for (uint32_t idx = 0; idx < 3; ++idx) {
    result.push_back(nnvm::NodeEntry{backward, idx, 0});
  }

  // The aux states of batchnorm get a shared no-gradient node, which forbids
  // gradient flow into them.
  nnvm::NodePtr no_grad = nnvm::Node::Create();
  no_grad->attrs.op = Op::Get("_NoGradient");
  no_grad->attrs.name = "NoGradient";
  for (uint32_t idx = 0; idx < 2; ++idx) {
    result.push_back(nnvm::NodeEntry{no_grad, 0, 0});
  }
  return result;
}

NNVM_REGISTER_OP(BatchNorm)
.describe(R"code(Batch normalization.
Expand Down Expand Up @@ -559,7 +599,7 @@ then set ``gamma`` to 1 and its gradient to 0.
#if MXNET_USE_MKLDNN == 1
.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
#endif
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
#if MXNET_USE_MKLDNN == 1
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
Expand All @@ -583,7 +623,7 @@ then set ``gamma`` to 1 and its gradient to 0.
});

NNVM_REGISTER_OP(_backward_BatchNorm)
.set_num_outputs(5)
.set_num_outputs(3)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
#if MXNET_USE_MKLDNN == 1
Expand Down
18 changes: 6 additions & 12 deletions src/operator/nn/batch_norm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -690,13 +690,8 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 11U);
CHECK_EQ(inputs.size(), 8U);
BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<TBlob> out_grad(1, inputs[0]);
std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
int dtype = inputs[0].type_flag_;
TShape shape = inputs[0].shape_;

Expand All @@ -705,19 +700,18 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
&& param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
GetCuDNNOp<DType>(param).Backward(ctx, out_grad, in_data, out_data,
req, in_grad, aux_states);
GetCuDNNOp<DType>(param).Backward(ctx, inputs, req, outputs);
})
} else {
MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
in_data, out_data, req, in_grad, aux_states);
BatchNormBackward<gpu, DType, AccReal>(ctx, param, inputs, req, outputs);
})
}
#else
aux_states[batchnorm::kMovingMean] = inputs[6];
aux_states[batchnorm::kMovingVar] = inputs[7];
MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
in_data, out_data, req, in_grad, aux_states);
BatchNormBackward<gpu, DType, AccReal>(ctx, param, inputs, req, outputs);
});
#endif
}
Expand Down
2 changes: 2 additions & 0 deletions src/operator/nn/convolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
}
};

// Signature type used as the lookup key when caching operators built from a ConvolutionParam.
typedef ParamOpSign<ConvolutionParam> ConvSignature;

} // namespace op
} // namespace mxnet

Expand Down
37 changes: 32 additions & 5 deletions src/operator/nn/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,40 @@ static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
const Context& ctx) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local CuDNNConvolutionOp<DType> op;
static thread_local std::unordered_map<ConvSignature,
std::shared_ptr<CuDNNConvolutionOp<DType> >,
OpHash> ops;
#else
static MX_THREAD_LOCAL CuDNNConvolutionOp<DType> op;
static MX_THREAD_LOCAL std::unordered_map<ConvSignature,
std::shared_ptr<CuDNNConvolutionOp<DType> >,
OpHash> ops;
#endif
op.Init(param, forward_compute_type, backward_compute_type,
in_shape, out_shape, ctx);
return op;
ConvSignature key(param);
size_t ndim = 0;
for (auto &s : in_shape)
ndim += s.ndim();
for (auto &s : out_shape)
ndim += s.ndim();
key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
+ ndim + 1 /* for dev_id */);

key.AddSign(forward_compute_type);
key.AddSign(backward_compute_type);
key.AddSign(in_shape);
key.AddSign(out_shape);
key.AddSign(ctx.dev_id);

auto it = ops.find(key);
if (it == ops.end()) {
std::shared_ptr<CuDNNConvolutionOp<DType>> op(new CuDNNConvolutionOp<DType>());
auto ins_ret = ops.insert(std::pair<ConvSignature, std::shared_ptr<CuDNNConvolutionOp<DType>>>(
key, op));
CHECK(ins_ret.second);
it = ins_ret.first;
it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
out_shape, ctx);
}
return *it->second;
}
#endif

Expand Down
51 changes: 26 additions & 25 deletions src/operator/nn/cudnn/cudnn_batch_norm-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ class CuDNNBatchNormOp {
}

void Forward(const OpContext &ctx,
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &aux_states) {
const std::vector<TBlob> &in_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &out_data,
const std::vector<TBlob> &aux_states) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(in_data.size(), 3U);
Expand Down Expand Up @@ -158,29 +158,30 @@ class CuDNNBatchNormOp {
}

void Backward(const OpContext &ctx,
const std::vector<TBlob> &out_grad,
const std::vector<TBlob> &in_data,
const std::vector<TBlob> &out_data,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &in_grad,
const std::vector<TBlob> &aux_states) {
const std::vector<TBlob> &inputs,
const std::vector<OpReqType> &req,
const std::vector<TBlob> &outputs) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(out_grad.size(), 1U);
CHECK_EQ(in_data.size(), 3U);
CHECK_EQ(out_data.size(), 3U);
CHECK_EQ(in_grad.size(), 3U);
CHECK_EQ(inputs.size(), 8U);
CHECK_EQ(outputs.size(), 3U);
CHECK(ctx.is_train && !param_.use_global_stats)
<< "use global statistics is not yet supported in CuDNNBatchNorm";

Init(in_data[cudnnbatchnorm::kData]);
// Rename the inputs and outputs.
const TBlob &out_grad = inputs[0];
const TBlob &out_mean = inputs[1];
const TBlob &out_var = inputs[2];
const TBlob &in_data = inputs[3];
const TBlob &in_gamma = inputs[4];
const std::vector<TBlob> &in_grad = outputs;

Init(in_data);
Stream<gpu> *s = ctx.get_stream<gpu>();
Tensor<gpu, 4, DType> x =
in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
Tensor<gpu, 4, DType> x = in_data.get_with_shape<gpu, 4, DType>(shape_, s);
Tensor<gpu, 4, DType> dx =
in_grad[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
Tensor<gpu, 4, DType> dy =
out_grad[cudnnbatchnorm::kOut].get_with_shape<gpu, 4, DType>(shape_, s);
Tensor<gpu, 4, DType> dy = out_grad.get_with_shape<gpu, 4, DType>(shape_, s);

#if CUDNN_VERSION >= 4007
#if CUDNN_VERSION >= 7002
Expand All @@ -190,15 +191,15 @@ class CuDNNBatchNormOp {
#endif
MSHADOW_REAL_TYPE_SWITCH(dtype_param_, DTypeParam, {
Tensor<gpu, 1, DTypeParam> gamma =
in_data[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
in_gamma.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> dbeta =
in_grad[cudnnbatchnorm::kBeta].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> dgamma =
in_grad[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> save_mean =
out_data[cudnnbatchnorm::kMean].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
out_mean.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> save_inv_var =
out_data[cudnnbatchnorm::kInvVar].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
out_var.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);

typename DataType<DType>::ScaleType a = 1.0f;
typename DataType<DType>::ScaleType b = 0.0f;
Expand Down Expand Up @@ -232,15 +233,15 @@ class CuDNNBatchNormOp {
#else // CUDNN_VERSION < 4007
MSHADOW_REAL_TYPE_SWITCH(dtype_param_, DTypeParam, {
Tensor<gpu, 1, DTypeParam> gamma =
in_data[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
in_gamma.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> dbeta =
in_grad[cudnnbatchnorm::kBeta].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> dgamma =
in_grad[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> save_mean =
out_data[cudnnbatchnorm::kMean].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
out_mean.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
Tensor<gpu, 1, DTypeParam> save_inv_var =
out_data[cudnnbatchnorm::kInvVar].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
out_var.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);

typename DataType<DType>::ScaleType a = 1.0f;
typename DataType<DType>::ScaleType b = 0.0f;
Expand Down
2 changes: 2 additions & 0 deletions src/operator/nn/deconvolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
}
};

// Signature type built from a DeconvolutionParam.
// NOTE(review): presumably the key for cached deconvolution operators -- confirm against callers.
typedef ParamOpSign<DeconvolutionParam> DeconvSignature;

} // namespace op
} // namespace mxnet

Expand Down
Loading

0 comments on commit 257ab66

Please sign in to comment.