This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-105] Fix CuDNN performance after code refactor #10116

Merged: 25 commits, Mar 22, 2018
Changes from all commits
47 changes: 22 additions & 25 deletions src/operator/nn/batch_norm-inl.h
@@ -224,16 +224,25 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
 */
template <typename xpu, typename DType, typename AccReal>
void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
-                       const std::vector<TBlob> &out_grad,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &inputs,
                        const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &in_grad,
-                       const std::vector<TBlob> &aux_states) {
-  CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U);
-  CHECK_EQ(in_data.size(), 3U);
-  CHECK_EQ(out_data.size(), 3U);
-  CHECK_EQ(in_grad.size(), 3U);
+                       const std::vector<TBlob> &outputs) {
+  CHECK_EQ(inputs.size(), 8U);
+  CHECK_EQ(outputs.size(), 3U);
+  std::vector<TBlob> out_grad(1);
+  std::vector<TBlob> out_data(3);
+  std::vector<TBlob> in_data(3);
+  std::vector<TBlob> aux_states(2);
+
+  out_grad[0] = inputs[0];
+  out_data[batchnorm::kMean] = inputs[1];
+  out_data[batchnorm::kVar] = inputs[2];
+  in_data[batchnorm::kData] = inputs[3];
+  in_data[batchnorm::kGamma] = inputs[4];
+  in_data[batchnorm::kBeta] = inputs[5];
+  aux_states[batchnorm::kMovingMean] = inputs[6];
+  aux_states[batchnorm::kMovingVar] = inputs[7];
+  const std::vector<TBlob> &in_grad = outputs;
  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
  BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
                                             out_data, req, in_grad, aux_states);
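Note on the new signature: the collapsed (inputs, req, outputs) form matches MXNet's generic FCompute calling convention, which is why BatchNormBackward can now be invoked directly from the compute entry points below without repacking at every call site. A sketch of that convention for reference (it lives in include/mxnet/op_attr_types.h; treat this as a close paraphrase rather than a verbatim copy):

```cpp
// FCompute-style kernels receive flat input/output blob vectors plus one
// write-request flag per output (kWriteTo, kAddTo, kNullOp, ...).
using FCompute = std::function<void (const nnvm::NodeAttrs& attrs,
                                     const OpContext& ctx,
                                     const std::vector<TBlob>& inputs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<TBlob>& outputs)>;
```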
@@ -261,23 +270,11 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
                          const OpContext& ctx, const std::vector<TBlob>& inputs,
                          const std::vector<OpReqType>& req,
                          const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 11U);
+  CHECK_EQ(inputs.size(), 8U);
  const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
-  int num_out_grads = param.output_mean_var ? 3U : 1U;
-  int in_data_start = 3;
-  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
-  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;
-  std::vector<TBlob> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
-  std::vector<TBlob> in_data(inputs.begin() + in_data_start,
-                             inputs.begin() + aux_states_start);
-  std::vector<TBlob> aux_states(inputs.begin() + aux_states_start,
-                                inputs.begin() + out_data_start);
-  std::vector<TBlob> out_data(inputs.begin() + out_data_start, inputs.end());
-  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);

-  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
-    BatchNormBackward<xpu, DType, AccReal>(ctx, param, out_grad, in_data, out_data, req,
-                                           in_grad, aux_states);
+  MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
+    BatchNormBackward<xpu, DType, AccReal>(ctx, param, inputs, req, outputs);
  });
}

74 changes: 57 additions & 17 deletions src/operator/nn/batch_norm.cc
@@ -413,24 +413,26 @@ void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
                               const std::vector<NDArray> &inputs,
                               const std::vector<OpReqType> &req,
                               const std::vector<NDArray> &outputs) {
-  CHECK_EQ(inputs.size(), 11U);
+  CHECK_EQ(inputs.size(), 8U);
  const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
-  int num_out_grads = param.output_mean_var ? 3U : 1U;
-  int in_data_start = 3;
-  int aux_states_start = in_data_start + batchnorm::kInMovingMean;
-  int out_data_start = in_data_start + batchnorm::kInMovingVar + 1;

  TShape shape = inputs[0].shape();
  // MKLDNN batchnorm only works well on the special MKLDNN layout.
  if (SupportMKLDNNBN(inputs[0], param)
-      && (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
-    std::vector<NDArray> out_grad(inputs.begin(), inputs.begin() + num_out_grads);
-    std::vector<NDArray> in_data(inputs.begin() + in_data_start,
-                                 inputs.begin() + aux_states_start);
-    std::vector<NDArray> aux_states(inputs.begin() + aux_states_start,
-                                    inputs.begin() + out_data_start);
-    std::vector<NDArray> out_data(inputs.begin() + out_data_start, inputs.end());
-    std::vector<NDArray> in_grad(outputs.begin(), outputs.begin() + 3);
+      && (inputs[3].IsMKLDNNData() || inputs[0].IsMKLDNNData())) {
+    std::vector<NDArray> out_grad(1);
+    std::vector<NDArray> out_data(3);
+    std::vector<NDArray> in_data(3);
+    std::vector<NDArray> aux_states(2);
+    out_grad[0] = inputs[0];
+    out_data[batchnorm::kMean] = inputs[1];
+    out_data[batchnorm::kVar] = inputs[2];
+    in_data[batchnorm::kData] = inputs[3];
+    in_data[batchnorm::kGamma] = inputs[4];
+    in_data[batchnorm::kBeta] = inputs[5];
+    aux_states[batchnorm::kMovingMean] = inputs[6];
+    aux_states[batchnorm::kMovingVar] = inputs[7];
+    const std::vector<NDArray> &in_grad = outputs;

    if (inputs[0].dtype() == mshadow::kFloat32) {
      MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
@@ -470,8 +472,6 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
                                                 DispatchMode *dispatch_mode,
                                                 std::vector<int> *in_attrs,
                                                 std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 11);
-  CHECK_EQ(out_attrs->size(), 5);
  DispatchMode wanted_mode;
#if MXNET_USE_MKLDNN == 1
  if (dev_mask == mshadow::cpu::kDevMask)
@@ -486,6 +486,46 @@ static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs,
                             dispatch_mode, wanted_mode);
}

+std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::NodePtr& n,
+                                           const std::vector<nnvm::NodeEntry>& ograds) {
+  std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
+  for (uint32_t i = 0; i < out_data.size(); ++i) {
+    out_data[i] = nnvm::NodeEntry{n, i, 0};
+  }
+  std::vector<nnvm::NodeEntry> heads;
[Review thread]
Member: Please use reserve()
Contributor (author): This code runs to build the computation graph. It only runs once. Do we still need to call reserve()?
Member: yes, please
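To illustrate the reviewer's point with a standalone sketch (generic types, not the PR's code): reserve() pre-allocates capacity, so a push_back sequence of known length performs a single allocation instead of several geometric regrowths, and no elements are copied or moved in between.

```cpp
#include <vector>

int main() {
  std::vector<int> heads;
  heads.reserve(8);      // one allocation up front
  for (int i = 0; i < 8; ++i) {
    heads.push_back(i);  // never reallocates; existing elements stay put
  }
  return heads.size() == 8 ? 0 : 1;
}
```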
+  heads.reserve(8);
+  heads.push_back(ograds[0]);
+  heads.push_back(out_data[batchnorm::kMean]);
+  heads.push_back(out_data[batchnorm::kVar]);
+  heads.push_back(n->inputs[batchnorm::kData]);
+  heads.push_back(n->inputs[batchnorm::kGamma]);
+  heads.push_back(n->inputs[batchnorm::kBeta]);
+  heads.push_back(n->inputs[batchnorm::kInMovingMean]);
+  heads.push_back(n->inputs[batchnorm::kInMovingVar]);
+
+  nnvm::NodePtr gnode = nnvm::Node::Create();
+  gnode->inputs = std::move(heads);
+  gnode->control_deps.emplace_back(n);
+  gnode->attrs = n->attrs;
+  gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
+  gnode->attrs.name = n->attrs.name + "_backward";
+  // The input of batchnorm
+  std::vector<nnvm::NodeEntry> in_grad(5);
+  for (uint32_t i = 0; i < 3; ++i) {
+    in_grad[i] = nnvm::NodeEntry{gnode, i, 0};
+  }
+
+  // attach no gradient node to forbid gradient on aux_state
+  nnvm::NodePtr ng = nnvm::Node::Create();
+  ng->attrs.op = Op::Get("_NoGradient");
+  ng->attrs.name = "NoGradient";
+  // the aux state of batchnorm
+  for (uint32_t i = 0; i < 2; ++i) {
+    in_grad[i + 3] = nnvm::NodeEntry{ng, 0, 0};
+  }
+  return in_grad;
+}

NNVM_REGISTER_OP(BatchNorm)
.describe(R"code(Batch normalization.

@@ -559,7 +599,7 @@ then set ``gamma`` to 1 and its gradient to 0.
#if MXNET_USE_MKLDNN == 1
.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
#endif
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"})
+.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
#if MXNET_USE_MKLDNN == 1
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
@@ -583,7 +623,7 @@ then set ``gamma`` to 1 and its gradient to 0.
});

NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_num_outputs(5)
+.set_num_outputs(3)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr<FInferStorageType>("FInferStorageType", backward_BatchNormStorageType)
#if MXNET_USE_MKLDNN == 1
18 changes: 6 additions & 12 deletions src/operator/nn/batch_norm.cu
@@ -690,13 +690,8 @@ void BatchNormGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
const OpContext& ctx, const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
-  CHECK_EQ(inputs.size(), 11U);
+  CHECK_EQ(inputs.size(), 8U);
  BatchNormParam param = nnvm::get<BatchNormParam>(attrs.parsed);
-  std::vector<TBlob> out_grad(1, inputs[0]);
-  std::vector<TBlob> in_data(inputs.begin() + 3, inputs.begin() + 6);
-  std::vector<TBlob> aux_states(inputs.begin() + 6, inputs.begin() + 8);
-  std::vector<TBlob> out_data(inputs.begin() + 8, inputs.end());
-  std::vector<TBlob> in_grad(outputs.begin(), outputs.begin() + 3);
int dtype = inputs[0].type_flag_;
TShape shape = inputs[0].shape_;

@@ -705,19 +700,18 @@
if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4
&& param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) {
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-      GetCuDNNOp<DType>(param).Backward(ctx, out_grad, in_data, out_data,
-                                        req, in_grad, aux_states);
+      GetCuDNNOp<DType>(param).Backward(ctx, inputs, req, outputs);
    })
  } else {
    MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, {
-      BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
-                                             in_data, out_data, req, in_grad, aux_states);
+      BatchNormBackward<gpu, DType, AccReal>(ctx, param, inputs, req, outputs);
})
}
#else
+  aux_states[batchnorm::kMovingMean] = inputs[6];
+  aux_states[batchnorm::kMovingVar] = inputs[7];
[Review thread]
Member: @zheng-da aux_states is not defined if USE_CUDNN is not enabled. @marcoabreu seems there is no pure cuda ci environment which is not built with cudnn.
Contributor (author): i see. i'll update it.
Contributor (author): agree. @marcoabreu could you add a CI only with CUDA?
Contributor (@marcoabreu, Mar 27, 2018): Sure, no problem at all! Compilation only or do we need tests as well?
Contributor (author): I think it's better to run the code at least once. We probably don't need to try both Python2 and Python3, something like that.
Contributor (@marcoabreu): Done: #10281
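The bug under discussion, reduced to a standalone sketch (hypothetical names, not the PR's code): a variable declared only inside the #if branch is referenced by code shared with the #else path, so the build breaks as soon as the feature macro is off.

```cpp
#include <vector>

void Backward(const std::vector<float>& inputs) {
#if MXNET_USE_CUDNN == 1
  std::vector<float> aux_states(2);  // declared only in the cuDNN build
#endif
  // Compiles when MXNET_USE_CUDNN == 1; in a CUDA-only build 'aux_states'
  // is undeclared here -- exactly the failure mode flagged above.
  aux_states[0] = inputs[6];
  aux_states[1] = inputs[7];
}
```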

  MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, {
-    BatchNormBackward<gpu, DType, AccReal>(ctx, param, out_grad,
-                                           in_data, out_data, req, in_grad, aux_states);
+    BatchNormBackward<gpu, DType, AccReal>(ctx, param, inputs, req, outputs);
});
#endif
}
2 changes: 2 additions & 0 deletions src/operator/nn/convolution-inl.h
@@ -124,6 +124,8 @@ struct ConvolutionParam : public dmlc::Parameter<ConvolutionParam> {
}
};

+typedef ParamOpSign<ConvolutionParam> ConvSignature;
+
} // namespace op
} // namespace mxnet

37 changes: 32 additions & 5 deletions src/operator/nn/convolution.cu
@@ -41,13 +41,40 @@ static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
    const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
    const Context& ctx) {
#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local CuDNNConvolutionOp<DType> op;
+  static thread_local std::unordered_map<ConvSignature,
+                                         std::shared_ptr<CuDNNConvolutionOp<DType> >,
+                                         OpHash> ops;
#else
-  static MX_THREAD_LOCAL CuDNNConvolutionOp<DType> op;
+  static MX_THREAD_LOCAL std::unordered_map<ConvSignature,
+                                            std::shared_ptr<CuDNNConvolutionOp<DType> >,
+                                            OpHash> ops;
#endif
-  op.Init(param, forward_compute_type, backward_compute_type,
-          in_shape, out_shape, ctx);
-  return op;
+  ConvSignature key(param);
+  size_t ndim = 0;
+  for (auto &s : in_shape)
+    ndim += s.ndim();
+  for (auto &s : out_shape)
+    ndim += s.ndim();
+  key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
+              + ndim + 1 /* for dev_id */);
+
+  key.AddSign(forward_compute_type);
+  key.AddSign(backward_compute_type);
+  key.AddSign(in_shape);
+  key.AddSign(out_shape);
+  key.AddSign(ctx.dev_id);
+
+  auto it = ops.find(key);
+  if (it == ops.end()) {
+    std::shared_ptr<CuDNNConvolutionOp<DType>> op(new CuDNNConvolutionOp<DType>());
+    auto ins_ret = ops.insert(std::pair<ConvSignature, std::shared_ptr<CuDNNConvolutionOp<DType>>>(
+        key, op));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+    it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
+                     out_shape, ctx);
+  }
+  return *it->second;
}
#endif
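This hunk is the core of the performance fix: the old code kept one thread-local CuDNNConvolutionOp and re-ran Init() on every call, throwing away the cuDNN descriptor setup and algorithm selection each time; the new code keys fully initialized ops by their signature and reuses them. The same memoization pattern as a minimal standalone sketch (generic types standing in for ConvSignature/OpHash):

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct Op {
  // Stand-in for CuDNNConvolutionOp: Init() is the expensive step
  // (descriptor creation, algorithm search) we want to pay once per config.
  void Init(const std::string& cfg) { /* ... */ }
};

Op& GetCachedOp(const std::string& key) {
  // One cache per thread: no locking, and ops are never shared across threads.
  static thread_local std::unordered_map<std::string, std::shared_ptr<Op>> ops;
  auto it = ops.find(key);
  if (it == ops.end()) {
    auto op = std::make_shared<Op>();
    op->Init(key);  // setup cost paid exactly once per distinct key
    it = ops.emplace(key, std::move(op)).first;
  }
  return *it->second;
}
```

Because the real key covers the parameters, compute types, input/output shapes, and device id, a workload with a fixed set of layer configurations initializes each cuDNN op once and then reuses it on every subsequent call.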

51 changes: 26 additions & 25 deletions src/operator/nn/cudnn/cudnn_batch_norm-inl.h
@@ -67,10 +67,10 @@ class CuDNNBatchNormOp {
}

void Forward(const OpContext &ctx,
               const std::vector<TBlob> &in_data,
               const std::vector<OpReqType> &req,
               const std::vector<TBlob> &out_data,
               const std::vector<TBlob> &aux_states) {
using namespace mshadow;
using namespace mshadow::expr;
CHECK_EQ(in_data.size(), 3U);
@@ -158,29 +158,30 @@
}

void Backward(const OpContext &ctx,
-                const std::vector<TBlob> &out_grad,
-                const std::vector<TBlob> &in_data,
-                const std::vector<TBlob> &out_data,
-                const std::vector<OpReqType> &req,
-                const std::vector<TBlob> &in_grad,
-                const std::vector<TBlob> &aux_states) {
+                const std::vector<TBlob> &inputs,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &outputs) {
using namespace mshadow;
using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_EQ(in_data.size(), 3U);
-    CHECK_EQ(out_data.size(), 3U);
-    CHECK_EQ(in_grad.size(), 3U);
+    CHECK_EQ(inputs.size(), 8U);
+    CHECK_EQ(outputs.size(), 3U);
CHECK(ctx.is_train && !param_.use_global_stats)
<< "use global statistics is not yet supported in CuDNNBatchNorm";

-    Init(in_data[cudnnbatchnorm::kData]);
+    // Rename the inputs and outputs.
+    const TBlob &out_grad = inputs[0];
+    const TBlob &out_mean = inputs[1];
+    const TBlob &out_var = inputs[2];
+    const TBlob &in_data = inputs[3];
+    const TBlob &in_gamma = inputs[4];
+    const std::vector<TBlob> &in_grad = outputs;
+
+    Init(in_data);
Stream<gpu> *s = ctx.get_stream<gpu>();
-    Tensor<gpu, 4, DType> x =
-      in_data[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
+    Tensor<gpu, 4, DType> x = in_data.get_with_shape<gpu, 4, DType>(shape_, s);
Tensor<gpu, 4, DType> dx =
in_grad[cudnnbatchnorm::kData].get_with_shape<gpu, 4, DType>(shape_, s);
-    Tensor<gpu, 4, DType> dy =
-      out_grad[cudnnbatchnorm::kOut].get_with_shape<gpu, 4, DType>(shape_, s);
+    Tensor<gpu, 4, DType> dy = out_grad.get_with_shape<gpu, 4, DType>(shape_, s);

#if CUDNN_VERSION >= 4007
#if CUDNN_VERSION >= 7002
@@ -190,15 +191,15 @@
#endif
MSHADOW_REAL_TYPE_SWITCH(dtype_param_, DTypeParam, {
Tensor<gpu, 1, DTypeParam> gamma =
-          in_data[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
+          in_gamma.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> dbeta =
          in_grad[cudnnbatchnorm::kBeta].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> dgamma =
          in_grad[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> save_mean =
-          out_data[cudnnbatchnorm::kMean].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
+          out_mean.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> save_inv_var =
-          out_data[cudnnbatchnorm::kInvVar].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
+          out_var.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);

typename DataType<DType>::ScaleType a = 1.0f;
typename DataType<DType>::ScaleType b = 0.0f;
@@ -232,15 +233,15 @@
#else // CUDNN_VERSION < 4007
MSHADOW_REAL_TYPE_SWITCH(dtype_param_, DTypeParam, {
Tensor<gpu, 1, DTypeParam> gamma =
-          in_data[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
+          in_gamma.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> dbeta =
          in_grad[cudnnbatchnorm::kBeta].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> dgamma =
          in_grad[cudnnbatchnorm::kGamma].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> save_mean =
-          out_data[cudnnbatchnorm::kMean].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
+          out_mean.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
        Tensor<gpu, 1, DTypeParam> save_inv_var =
-          out_data[cudnnbatchnorm::kInvVar].get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);
+          out_var.get_with_shape<gpu, 1, DTypeParam>(Shape1(shape_[1]), s);

typename DataType<DType>::ScaleType a = 1.0f;
typename DataType<DType>::ScaleType b = 0.0f;
2 changes: 2 additions & 0 deletions src/operator/nn/deconvolution-inl.h
@@ -169,6 +169,8 @@ struct DeconvolutionParam : public dmlc::Parameter<DeconvolutionParam> {
}
};

+typedef ParamOpSign<DeconvolutionParam> DeconvSignature;
+
} // namespace op
} // namespace mxnet

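DeconvSignature mirrors ConvSignature above; the diff listing is truncated at this point, but the remaining files presumably apply the same per-signature cache in deconvolution.cu so deconvolution ops keep their cuDNN state across calls as well.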