diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index d5c595145370..dfaa6be60a5f 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -58,7 +58,8 @@ void ConvolutionCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs[conv::kBias].shape_.ndim(), 1); auto layout = static_cast(param.layout.value()); auto li = cudnn::GetLayoutInfo(layout); - if (!cudnn::LegacyAddBias(ctx, li, outputs[conv::kOut], inputs[conv::kBias])) { + if (dmlc::GetEnv("MXNET_NATIVE_ADD_BIAS", li.channel_last) || + !cudnn::LegacyAddBias(ctx, li, outputs[conv::kOut], inputs[conv::kBias])) { int k = inputs[conv::kBias].shape_.Size(); auto b = inputs[conv::kBias].reshape(cudnn::ExpandChannelDims(layout, k)); BinaryBroadcastRTCCompute{"add"}( // NOLINT(whitespace/braces) @@ -142,7 +143,8 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, if (ok && !param.no_bias && req[conv::kBias] != kNullOp) { auto li = cudnn::GetLayoutInfo(static_cast(param.layout.value())); auto add_to = req[conv::kBias] == kAddTo; - if (!cudnn::LegacyBiasGrad(ctx, li, add_to, outputs[conv::kBias], inputs[0])) { + if (dmlc::GetEnv("MXNET_NATIVE_BIAS_GRAD", li.channel_last) || + !cudnn::LegacyBiasGrad(ctx, li, add_to, outputs[conv::kBias], inputs[0])) { if (li.channel_last) { // This kernel should be faster. auto y_grad = FlattenAs2DHead(inputs[0], ctx); diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 99c09db55f83..8c88972564e3 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -57,7 +57,8 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs[deconv::kBias].shape_.ndim(), 1); auto layout = static_cast(param.layout.value()); auto li = cudnn::GetLayoutInfo(layout); - if (!cudnn::LegacyAddBias(ctx, li, outputs[deconv::kOut], inputs[deconv::kBias])) { + if (dmlc::GetEnv("MXNET_NATIVE_ADD_BIAS", li.channel_last) || + !cudnn::LegacyAddBias(ctx, li, outputs[deconv::kOut], inputs[deconv::kBias])) { int k = inputs[deconv::kBias].shape_.Size(); auto b = inputs[deconv::kBias].reshape(cudnn::ExpandChannelDims(layout, k)); BinaryBroadcastRTCCompute{"add"}( // NOLINT(whitespace/braces) @@ -120,7 +121,8 @@ void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, if (ok && !param.no_bias && req[deconv::kBias] != kNullOp) { auto li = cudnn::GetLayoutInfo(static_cast(param.layout.value())); auto add_to = req[conv::kBias] == kAddTo; - if (!cudnn::LegacyBiasGrad(ctx, li, add_to, outputs[deconv::kBias], inputs[0])) { + if (dmlc::GetEnv("MXNET_NATIVE_BIAS_GRAD", li.channel_last) || + !cudnn::LegacyBiasGrad(ctx, li, add_to, outputs[deconv::kBias], inputs[0])) { if (li.channel_last) { // This kernel should be faster. auto y_grad = FlattenAs2DHead(inputs[0], ctx);