[MXNET-266] Fix cudnn_conv and cudnn_deconv deadlock (apache#10392)

* Fix deadlock of cudnn_conv wrapper * Fix deconv deadlock * Fix lint * Revert "Fix lint" This reverts commit 66f0936. * Fix lint * Fix indentation
zheng-da · Jun 28, 2018 · 5b77ae9 · 5b77ae9
1 parent 6648405
commit 5b77ae9
Show file tree

Hide file tree

Showing 6 changed files with 455 additions and 462 deletions.
diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu
@@ -36,10 +36,12 @@ namespace op {
 
 #if MXNET_USE_CUDNN == 1
 template<typename DType>
-static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
-    int forward_compute_type, int backward_compute_type,
-    const std::vector<TShape>& in_shape, const std::vector<TShape>& out_shape,
-    const Context& ctx) {
+static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
+                                                 int forward_compute_type,
+                                                 int backward_compute_type,
+                                                 const std::vector<TShape>& in_shape,
+                                                 const std::vector<TShape>& out_shape,
+                                                 const RunContext& rctx) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<ConvSignature,
                                          std::shared_ptr<CuDNNConvolutionOp<DType> >,
@@ -62,7 +64,7 @@ static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
   key.AddSign(backward_compute_type);
   key.AddSign(in_shape);
   key.AddSign(out_shape);
-  key.AddSign(ctx.dev_id);
+  key.AddSign(rctx.ctx.dev_id);
 
   auto it = ops.find(key);
   if (it == ops.end()) {
@@ -72,17 +74,18 @@ static CuDNNConvolutionOp<DType> &GetCuDNNConvOp(const ConvolutionParam& param,
     CHECK(ins_ret.second);
     it = ins_ret.first;
     it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
-                     out_shape, ctx);
+                     out_shape, rctx);
   }
   return *it->second;
 }
 #endif
 
 template<>
 void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   int dtype = inputs[conv::kData].type_flag_;
 
@@ -120,7 +123,7 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
       op.Init(param);
       op.Forward(ctx, inputs, req, outputs);
     } else if (!CuDNNConvolutionOp<DType>::Supports(param,
-          compute_type, compute_type, ctx.run_ctx.ctx)) {
+          compute_type, compute_type, ctx.run_ctx.ctx.dev_id)) {
       LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
       ConvolutionOp<gpu, DType> op;
       op.Init(param);
@@ -131,7 +134,7 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
       for (size_t i = 0; i < in_shape.size(); i++)
         in_shape[i] = inputs[i].shape_;
       CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
-          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
       op.Forward(ctx, inputs, req, outputs);
     }
   })
@@ -146,9 +149,10 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
 
 template<>
 void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
-    const OpContext& ctx, const std::vector<TBlob>& inputs,
-    const std::vector<OpReqType>& req,
-    const std::vector<TBlob>& outputs) {
+                                 const OpContext& ctx,
+                                 const std::vector<TBlob>& inputs,
+                                 const std::vector<OpReqType>& req,
+                                 const std::vector<TBlob>& outputs) {
   const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
   std::vector<TBlob> in_data(inputs.begin() + 1, inputs.end());
   const TBlob &out_grad = inputs[0];
@@ -190,7 +194,7 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
       op.Init(param);
       op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
     } else if (!CuDNNConvolutionOp<DType>::Supports(param,
-          compute_type, compute_type, ctx.run_ctx.ctx)) {
+          compute_type, compute_type, ctx.run_ctx.ctx.dev_id)) {
       LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied.";
       ConvolutionOp<gpu, DType> op;
       op.Init(param);
@@ -202,7 +206,7 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
       for (size_t i = 0; i < in_shape.size(); i++)
         in_shape[i] = in_data[i].shape_;
       CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
-          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx);
+          compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
       op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
     }
   })