Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCM] fix depthwise conv in ROCM, test=develop #32170

Merged
merged 1 commit into from
Apr 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion paddle/fluid/operators/conv_cudnn_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);

// ROCM has a thread limit in depthwise_conv.cu, which will result in accuracy issues.
// Use depthwise_conv2d in MIOPEN to resolve this issue
REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
Expand Down
3 changes: 1 addition & 2 deletions paddle/fluid/operators/math/depthwise_conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -919,11 +919,10 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T,
batch_size * output_channels * output_height * output_width;
#ifdef __HIPCC__
int block_size = 256;
int grid_size = std::min((nums_output + block_size - 1) / block_size, 256);
#else
int block_size = 512;
int grid_size = (nums_output + block_size - 1) / block_size;
#endif
int grid_size = (nums_output + block_size - 1) / block_size;

#define check_case(c_filter_multiplier, c_stride, c_filter) \
if (c_filter_multiplier == 0 || \
Expand Down
4 changes: 4 additions & 0 deletions python/paddle/fluid/layers/nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1524,6 +1524,10 @@ def conv2d(input,
not use_cudnn):
l_type = 'depthwise_conv2d'

if (num_channels == groups and num_filters % num_channels == 0 and
core.is_compiled_with_rocm()):
l_type = 'depthwise_conv2d'

helper = LayerHelper(l_type, **locals())
dtype = helper.input_dtype()

Expand Down
11 changes: 11 additions & 0 deletions python/paddle/fluid/tests/unittests/test_conv2d_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,6 +1248,17 @@ def init_paddings(self):
create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)

# ------------ depthwise conv2d in MIOPEN ---------
if core.is_compiled_with_rocm():
create_test_cudnn_padding_SAME_class(TestDepthwiseConv_AsyPadding)
create_test_cudnn_padding_SAME_class(
TestDepthwiseConvWithDilation_AsyPadding)
create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding)
create_test_padding_VALID_class(TestDepthwiseConvWithDilation_AsyPadding)
create_test_cudnn_channel_last_class(TestDepthwiseConv_AsyPadding)
create_test_cudnn_channel_last_class(
TestDepthwiseConvWithDilation2_AsyPadding)

create_test_cudnn_channel_last_fp16_class(
TestConv2DOp_AsyPadding, grad_check=False)
create_test_cudnn_channel_last_fp16_class(
Expand Down
9 changes: 8 additions & 1 deletion python/paddle/nn/functional/conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import numpy as np
from ...device import get_cudnn_version
from ...fluid.framework import Variable, in_dygraph_mode
from ...fluid import core, dygraph_utils
from ...fluid import core, dygraph_utils, get_flags
from ...fluid.layers import nn, utils
from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid.param_attr import ParamAttr
Expand Down Expand Up @@ -551,6 +551,13 @@ def conv2d(x,
if (num_channels == groups and num_channels != 1 and
num_filters % num_channels == 0):
l_type = 'depthwise_conv2d'
if core.is_compiled_with_rocm():
use_cudnn = True
else:
use_cudnn = False

if (core.is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")
["FLAGS_conv2d_disable_cudnn"]):
use_cudnn = False

return _conv_nd(x, weight, bias, stride, padding, padding_algorithm,
Expand Down
11 changes: 7 additions & 4 deletions python/paddle/nn/layer/conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,13 @@ def _get_default_param_initializer():
in_channels != 1 and
out_channels % in_channels == 0):
self._op_type = 'depthwise_conv2d'
if core.is_compiled_with_rocm():
self._use_cudnn = True
else:
self._use_cudnn = False

if (core.is_compiled_with_cuda() and get_flags(
"FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
self._use_cudnn = False

def extra_repr(self):
Expand Down Expand Up @@ -645,10 +652,6 @@ def __init__(self,
bias_attr=bias_attr,
data_format=data_format)

if (core.is_compiled_with_cuda() and get_flags(
"FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
self._use_cudnn = False

def forward(self, x):
if self._padding_mode != 'zeros':
x = F.pad(x,
Expand Down