
Commit

[AutoScheduler] Enable winograd for conv2d and layout rewrite for conv3d (apache#7168)

* [AutoScheduler] Enable winograd for conv2d & Enable layout rewrite for conv3d

* fix test

* fix test

* update tutorials
merrymercy authored Dec 26, 2020
1 parent 592ecc0 commit c1bca8e
Showing 14 changed files with 379 additions and 82 deletions.
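
As context for the diffs below, here is a minimal sketch (modeled on the updated tune_network_x86.py / tune_network_cuda.py tutorials, not part of this diff) of the build path that opts into the auto-scheduler, which is what allows the new winograd and layout-rewrite implementations to be selected. `mod` and `params` are assumed to come from a Relay frontend importer, and the log file is a placeholder:

import tvm
from tvm import relay, auto_scheduler

# Placeholders: `mod`/`params` from a Relay frontend importer, and a log file
# of auto-scheduler tuning records produced beforehand.
log_file = "tuning.json"
target = "llvm -mcpu=skylake-avx512"

with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        lib = relay.build(mod, target=target, params=params)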
7 changes: 6 additions & 1 deletion docs/conf.py
@@ -250,7 +250,12 @@
"tune_relay_arm.py",
"tune_relay_mobile_gpu.py",
],
"auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"],
"auto_scheduler": [
"tune_matmul_x86.py",
"tune_conv2d_layer_cuda.py",
"tune_network_x86.py",
"tune_network_cuda.py",
],
"dev": ["low_level_custom_pass.py", "use_pass_infra.py", "bring_your_own_datatypes.py"],
}

14 changes: 8 additions & 6 deletions include/tvm/relay/attrs/nn.h
@@ -210,9 +210,10 @@ struct Conv2DWinogradAttrs : public tvm::AttrsNode<Conv2DWinogradAttrs> {
int groups;
IndexExpr channels;
Array<IndexExpr> kernel_size;
std::string data_layout;
std::string kernel_layout;
std::string out_layout;
tvm::String data_layout;
tvm::String kernel_layout;
tvm::String out_layout;
tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite
DataType out_dtype;

TVM_DECLARE_ATTRS(Conv2DWinogradAttrs, "relay.attrs.Conv2DWinogradAttrs") {
@@ -300,9 +301,10 @@ struct Conv3DAttrs : public tvm::AttrsNode<Conv3DAttrs> {
int groups;
IndexExpr channels;
Array<IndexExpr> kernel_size;
std::string data_layout;
std::string kernel_layout;
std::string out_layout;
tvm::String data_layout;
tvm::String kernel_layout;
tvm::String out_layout;
tvm::String auto_scheduler_rewritten_layout; // The layout after auto-scheduler's layout rewrite
DataType out_dtype;

TVM_DECLARE_ATTRS(Conv3DAttrs, "relay.attrs.Conv3DAttrs") {
18 changes: 12 additions & 6 deletions python/tvm/relay/op/strategy/generic.py
@@ -28,7 +28,10 @@


def naive_schedule(_, outs, target):
"""Return the naive default schedule"""
"""Return the naive default schedule.
This function acts as a placeholder for op implementations that use the auto-scheduler.
Implementations using this function should only be used together with the auto-scheduler.
"""
if "gpu" in target.keys:
# For GPU, we at least need thread binding to make a valid schedule.
# So the naive schedule cannot be compiled.
@@ -502,7 +505,7 @@ def conv3d_transpose_strategy(attrs, inputs, out_type, target):


# conv3d
def wrap_compute_conv3d(topi_compute, need_layout=False):
def wrap_compute_conv3d(topi_compute, need_layout=False, need_auto_scheduler_layout=False):
"""wrap conv3d topi compute"""

def _compute_conv3d(attrs, inputs, out_type):
@@ -519,11 +522,14 @@ def _compute_conv3d(attrs, inputs, out_type):
raise ValueError("Dilation should be positive value")
if groups != 1:
raise ValueError("Not support arbitrary group number for conv3d")

args = [inputs[0], inputs[1], strides, padding, dilation]
if need_layout:
out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)
else:
out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, out_dtype)
return [out]
args.append(layout)
args.append(out_dtype)
if need_auto_scheduler_layout:
args.append(get_auto_scheduler_rewritten_layout(attrs))
return [topi_compute(*args)]

return _compute_conv3d

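
The naive_schedule placeholder registered above is deliberately trivial. The following sketch (an assumption about its shape, not the TVM source) shows what such a default schedule amounts to, and why it cannot stand on its own for GPU targets:

from tvm import te

def naive_schedule_sketch(outs):
    # Just the default schedule over the output ops: no tiling, no
    # vectorization, no GPU thread binding, so it is only meaningful when
    # the auto-scheduler later replaces it with a tuned schedule.
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    return te.create_schedule([x.op for x in outs])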
102 changes: 89 additions & 13 deletions python/tvm/relay/op/strategy/x86.py
@@ -91,6 +91,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
"""conv2d x86 strategy"""
strategy = _op.OpStrategy()
data, kernel = inputs
stride_h, stride_w = get_const_tuple(attrs.strides)
dilation_h, dilation_w = get_const_tuple(attrs.dilation)
groups = attrs.groups
layout = attrs.data_layout
@@ -125,6 +126,35 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
name="conv2d_nhwc.x86",
)

judge_winograd_auto_scheduler = False
if len(kernel.shape) == 4:
kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
judge_winograd_auto_scheduler = (
"float" in data.dtype
and "float" in kernel.dtype
and kernel_h == 3
and kernel_w == 3
and stride_h == 1
and stride_w == 1
and dilation_h == 1
and dilation_w == 1
and 64 < co < 512
# The last condition of co is based on our profiling of resnet workloads
# on skylake avx512 machines. We found winograd is faster than direct
# only when co is within this range
)

# register auto-scheduler implementations
if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
strategy.add_implementation(
wrap_compute_conv2d(
topi.nn.conv2d_winograd_nhwc, need_auto_scheduler_layout=True
),
naive_schedule, # this implementation should never be picked by autotvm
name="conv2d_nhwc.winograd",
plevel=15,
)
elif layout == "HWCN":
assert kernel_layout == "HWIO"
if not is_auto_scheduler_enabled():
@@ -269,20 +299,39 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
"""conv3d generic strategy"""
strategy = _op.OpStrategy()
layout = attrs.data_layout
if layout == "NCDHW":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
name="conv3d_ncdhw.x86",
)
elif layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
name="conv3d_ndhwc.x86",
)
if is_auto_scheduler_enabled():
# Use auto-scheduler. We should provide clear compute definition without autotvm templates
# or packed layouts.
if layout == "NCDHW":
strategy.add_implementation(
wrap_compute_conv3d(topi.nn.conv3d_ncdhw, need_auto_scheduler_layout=True),
naive_schedule,
name="conv3d_ncdhw.x86",
)
elif layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.nn.conv3d_ndhwc, need_auto_scheduler_layout=True),
naive_schedule,
name="conv3d_ndhwc.x86",
)
else:
raise ValueError("Not support this layout {} yet".format(layout))
else:
raise ValueError("Not support this layout {} yet".format(layout))
# Use autotvm templates
if layout == "NCDHW":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
name="conv3d_ncdhw.x86",
)
elif layout == "NDHWC":
strategy.add_implementation(
wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
name="conv3d_ndhwc.x86",
)
else:
raise ValueError("Not support this layout {} yet".format(layout))
return strategy


@@ -476,3 +525,30 @@ def scatter_nd_strategy_cpu(attrs, inputs, out_type, target):
plevel=10,
)
return strategy


@conv2d_winograd_without_weight_transfrom_strategy.register("cpu")
def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, out_type, target):
"""conv2d_winograd_without_weight_transfrom cpu strategy"""
dilation = attrs.get_int_tuple("dilation")
groups = attrs.get_int("groups")
layout = attrs.data_layout
strides = attrs.get_int_tuple("strides")
assert dilation == (1, 1), "Do not support dilate now"
assert strides == (1, 1), "Do not support strides now"
assert groups == 1, "Do not support arbitrary group number"
strategy = _op.OpStrategy()
if layout == "NHWC":
strategy.add_implementation(
wrap_compute_conv2d(
topi.nn.conv2d_winograd_nhwc_without_weight_transform,
need_auto_scheduler_layout=True,
),
naive_schedule,
name="ansor.winograd",
)
else:
raise RuntimeError(
"Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
)
return strategy
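
For reference, here is a toy workload (illustrative only, not part of this diff) that falls inside the profiling-based heuristic above, with a 3x3 kernel, unit stride and dilation, float32 tensors, and 64 < output channels (128) < 512, and so would be eligible for the new conv2d_nhwc.winograd implementation when the auto-scheduler is enabled:

import tvm
from tvm import relay

# NHWC conv2d that satisfies the winograd heuristic in conv2d_strategy_cpu.
data = relay.var("data", shape=(1, 56, 56, 64), dtype="float32")
weight = relay.var("weight", shape=(3, 3, 64, 128), dtype="float32")
conv = relay.nn.conv2d(
    data, weight, kernel_size=(3, 3), channels=128, padding=(1, 1),
    data_layout="NHWC", kernel_layout="HWIO",
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))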
9 changes: 8 additions & 1 deletion python/tvm/topi/cuda/conv2d_winograd.py
@@ -363,7 +363,14 @@ def _callback(op):

@conv2d_winograd_nhwc.register(["cuda", "gpu"])
def conv2d_winograd_nhwc_cuda(
data, weight, strides, padding, dilation, out_dtype, pre_computed=False
data,
weight,
strides,
padding,
dilation,
out_dtype,
pre_computed=False,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
56 changes: 50 additions & 6 deletions python/tvm/topi/nn/conv2d.py
@@ -982,6 +982,7 @@ def _conv2d_winograd_nhwc_impl(
out_dtype,
tile_size,
pre_computed=False,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd implementation in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
@@ -1002,8 +1003,10 @@ def _conv2d_winograd_nhwc_impl(
Specifies the output data type.
tile_size : int
The size of the tile to use for the Winograd filter
pre_computed: bool
pre_computed: bool = False
Whether the kernel is precomputed
auto_scheduler_rewritten_layout: str = ""
The layout after auto-scheduler's layout rewrite pass.
Returns
-------
Expand All @@ -1020,7 +1023,16 @@ def _conv2d_winograd_nhwc_impl(
if not pre_computed:
KH, KW, CI, CO = get_const_tuple(weight.shape)
else:
H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
if auto_scheduler_rewritten_layout:
H_CAT, W_CAT, CO, CI = get_const_tuple(
auto_scheduler.get_shape_from_rewritten_layout(
auto_scheduler_rewritten_layout, ["eps", "nu", "co", "ci"]
)
)
auto_scheduler.remove_index_check(weight)
else:
H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)

KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1

pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
@@ -1052,8 +1064,10 @@
),
name="kernel_pack",
)
attrs = {}
else:
kernel_pack = weight
attrs = {"layout_free_placeholders": [kernel_pack]}

# pack data tile
input_tile = te.compute(
@@ -1085,9 +1099,12 @@
data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][co][ci], axis=[ci]
),
name="bgemm",
attrs={"layout_free_placeholders": [kernel_pack]},
attrs=attrs,
)

if auto_scheduler_rewritten_layout:
bgemm = auto_scheduler.rewrite_compute_body(bgemm, auto_scheduler_rewritten_layout)

# inverse transform
r_a = te.reduce_axis((0, alpha), "r_a")
r_b = te.reduce_axis((0, alpha), "r_b")
@@ -1112,7 +1129,16 @@


@tvm.target.generic_func
def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pre_computed=False):
def conv2d_winograd_nhwc(
data,
weight,
strides,
padding,
dilation,
out_dtype,
pre_computed=False,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
@@ -1132,6 +1158,8 @@ def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pr
Specifies the output data type.
pre_computed: bool
Whether the kernel is precomputed
auto_scheduler_rewritten_layout: str = ""
The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1149,11 +1177,18 @@ def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, pr
out_dtype,
tile_size,
pre_computed,
auto_scheduler_rewritten_layout,
)


def conv2d_winograd_nhwc_without_weight_transform(
data, weight, strides, padding, dilation, out_dtype
data,
weight,
strides,
padding,
dilation,
out_dtype,
auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd without layout transform in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and GPU.
@@ -1172,6 +1207,8 @@ def conv2d_winograd_nhwc_without_weight_transform(
dilation size, or [dilation_height, dilation_width]
out_dtype : str, optional
Specifies the output data type.
auto_scheduler_rewritten_layout: str = ""
The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1180,5 +1217,12 @@
"""

return conv2d_winograd_nhwc(
data, weight, strides, padding, dilation, out_dtype, pre_computed=True
data,
weight,
strides,
padding,
dilation,
out_dtype,
pre_computed=True,
auto_scheduler_rewritten_layout=auto_scheduler_rewritten_layout,
)
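
The auto_scheduler_rewritten_layout paths above only take effect once the network has actually been tuned. A brief sketch, in the spirit of the updated tune_network tutorials (the trial count and file name are placeholders), of producing the records that the build step then consumes:

from tvm import auto_scheduler

# Same placeholders as the build sketch near the top: `mod`, `params`, `target`.
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tuner.tune(
    auto_scheduler.TuningOptions(
        num_measure_trials=200 * len(tasks),
        measure_callbacks=[auto_scheduler.RecordToFile("tuning.json")],
    )
)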