From f6bfd8c43620d87c71e36d38358047fc4db88e8e Mon Sep 17 00:00:00 2001
From: anijain2305
Date: Tue, 14 Apr 2020 16:57:37 +0000
Subject: [PATCH] [TOPI] Using x86 schedules for ARM conv2d.

---
 python/tvm/relay/op/strategy/arm_cpu.py     |  54 ++++++++--
 topi/python/topi/arm_cpu/conv2d_alter_op.py | 104 ++++++++++++++++++++
 topi/python/topi/x86/conv2d.py              |   2 +-
 3 files changed, 151 insertions(+), 9 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index a803bb6bfdcd8..83e7e23503a7f 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -54,10 +54,15 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     if groups == 1:
         if layout == "NCHW":
             if kernel_layout == "OIHW":
+                # Commenting the spatial pack as x86 NCHWc schedules perform better.
+                # strategy.add_implementation(
+                #     wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
+                #     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
+                #     name="conv2d_nchw_spatial_pack.arm_cpu")
                 strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
-                    name="conv2d_nchw_spatial_pack.arm_cpu")
+                    wrap_compute_conv2d(topi.x86.conv2d_nchw),
+                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
+                    name="conv2d_nchw.x86")
                 # check if winograd algorithm is applicable
                 _, _, kh, kw = get_const_tuple(kernel.shape)
                 pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
@@ -100,11 +105,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
         if layout == "NCHW":
             assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout)
-            if kernel_layout == "OIHW":
-                strategy.add_implementation(
-                    wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
-                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
-                    name="depthwise_conv2d_nchw.arm_cpu")
+
+            # Commenting the spatial pack as x86 NCHWc schedules perform better.
+            # if kernel_layout == "OIHW":
+            #     strategy.add_implementation(
+            #         wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
+            #         wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
+            #         name="depthwise_conv2d_nchw.arm_cpu")
+
             # TODO:
             # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
             # Let us comment it out but not remove.
@@ -115,6 +123,13 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
             #     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack),
             #     name="depthwise_conv2d_nchw_spatial_pack.arm_cpu",
             #     plevel=15)
+
+            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
+            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
+                    wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
+                    name="depthwise_conv2d_nchw.x86")
         elif layout == "NHWC":
             assert kernel_layout == "HWOI"
             logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.")
@@ -138,6 +153,29 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                            format(layout))
     return strategy
 
+@conv2d_NCHWc_strategy.register("arm_cpu")
+def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
+    """conv2d_NCHWc strategy for arm_cpu, reusing the x86 schedules."""
+    strategy = _op.OpStrategy()
+    data, kernel = inputs
+    logger.warning("Trying x86 Conv NCHWc.")
+    strategy.add_implementation(
+        wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
+        wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
+        name="conv2d_NCHWc.x86")
+    return strategy
+
+@depthwise_conv2d_NCHWc_strategy.register("arm_cpu")
+def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
+    """depthwise_conv2d_NCHWc strategy for arm_cpu, reusing the x86 schedules."""
+    strategy = _op.OpStrategy()
+    logger.warning("Trying x86 DWC NCHWc.")
+    strategy.add_implementation(
+        wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True),
+        wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc),
+        name="depthwise_conv2d_NCHWc.x86")
+    return strategy
+
 def wrap_compute_conv2d_winograd_nnpack(topi_compute):
     """wrap topi compute for conv2d_winograd NNPack"""
     def _compute_conv2d_nnpack(attrs, inputs, out_type):
diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py
index 553239b6c4268..934739f01797b 100644
--- a/topi/python/topi/arm_cpu/conv2d_alter_op.py
+++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py
@@ -59,6 +59,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     data, kernel = tinfos
     out_dtype = out_type.dtype
 
+    data_tensor, kernel_tensor = tinfos
+    data_dtype = data_tensor.dtype
+    kernel_dtype = kernel_tensor.dtype
+
     idxd = tvm.tir.indexdiv
 
     if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu":
@@ -169,4 +173,104 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
 
         return relay.nn.conv2d(*inputs, **new_attrs)
 
+    if topi_tmpl == "conv2d_NCHWc.x86":
+        # we only convert conv2d_NCHW to conv2d_NCHWc for x86
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        if cfg.is_fallback:
+            _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding,
+                                out_dtype, False, data_layout)
+        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
+        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
+        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+
+        # update new attrs
+        new_attrs['channels'] = out_channel
+        new_attrs['data_layout'] = 'NCHW%dc' % ic_bn
+        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
+        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+        new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+
+        # Store altered operator's config
+        new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
+                                  dtype=data_dtype)
+        new_kernel = te.placeholder((out_channel//oc_bn, in_channel//ic_bn,
+                                     kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"],
+             new_attrs["out_layout"], out_dtype], topi_tmpl)
+        dispatch_ctx.update(target, new_workload, cfg)
+        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
+
+    if topi_tmpl == "conv2d_NCHWc_int8.x86":
+        # TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        if cfg.is_fallback:
+            _get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding,
+                                     out_dtype, False, data_layout)
+
+        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
+        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
+        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+        n_elems = 4
+
+        # convert kernel data layout from 4D to 7D
+        data_expr, kernel_expr = inputs
+        kernel_IHWO = relay.transpose(kernel_expr, axes=(1, 2, 3, 0))
+        kernel_IHWOo = relay.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn))
+        kernel_OHWoI = relay.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0))
+        kernel_OHWoIi = relay.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn,
+                                                     in_channel//ic_bn, ic_bn))
+        kernel_OHWoIie = relay.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn,
+                                                       in_channel//ic_bn, ic_bn//n_elems, n_elems))
+        kernel_OIHWioe = relay.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6))
+
+        # update new attrs
+        new_attrs['channels'] = out_channel
+        new_attrs['data_layout'] = 'NCHW%dc' % ic_bn
+        new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+
+        # Store altered operator's config.
+        new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
+                                  dtype=data_dtype)
+        new_kernel = te.placeholder((out_channel // oc_bn,
+                                     in_channel // ic_bn,
+                                     kh,
+                                     kw,
+                                     ic_bn // n_elems,
+                                     oc_bn,
+                                     n_elems), dtype=kernel_dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'],
+             new_attrs['out_layout'], out_dtype], topi_tmpl)
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return relay.nn.contrib_conv2d_nchwc(data_expr, kernel_OIHWioe, **new_attrs)
+
+    if topi_tmpl == "depthwise_conv2d_NCHWc.x86":
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        if cfg.is_fallback:
+            _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding,
+                                out_dtype, True, data_layout)
+
+        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
+        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
+        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+        assert channel_multiplier == 1
+
+        # update new attrs
+        new_attrs['channels'] = out_channel
+        new_attrs['data_layout'] = 'NCHW%dc' % ic_bn
+        new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn
+        new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+
+        # Store altered operator's config.
+        new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
+                                  dtype=data_dtype)
+        new_kernel = te.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'],
+             new_attrs['out_layout'], out_dtype], topi_tmpl)
+        dispatch_ctx.update(target, new_workload, cfg)
+        return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)
+
     return None
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index d875f8d6bd4b6..b4521357b4f03 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -169,7 +169,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo
     cfg.define_split("tile_ic", in_channel, num_outputs=2)
     cfg.define_split("tile_oc", num_filter, num_outputs=2)
-    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64, policy="verbose")
     if is_kernel_1x1:
        cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
     else:
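
Note on how these changes get exercised: with an ARM CPU target, Relay's AlterOpLayout pass calls the _alter_conv2d_layout handler above, which rewrites NCHW conv2d into contrib_conv2d_NCHWc / contrib_depthwise_conv2d_nchwc so the reused topi.x86 NCHWc schedules are selected. A minimal sketch under assumed names (the mobilenet workload, the aarch64 target string, and opt_level=3 are illustrative, not part of this patch):

# Sketch only: compile a Relay model for an arm_cpu target so the strategies
# and the alter-op path in this patch are exercised. Workload, target triple,
# and opt_level are assumptions for illustration.
import tvm
from tvm import relay
from tvm.relay import testing

# Any NCHW float32 network works; mobilenet is a convenient smoke test.
mod, params = testing.mobilenet.get_workload(batch_size=1, dtype="float32")

# An LLVM target keyed as arm_cpu picks up the strategies registered above.
target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"

# opt_level=3 enables AlterOpLayout, which invokes _alter_conv2d_layout and
# converts conv2d / depthwise conv2d to their NCHWc contrib forms.
with tvm.transform.PassContext(opt_level=3):
    compiled = relay.build(mod, target=target, params=params)
    # Depending on the TVM version, relay.build returns either a
    # (graph, lib, params) tuple or a graph runtime factory module.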
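
The conv2d_NCHWc_int8.x86 branch packs the OIHW kernel into a 7-D OIHW{i}{o}{e} tensor through the transpose/reshape chain above. A small NumPy sketch that mirrors that chain and checks where each element lands (the block sizes oc_bn=8 and ic_bn=8 are illustrative; the patch takes them from the AutoTVM config and fixes n_elems=4):

# Sketch only: verify the OIHW -> OIHW{i}{o}{e} packing used by the int8 path.
import numpy as np

out_channel, in_channel, kh, kw = 32, 16, 3, 3
oc_bn, ic_bn, n_elems = 8, 8, 4  # illustrative block sizes

kernel = np.arange(out_channel * in_channel * kh * kw).reshape(
    out_channel, in_channel, kh, kw)                                    # OIHW

# Mirror the relay.transpose / relay.reshape chain from _alter_conv2d_layout.
k = kernel.transpose(1, 2, 3, 0)                                        # IHWO
k = k.reshape(in_channel, kh, kw, out_channel // oc_bn, oc_bn)          # IHWOo
k = k.transpose(3, 1, 2, 4, 0)                                          # OHWoI
k = k.reshape(out_channel // oc_bn, kh, kw, oc_bn,
              in_channel // ic_bn, ic_bn)                               # OHWoIi
k = k.reshape(out_channel // oc_bn, kh, kw, oc_bn,
              in_channel // ic_bn, ic_bn // n_elems, n_elems)           # OHWoIie
packed = k.transpose(0, 4, 1, 2, 5, 3, 6)                               # OIHWioe

# Each packed block must come from the expected OIHW position.
for occ in range(out_channel // oc_bn):
    for icc in range(in_channel // ic_bn):
        for icb in range(ic_bn // n_elems):
            for ocb in range(oc_bn):
                for e in range(n_elems):
                    np.testing.assert_array_equal(
                        packed[occ, icc, :, :, icb, ocb, e],
                        kernel[occ * oc_bn + ocb,
                               icc * ic_bn + icb * n_elems + e, :, :])

print("packed shape:", packed.shape)  # (4, 2, 3, 3, 2, 8, 4)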