Skip to content

Commit

Permalink
fix doc, add support for int32_lanes=4, signed int
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhliu committed Sep 28, 2019
1 parent 43340b1 commit 23a8b02
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 36 deletions.
7 changes: 4 additions & 3 deletions topi/python/topi/arm_cpu/conv2d_int8.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from ..generic import conv2d as conv2d_generic
from .. import nn
from ..nn.conv2d import _get_workload as _get_conv2d_workload
from .tensor_intrin import dot_2x1x2_uint8_uint8_uint32
from .tensor_intrin import dot_int8_int8_int32


def _get_default_config(cfg, data, kernel, strides, padding, out_dtype):
Expand Down Expand Up @@ -98,12 +98,13 @@ def traverse(op):
args = [s, cfg, data_vec, conv_out, outs[0]]
# int8 conv kernel is 7-dim
_, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape)
dtype = "uint" if data.dtype == "uint8" else "int"
if kh == 1 and kw == 1:
conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(
*args, int32_lanes=2, intrin=dot_2x1x2_uint8_uint8_uint32())
*args, int32_lanes=4, intrin=dot_int8_int8_int32(int32_lanes=4, dtype=dtype))
else:
conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(
*args, int32_lanes=2, intrin=dot_2x1x2_uint8_uint8_uint32())
*args, int32_lanes=4, intrin=dot_int8_int8_int32(int32_lanes=4, dtype=dtype))

scheduled_ops.append(op)

Expand Down
62 changes: 37 additions & 25 deletions topi/python/topi/arm_cpu/tensor_intrin.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,18 @@

import tvm

def dot_2x1x2_uint8_uint8_uint32():
def dot_int8_int8_int32(int32_lanes, dtype='uint'):
"""
    Int8 dot product by every 4 elements using ARM v8.2 udot/sdot instructions.
This function takes two arrays of int8 datatype -- data[4] and
kernel[2][4] -- and computes a dot product of data[4] with every
4 elements of kernels, resulting in output[2] of uint32 datatype.
    kernel[int32_lanes][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[int32_lanes] of int32/uint32 datatype.
The pseudo code is as follows.
.. code-block:: c
void dot_2x1x2_uint8_uint8_uint32(int8 data[4], int8 kernel[16][4], uint32 output[16]){
for (int i = 0; i < 2; i++){
    void dot_int8_int8_int32(int8 data[4], int8 kernel[int32_lanes][4], int32 output[int32_lanes]){
for (int i = 0; i < int32_lanes; i++){
out[i] = 0;
for (int k = 0; k < 4; k++){
out[i] += data[k] * kernel[i][k]
Expand All @@ -41,53 +43,63 @@ def dot_2x1x2_uint8_uint8_uint32():
function returns a TensorIntrin that can be used to tensorize
a schedule.
Parameters
----------
int32_lanes: int
    How many int32/uint32 lanes of output to produce
dtype: str, optional, {"uint", "int"}
Whether it works on unsigned int or signed int
Returns
-------
intrin : TensorIntrin
    The ARM int8/uint8 TensorIntrin that can be used in tensorizing schedule
"""
int32_lanes = 2 # 2 uint32 lanes
num_int8_elements = 4 # 4 uint8 elements in int32

data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='uint8', name='kernel')
data = tvm.placeholder((num_int8_elements,), dtype='%s8' % dtype, name='data')
kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='%s8' % dtype, name='kernel')

k = tvm.reduce_axis((0, num_int8_elements), name='k')
C = tvm.compute((int32_lanes,),
lambda i: tvm.sum(data[k].astype('uint32') *
kernel[i, k].astype('uint32'),
lambda i: tvm.sum(data[k].astype('%s32' % dtype) *
kernel[i, k].astype('%s32' % dtype),
axis=k), name="C")

a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
a_buffer = tvm.decl_buffer(data.shape, dtype='%s8' % dtype, name="a_buffer",
offset_factor=1,
strides=[1])
b_buffer = tvm.decl_buffer(kernel.shape, dtype='uint8', name="b_buffer",
b_buffer = tvm.decl_buffer(kernel.shape, dtype='%s8' % dtype, name="b_buffer",
offset_factor=1,
strides=[tvm.var('s'), 1])

def _intrin_func(ins, outs):
def _instr(index):
ib = tvm.ir_builder.create()
if index == 1:
ib.emit(outs[0].vstore(0, tvm.const(0, 'uint32x2')))
ib.emit(outs[0].vstore(0, tvm.const(0, '%s32x%d' % (dtype, int32_lanes))))
return ib.get()

a_int8 = ins[0].vload([0], "uint8x4")
re_int32 = tvm.call_pure_intrin('uint32', 'reinterpret', a_int8)
vec_ai32 = re_int32.astype('uint32x2')
dtype_a = '%s8x%d' % (dtype, num_int8_elements)
dtype_b = '%s8x%d' % (dtype, int32_lanes * num_int8_elements)
dtype_c = '%s32x%d' % (dtype, int32_lanes)

a_int8 = ins[0].vload([0], dtype_a)
re_int32 = tvm.call_pure_intrin('%s32' % dtype, 'reinterpret', a_int8)
# broadcast a
vec_ai32 = re_int32.astype(dtype_c)

vec_a = tvm.call_pure_intrin('uint8x8', 'reinterpret', vec_ai32)
vec_b = ins[1].vload([0, 0], "uint8x8")
vec_c = tvm.const(0, 'uint32x2')
vec_a = tvm.call_pure_intrin(dtype_b, 'reinterpret', vec_ai32)
vec_b = ins[1].vload([0, 0], dtype_b)
vec_c = outs[0].vload([0], dtype_c)

vdot = tvm.call_llvm_intrin('uint32x2',
'llvm.aarch64.neon.udot.v2i32.v8i8',
inst = 'udot' if dtype == 'uint' else 'sdot'
inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % (inst, int32_lanes, int32_lanes * num_int8_elements)
vdot = tvm.call_llvm_intrin(dtype_c,
inst,
tvm.const(2, 'uint32'),
vec_c, vec_a, vec_b)
if index == 0:
ib.emit(outs[0].vstore(0, vdot))
else:
ib.emit(outs[0].vstore(0, vdot + outs[0].vload([0], 'uint32x2')))
ib.emit(outs[0].vstore(0, vdot))
return ib.get()

# body, reset, update
Expand Down
2 changes: 2 additions & 0 deletions topi/python/topi/generic/conv2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements):
Normally the inner most pattern takes two int8/uint8 tensors
data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
produces a dot product int32/uint32 output[int32_lanes].
Parameters
----------
int32_lanes : int
Expand Down Expand Up @@ -69,6 +70,7 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements):
Normally the inner most pattern takes two int8/uint8 tensors
data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
produces a dot product int32/uint32 output[int32_lanes].
Parameters
----------
int32_lanes : int
Expand Down
17 changes: 9 additions & 8 deletions topi/recipe/conv/test_conv_int8_arm.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,13 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f
LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup")
SPEEDUP_ARRAY = []
for i, wkl in enumerate(WORKLOADS):
fp32_time = run_inference('float32', 'float32', 'float32', *wkl)
int8_time = run_inference('uint8', 'uint8', 'uint32', *wkl)
kernel_h = wkl[4]
kernel_w = wkl[5]
LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", "
+ str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time))

SPEEDUP_ARRAY.append(fp32_time/int8_time)
for dtype in ["uint", "int"]:
fp32_time = run_inference('float32', 'float32', 'float32', *wkl)
int8_time = run_inference('%s8' % dtype, '%s8' % dtype, '%s32' % dtype, *wkl)
kernel_h = wkl[4]
kernel_w = wkl[5]
LOGGER.info("[%s] Workload#" % dtype + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", "
+ str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time))

SPEEDUP_ARRAY.append(fp32_time/int8_time)
LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))))

0 comments on commit 23a8b02

Please sign in to comment.