[RFC] Improve quantized convolution performance for armv8 architectures (apache#5754)

* Improve quantized conv2d performance for armv8

Signed-off-by: Giuseppe Rossini <[email protected]>
Change-Id: I3a3d29f5332dd9b3354e8e0dfb24677a521f9c8f

* Add ASF header to conv2d_gemm.py

Change-Id: I33853279e39c849ae1b555a9c91d7557985a0a35

* Run clang-format-10 on c++ files

Change-Id: Ieee22f032e595dabfc1616ab33466fcbf8d94365

* Fix pylint errors/warnings

Change-Id: I435d4d7bca7500db99547f4401fdc0d0995a1ff4

* Fix pylint errors/warnings in topi

Change-Id: I2fc1ad8453e9020072ab967c849df5390c2967b5

* Fix legalizations tests for aarch64

Change-Id: I0a67a49a7849f52ef7d57b9292ce9125bbb7cb2c

* Reintroduce conv2d_nhwc_spatial_pack.arm_cpu and int16 cast

Change-Id: I91b67fabd475e90a9b75f2dd5ecfee851265e0bb

* Switch type of legalization depending on the strategy used

Change-Id: I9a03040a8c40a6cd2658ed14c3751e05a8e19f2b

* Revert last commit

Change-Id: Ice34101e358e3ce8ebfb12c58f73e910ba5de8e8

* Fix the auto-tuner by registering the correct schedules

Change-Id: Id9273688b2620e1ea849ab01b4c46af8fbf37fd0

* Address review comments

Change-Id: Ia1755a0af7b6d159072d9f0c93c932c481101e48

* Improve usability and readability of conv2d_gemm_weight_transform

Change-Id: I3333186bbc2fe4054b58ce15d910e3be7b315482

* Change variable name to weight in Conv2DGemmWeightTransformRel

Change-Id: Ifb5f1f33af7512fe67c6b049b20a42a0bb2d26c9

* Fix clang-10 linting errors

Change-Id: I25ccc844d9cee23766096e1daddb6180abc413a6

* Trigger tests

Change-Id: Id37706fb7cf77a87a3cc817ecf8046297d9ca95a
Giuseppe Rossini authored and zhiics committed Jul 2, 2020
1 parent 3054749 commit 713d5ed
Showing 14 changed files with 1,065 additions and 14 deletions.
11 changes: 11 additions & 0 deletions include/tvm/relay/attrs/nn.h
@@ -187,6 +187,17 @@ struct ConvWinogradWeightTransformAttrs : public tvm::AttrsNode<ConvWinogradWeightTransformAttrs> {
}
};

/*! \brief Attributes used in gemm weight transformation operators */
struct ConvGemmWeightTransformAttrs : public tvm::AttrsNode<ConvGemmWeightTransformAttrs> {
int tile_rows;
int tile_cols;

TVM_DECLARE_ATTRS(ConvGemmWeightTransformAttrs, "relay.attrs.ConvGemmWeightTransformAttrs") {
TVM_ATTR_FIELD(tile_rows).describe("Tile rows of the weight transformation for ConvGemm.");
TVM_ATTR_FIELD(tile_cols).describe("Tile columns of the weight transformation for ConvGemm.");
}
};
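For intuition, tile_rows and tile_cols control how the flattened weight matrix is blocked for the GEMM. Below is a hedged numpy sketch of the idea (conceptual only: the function name is hypothetical, and the exact interleaving is defined by the topi compute, not by this reference):

import numpy as np

def gemm_weight_blocking_sketch(kernel_hwio, tile_rows, tile_cols):
    # Flatten an HWIO kernel into a (K, N) GEMM operand:
    # K = KH * KW * IC (contraction axis), N = OC (output channels).
    kh, kw, ic, oc = kernel_hwio.shape
    flat = kernel_hwio.reshape(kh * kw * ic, oc)
    # Pad K up to a multiple of tile_cols and N up to a multiple of tile_rows.
    k_pad = -flat.shape[0] % tile_cols
    n_pad = -flat.shape[1] % tile_rows
    flat = np.pad(flat, ((0, k_pad), (0, n_pad)))
    K, N = flat.shape
    # Split into (tile_rows x tile_cols) tiles, one per (n_outer, k_outer) pair.
    return (flat.reshape(K // tile_cols, tile_cols, N // tile_rows, tile_rows)
                .transpose(2, 0, 3, 1))

print(gemm_weight_blocking_sketch(
    np.ones((3, 3, 64, 64), dtype="int8"), tile_rows=4, tile_cols=16).shape)
# -> (16, 36, 4, 16), i.e. (N // tile_rows, K // tile_cols, tile_rows, tile_cols)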

/*! \brief Attributes used in convolution operators with winograd algorithm */
struct Conv2DWinogradAttrs : public tvm::AttrsNode<Conv2DWinogradAttrs> {
int tile_size;
17 changes: 17 additions & 0 deletions python/tvm/relay/op/nn/_nn.py
@@ -446,6 +446,23 @@ def compute_mirror_pad(attrs, inputs, out_dtype):
reg.register_pattern("nn.contrib_conv2d_winograd_without_weight_transform",
OpPattern.OUT_ELEMWISE_FUSABLE)

# conv2d_gemm related operators
reg.register_strategy("nn.contrib_conv2d_gemm_without_weight_transform",
strategy.conv2d_gemm_without_weight_transform_strategy)
reg.register_pattern("nn.contrib_conv2d_gemm_without_weight_transform",
OpPattern.OUT_ELEMWISE_FUSABLE)

@reg.register_compute("nn.contrib_conv2d_gemm_weight_transform")
def compute_contrib_conv2d_gemm_weight_transform(attrs, inputs, out_dtype):
"""Compute definition of contrib_conv2d_gemm_weight_transform"""
out = topi.nn.conv2d_gemm_weight_transform(
inputs[0], attrs.tile_rows, attrs.tile_cols)
return [out]

reg.register_schedule("nn.contrib_conv2d_gemm_weight_transform",
strategy.schedule_conv2d_gemm_weight_transform)
reg.register_pattern("nn.contrib_conv2d_gemm_weight_transform",
OpPattern.OUT_ELEMWISE_FUSABLE)

@reg.register_compute("nn.contrib_conv2d_winograd_weight_transform")
def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype):
91 changes: 91 additions & 0 deletions python/tvm/relay/op/nn/nn.py
@@ -2046,6 +2046,74 @@ def contrib_conv2d_winograd_without_weight_transform(data,
kernel_layout, out_layout, out_dtype)


def contrib_conv2d_gemm_without_weight_transform(data,
weight,
strides=(1, 1),
padding=(0, 0),
dilation=(1, 1),
groups=1,
channels=None,
kernel_size=None,
data_layout="NCHW",
kernel_layout="OIHW",
out_layout="",
out_dtype=""):
r"""2D convolution with gemm algorithm.
The basic parameters are the same as the ones in vanilla conv2d.
It assumes the weight is pre-transformed by nn.contrib_conv2d_gemm_weight_transform
Parameters
----------
data : tvm.relay.Expr
The input data to the operator.
weight : tvm.relay.Expr
The weight expressions.
strides : tuple of int, optional
The strides of convolution.
padding : tuple of int, optional
The padding of convolution on both sides of inputs before convolution.
dilation : tuple of int, optional
Specifies the dilation rate to be used for dilated convolution.
groups : int, optional
Number of groups for grouped convolution.
channels : int, optional
Number of output channels of this convolution.
kernel_size : tuple of int, optional
The spatial dimensions of the convolution kernel.
data_layout : str, optional
Layout of the input.
kernel_layout : str, optional
Layout of the weight.
out_layout : str, optional
Layout of the output; by default, out_layout is the same as data_layout.
out_dtype : str, optional
Specifies the output data type for mixed precision conv2d.

Returns
-------
result : tvm.relay.Expr
The computed result.
"""
# convert 2-way padding to 4-way padding
padding = get_pad_tuple2d(padding)
return _make.contrib_conv2d_gemm_without_weight_transform(
data, weight, strides, padding, dilation,
groups, channels, kernel_size, data_layout,
kernel_layout, out_layout, out_dtype)
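As a usage sketch of the pair of ops (shapes, dtypes, and the tile sizes below are illustrative assumptions; in practice the alter-op-layout pass is expected to insert these calls automatically):

from tvm import relay

data = relay.var("data", shape=(1, 56, 56, 64), dtype="uint8")     # NHWC
kernel = relay.var("kernel", shape=(3, 3, 64, 64), dtype="uint8")  # HWIO

# Pre-transform the weights once, then feed them to the gemm conv2d.
wt = relay.nn.contrib_conv2d_gemm_weight_transform(kernel, tile_rows=4, tile_cols=16)
conv = relay.nn.contrib_conv2d_gemm_without_weight_transform(
    data, wt, strides=(1, 1), padding=(1, 1), channels=64, kernel_size=(3, 3),
    data_layout="NHWC", kernel_layout="HWIO", out_dtype="int32")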


def contrib_conv2d_nchwc(data,
kernel,
strides=(1, 1),
@@ -2204,6 +2272,29 @@ def contrib_conv2d_winograd_weight_transform(weight,
return _make.contrib_conv2d_winograd_weight_transform(weight, tile_size)


def contrib_conv2d_gemm_weight_transform(weights, tile_rows, tile_cols):
r"""Weight Transformation part for 2D convolution with gemm algorithm.
We separate this as a single op to enable pre-compute for inference.
Use this together with nn.contrib_conv2d_gemm_without_weight_transform
Parameters
----------
weights : tvm.relay.Expr
The weight expressions.
tile_rows : int
Tile rows of the weight transformation for ConvGemm.
tile_cols : int
Tile columns of the weight transformation for ConvGemm.

Returns
-------
result : tvm.relay.Expr
The computed result.
"""
return _make.contrib_conv2d_gemm_weight_transform(weights, tile_rows, tile_cols)


def contrib_conv3d_winograd_weight_transform(weight,
tile_size):
r"""Weight Transformation part for 3D convolution with winograd algorithm.
42 changes: 42 additions & 0 deletions python/tvm/relay/op/strategy/arm_cpu.py
@@ -112,6 +112,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_direct_simd),
name='conv2d_direct_simd.micro_dev')
elif kernel_layout == "HWIO":
is_aarch64 = "aarch64" in str(isa.target)

if is_aarch64 and data.dtype in ["int8", "uint8"]:
strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized),
name="conv2d_NHWC_quantized.arm_cpu")

strategy.add_implementation(
wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
@@ -246,6 +254,40 @@ def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out_type, target):
format(layout))
return strategy

def wrap_compute_conv2d_gemm(topi_compute):
"""wrap topi compute for conv2d_gemm"""

def _compute_conv2d_gemm(attrs, inputs, out_type):
padding = attrs.get_int_tuple("padding")
strides = attrs.get_int_tuple("strides")
dilation = attrs.get_int_tuple("dilation")
out_dtype = attrs.get_str("out_dtype")
channels = attrs['channels']
kernel_size = attrs['kernel_size']
out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
return [topi_compute(inputs[0], inputs[1], strides, padding,
dilation, out_dtype, kernel_size, channels)]

return _compute_conv2d_gemm

@conv2d_gemm_without_weight_transform_strategy.register("arm_cpu")
def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d_winograd_without_weight_transfrom arm cpu strategy"""
layout = attrs.data_layout
data = inputs[0]
strategy = _op.OpStrategy()

if layout == "NHWC" and data.dtype in ['int8', 'uint8']:
strategy.add_implementation(
wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform),
wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized),
name="conv2d_NHWC_quantized_without_transform.arm_cpu")
else:
raise RuntimeError(
"Unsupported conv2d_gemm_without_weight_transform layout {0} with datatype {1}".
format(layout, data.dtype))
return strategy
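To exercise this strategy end to end, one can build an int8 NHWC conv2d for an AArch64 target. A minimal sketch, assuming a TVM build with LLVM AArch64 support (the target string and shapes are illustrative):

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 28, 28, 32), dtype="int8")     # NHWC
weight = relay.var("weight", shape=(3, 3, 32, 32), dtype="int8")  # HWIO
conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3), channels=32,
                       padding=(1, 1), data_layout="NHWC",
                       kernel_layout="HWIO", out_dtype="int32")
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

# On this target the int8/uint8 NHWC implementation registered above is eligible.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="llvm -mtriple=aarch64-linux-gnu")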

@conv2d_transpose_strategy.register(["arm_cpu", "micro_dev"])
def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d_transpose arm cpu strategy"""
13 changes: 13 additions & 0 deletions python/tvm/relay/op/strategy/generic.py
@@ -266,6 +266,12 @@ def conv2d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, target):
"""conv2d_winograd_without_weight_transfrom generic strategy"""
raise ValueError("No generic implemenation for conv2d_winograd_without_weight_transform")

# conv2d_gemm_without_weight_transform
@override_native_generic_func("conv2d_gemm_without_weight_transform_strategy")
def conv2d_gemm_without_weight_transform_strategy(attrs, inputs, out_type, target):
"""conv2d_gemm_without_weight_transfrom generic strategy"""
raise ValueError("No generic implemenation for conv2d_gemm_without_weight_transform")

# conv2d_winograd_weight_transform
@generic_func
def schedule_conv2d_winograd_weight_transform(attrs, outs, target):
@@ -280,6 +286,13 @@ def schedule_conv2d_winograd_nnpack_weight_transform(attrs, outs, target):
with target:
return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs)

# conv2d_gemm_weight_transform
@generic_func
def schedule_conv2d_gemm_weight_transform(attrs, outs, target):
"""Schedule conv2d_gemm_weight_transform"""
with target:
return topi.generic.schedule_conv2d_gemm_weight_transform(outs)

# deformable_conv2d
def wrap_compute_deformable_conv2d(topi_compute):
"""wrap deformable_conv2d topi compute"""
8 changes: 7 additions & 1 deletion python/tvm/relay/qnn/op/legalizations.py
@@ -237,17 +237,23 @@ def is_fast_int8_on_arm():
target = tvm.target.Target.current(allow_none=False)
return '+v8.2a,+dotprod' in ' '.join(target.options)

def is_aarch64_arm():
""" Checks whether we are compiling for an AArch64 target. """
target = tvm.target.Target.current(allow_none=False)
return 'aarch64' in ' '.join(target.options)

########################
# ARM CPU legalizations.
########################

@qnn_conv2d_legalize.register('arm_cpu')
def _qnn_conv2d_legalize_arm_cpu(attrs, inputs, types):
# ARM prefers the dtypes to be same.
- if is_fast_int8_on_arm():
+ if (is_aarch64_arm() and attrs["data_layout"] == "NHWC") or is_fast_int8_on_arm():
return helper_change_dtypes_to_be_same(attrs, inputs, types, relay.qnn.op.conv2d)
return helper_no_fast_int8_hw_legalization(attrs, inputs, types, relay.nn.conv2d)
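For intuition about the dtype unification that helper_change_dtypes_to_be_same performs: an affine-quantized value real = scale * (q - zero_point) is unchanged when the stored value and the zero point are shifted together, so an int8 tensor can be re-expressed in uint8. A hedged numpy illustration of that arithmetic (not the helper's actual code):

import numpy as np

kernel_int8 = np.array([-128, -1, 0, 127], dtype=np.int8)
zp_int8 = 0  # illustrative zero point

# Shift values and zero point by +128; scale * (q - zp) is preserved.
kernel_uint8 = (kernel_int8.astype(np.int32) + 128).astype(np.uint8)
zp_uint8 = zp_int8 + 128
print(kernel_uint8, zp_uint8)  # [0 127 128 255] 128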


@qnn_dense_legalize.register('arm_cpu')
def _qnn_dense_legalize_arm_cpu(attrs, inputs, types):
# ARM prefers the dtypes to be same.
82 changes: 82 additions & 0 deletions src/relay/op/nn/convolution.cc
@@ -77,13 +77,41 @@ Expr MakeConvWinograd(Expr data, Expr weight, int tile_size, Array<IndexExpr> strides,
return Call(op, {data, weight}, Attrs(attrs), {});
}

template <typename T>
Expr MakeConvGemm(Expr data, Expr weight, Array<IndexExpr> strides, Array<IndexExpr> padding,
Array<IndexExpr> dilation, int groups, IndexExpr channels,
Array<IndexExpr> kernel_size, std::string data_layout, std::string kernel_layout,
std::string out_layout, DataType out_dtype, std::string op_name) {
auto attrs = make_object<T>();
attrs->strides = std::move(strides);
attrs->padding = std::move(padding);
attrs->dilation = std::move(dilation);
attrs->groups = groups;
attrs->channels = std::move(channels);
attrs->kernel_size = std::move(kernel_size);
attrs->data_layout = std::move(data_layout);
attrs->kernel_layout = std::move(kernel_layout);
attrs->out_layout = std::move(out_layout);
attrs->out_dtype = std::move(out_dtype);
const Op& op = Op::Get(op_name);
return Call(op, {data, weight}, Attrs(attrs), {});
}

Expr MakeConvWinogradWeightTransform(Expr weight, int tile_size, std::string op_name) {
auto attrs = make_object<ConvWinogradWeightTransformAttrs>();
attrs->tile_size = tile_size;
const Op& op = Op::Get(op_name);
return Call(op, {weight}, Attrs(attrs), {});
}

Expr MakeConvGemmWeightTransform(Expr weight, int tile_rows, int tile_cols, std::string op_name) {
auto attrs = make_object<ConvGemmWeightTransformAttrs>();
attrs->tile_rows = tile_rows;
attrs->tile_cols = tile_cols;
const Op& op = Op::Get(op_name);
return Call(op, {weight}, Attrs(attrs), {});
}

template <typename T>
Expr MakeConvTranspose(Expr data, Expr weight, Array<IndexExpr> strides, Array<IndexExpr> padding,
Array<IndexExpr> dilation, int groups, IndexExpr channels,
@@ -504,6 +532,60 @@ weight transformation in advance.
.set_support_level(10)
.add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel);

// relay.nn.contrib_conv2d_gemm_without_weight_transform
TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_without_weight_transform")
.set_body_typed([](Expr data, Expr weight, Array<IndexExpr> strides, Array<IndexExpr> padding,
Array<IndexExpr> dilation, int groups, IndexExpr channels,
Array<IndexExpr> kernel_size, std::string data_layout,
std::string kernel_layout, std::string out_layout, DataType out_dtype) {
return MakeConvGemm<Conv2DAttrs>(
data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout,
kernel_layout, out_layout, out_dtype, "nn.contrib_conv2d_gemm_without_weight_transform");
});

RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_without_weight_transform")
.describe(R"code(Compute conv2d with gemm algorithm. Only supports NHWC layout.
This operator assumes the weight tensor is already pre-transformed by
nn.contrib_conv2d_gemm_weight_transform.
- **data**: Input is 4D array of shape (batch_size, height, width, in_channels)
- **weight**: Any shape
We do not check the shape for this input tensor. Since different backend
has different layout strategy.
- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width)
)code" TVM_ADD_FILELINE)
.set_attrs_type<Conv2DAttrs>()
.set_num_inputs(2)
.add_argument("data", "Tensor", "The input tensor.")
.add_argument("weight", "Tensor", "The weight tensor.")
.set_support_level(10)
.add_type_rel("Conv2DGemm", Conv2DGemmRel<Conv2DAttrs>)
.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ConvInferCorrectLayout<Conv2DAttrs>);

// relay.nn.contrib_conv2d_gemm_weight_transform

TVM_REGISTER_NODE_TYPE(ConvGemmWeightTransformAttrs);

TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_gemm_weight_transform")
.set_body_typed([](Expr weights, int tile_rows, int tile_cols) {
return MakeConvGemmWeightTransform(weights, tile_rows, tile_cols,
"nn.contrib_conv2d_gemm_weight_transform");
});

RELAY_REGISTER_OP("nn.contrib_conv2d_gemm_weight_transform")
.describe(R"code(Weight transformation of GEMM convolution algorithm.
Separate this into another operator in order to enable Precompute Pass to compute the
weight transformation in advance.
)code" TVM_ADD_FILELINE)
.set_attrs_type<ConvGemmWeightTransformAttrs>()
.set_num_inputs(1)
.add_argument("weights", "Tensor", "The weights tensor.")
.set_support_level(10)
.add_type_rel("Conv2DGemmWeightTransform", Conv2DGemmWeightTransformRel);

// Positional relay function to create conv2d NCHWc operator
// used by frontend FFI.
TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc")
(Diffs for the remaining changed files not shown.)