From ef5d6c8712904bfc5590f3a2f67c1abd726bf4c2 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 14 Dec 2022 19:31:48 +0300 Subject: [PATCH 01/84] add skylake-avx512 tests --- python/tvm/testing/utils.py | 23 +++++++++++ .../python/integration/test_auto_tensorize.py | 6 +++ tests/python/relay/test_op_level1.py | 41 +++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 899b05440388..04a9c55c36f1 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1027,6 +1027,23 @@ def _has_vnni(): return False +# check avx512 intrinsic groups for SkyLake X +def _has_slavx512(): + arch = platform.machine() + # Only linux is supported for now. + if arch == "x86_64" and sys.platform.startswith("linux"): + with open("/proc/cpuinfo", "r") as content: + ctx = content.read() + check = ("avx512f" in ctx and + "avx512cd" in ctx and + "avx512bw" in ctx and + "avx512dq" in ctx and + "avx512vl" in ctx) + return check + + return False + + requires_arm_dot = Feature("arm_dot", "ARM dot product", run_time_check=_arm_dot_supported) @@ -1035,6 +1052,12 @@ def _has_vnni(): ) +requires_skylake_avx512 = Feature( + # TODO(vvchernov): check name and long name + "skylake", "x86 SkyLake", run_time_check=lambda: _has_slavx512() and _is_intel() +) + + def _cmake_flag_enabled(flag): flag = tvm.support.libinfo()[flag] diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 572da53b34fd..ac2f34ed35c3 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -177,6 +177,12 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos asm = lib.lib.get_source("asm") assert "vpdpbusd" in asm + if "skylake-avx512" in target: + asm = lib.lib.get_source("asm") + assert "vpmaddubsw" in asm + assert "vpmaddwd" in asm + assert "vpaddd" in asm + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) runtime.set_input("data", data_np) runtime.run() diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 3bb9918c7c77..4526bd264f31 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -846,6 +846,47 @@ def test_dense_amx_int8(): np.testing.assert_equal(out, ref) +@tvm.testing.requires_skylake_avx512 +@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) +def test_dense_skylake_avx512(m, n, k): + data_shape = (m, k) + weight_shape = (n, k) + + for data_dtype in ["uint8", "int8"]: + data = relay.var("data", shape=data_shape, dtype=data_dtype) + weight = relay.var("weight", shape=weight_shape, dtype="int8") + bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32") + dense = relay.nn.dense(data, weight, out_dtype="int32") + out = relay.nn.bias_add(dense, bias) + mod = tvm.IRModule.from_expr(out) + + target = "llvm -mcpu=skylake-avx512" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target) + + asm = lib.lib.get_source("asm") + assert "vpmaddubsw" in asm + assert "vpmaddwd" in asm + assert "vpaddd" in asm + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype) + b = np.random.uniform(1, 10, size=weight_shape).astype("int8") + c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32") + + runtime.set_input("data", a) + 
runtime.set_input("weight", b) + runtime.set_input("bias", c) + runtime.run() + + out = runtime.get_output(0).numpy() + ref = np.dot(a.astype("int32"), b.transpose().astype("int32")) + c + + np.testing.assert_equal(out, ref) + + @pytest.mark.skip("Requires GFX10 AMDGPU") def test_dense_rocm_sdot4(): data_shape = (32, 96) From 7c84e41a37a94b9ed137fd4d14a952549b3688be Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 15 Dec 2022 13:53:17 +0300 Subject: [PATCH 02/84] extend tests by skylake-avx512 --- python/tvm/topi/x86/tensor_intrin.py | 2 ++ tests/python/relay/test_op_level10.py | 50 +++++++++++++++++++++++++++ tests/python/relay/test_op_level2.py | 11 +++++- 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py index 3b83fecbf552..4d4fd74ce183 100644 --- a/python/tvm/topi/x86/tensor_intrin.py +++ b/python/tvm/topi/x86/tensor_intrin.py @@ -318,6 +318,7 @@ def _instr(index): else: # Fall back to the normal AVX512 vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32) vec_one = tvm.tir.const(1, "int16x32") + # TODO(vvchernov): vpmaddwd? pair_reduction = tvm.tir.call_llvm_pure_intrin( "int16x32", "llvm.x86.avx512.pmaddubs.w.512", @@ -325,6 +326,7 @@ def _instr(index): vec_a, vec_b, ) + # TODO(vvchernov): vpaddd? quad_reduction = tvm.tir.call_llvm_pure_intrin( "int32x16", "llvm.x86.avx512.pmaddw.d.512", diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index cdf4e734842b..aaefda84b8e9 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -575,6 +575,56 @@ def test_batch_matmul_amx(b, m, n, k): np.testing.assert_equal(out, ref) +@tvm.testing.requires_skylake_avx512 +@pytest.mark.parametrize( + "b,m,n,k", + [ + (16, 32, 128, 96), + (16, 32, 128, 97), + (16, 32, 129, 96), + ], +) +def test_batch_matmul_skylake_avx512(b, m, n, k): + x_shape = (b, m, k) + y_shape = (b, n, k) + z_shape = (b, m, n) + +# TODO(vvchernov): join duplicate code with cascadelake + for lhs_dtype in ["uint8", "int8"]: + x = relay.var("x", shape=x_shape, dtype=lhs_dtype) + y = relay.var("y", shape=y_shape, dtype="int8") + z = relay.var("z", shape=z_shape, dtype="int32") + bmm = relay.nn.batch_matmul(x, y, out_dtype="int32") + out = bmm + z + mod = tvm.IRModule.from_expr(out) + + target = "llvm -mcpu=skylake-avx512" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target) + + asm = lib.lib.get_source("asm") + assert "vpmaddubsw" in asm + assert "vpmaddwd" in asm + assert "vpaddd" in asm + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype) + y_np = np.random.uniform(1, 10, size=y_shape).astype("int8") + z_np = np.random.uniform(1, 10, size=z_shape).astype("int32") + + runtime.set_input("x", x_np) + runtime.set_input("y", y_np) + runtime.set_input("z", z_np) + runtime.run() + + out = runtime.get_output(0).numpy() + ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32") + z_np + + np.testing.assert_equal(out, ref) + + @pytest.mark.skip("Requires GFX10 AMDGPU") def test_batch_matmul_rocm_sdot4(): x_shape = (16, 32, 96) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index ca1adf940029..1a78fba74ba9 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1691,7 +1691,10 @@ class TestConv2DInt8Intrinsics: 
@tvm.testing.fixture def fast_int8_intrinsic(self, target): - if "nehalem" in target or "core-avx2" in target or "skylake-avx512" in target: + if "nehalem" in target or "core-avx2" in target: + return "pmaddubs" + elif "skylake-avx512" in target: + # TODO(vvchernov): vpmaddubsw? vpmaddwd? vpaddd? return "pmaddubs" elif "cascadelake" in target: return "vpdpbusd" @@ -2219,5 +2222,11 @@ def test_conv2d_int8_alter_dtype_vnni(): _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", "vpdpbusd") +@tvm.testing.requires_skylake_avx512 +def test_conv2d_int8_alter_dtype_vnni(): + # TODO(vvchernov): Is check of "vpmaddubsw" and "vpmaddwd" needed? + _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", "vpaddd") + + if __name__ == "__main__": tvm.testing.main() From a1c3b3cf2c3bc8f11c7b49f54a80ec03a3bfb74b Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 15 Dec 2022 14:24:55 +0300 Subject: [PATCH 03/84] lint fixes --- python/tvm/testing/utils.py | 16 ++++++++++------ tests/python/relay/test_op_level10.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 04a9c55c36f1..1899c896d46d 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1034,11 +1034,13 @@ def _has_slavx512(): if arch == "x86_64" and sys.platform.startswith("linux"): with open("/proc/cpuinfo", "r") as content: ctx = content.read() - check = ("avx512f" in ctx and - "avx512cd" in ctx and - "avx512bw" in ctx and - "avx512dq" in ctx and - "avx512vl" in ctx) + check = ( + "avx512f" in ctx + and "avx512cd" in ctx + and "avx512bw" in ctx + and "avx512dq" in ctx + and "avx512vl" in ctx + ) return check return False @@ -1054,7 +1056,9 @@ def _has_slavx512(): requires_skylake_avx512 = Feature( # TODO(vvchernov): check name and long name - "skylake", "x86 SkyLake", run_time_check=lambda: _has_slavx512() and _is_intel() + "skylake", + "x86 SkyLake", + run_time_check=lambda: _has_slavx512() and _is_intel(), ) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index aaefda84b8e9..7b2517fe22c7 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -589,7 +589,7 @@ def test_batch_matmul_skylake_avx512(b, m, n, k): y_shape = (b, n, k) z_shape = (b, m, n) -# TODO(vvchernov): join duplicate code with cascadelake + # TODO(vvchernov): join duplicate code with cascadelake for lhs_dtype in ["uint8", "int8"]: x = relay.var("x", shape=x_shape, dtype=lhs_dtype) y = relay.var("y", shape=y_shape, dtype="int8") From 57fbc5ea5ed0a63471c663bbc2f5c09b6263afc4 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 15 Dec 2022 17:05:43 +0300 Subject: [PATCH 04/84] fix misprinting --- python/tvm/relay/qnn/op/legalizations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 9baabf36a9d8..e84f04941988 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -248,7 +248,7 @@ def helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay_op): Replacing QA + 128 with QA' and (zp_a + 128) with zp_a' We get our new quantized uint8 tensor - scale * (QA' - zp_a') - Similarly we can convert from int8 to uint8. + Similarly we can convert from uint8 to int8. 
Parameters ---------- From 094fd8d9d9c81e54caa7b45467cc3fcfd642cd9c Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 19 Dec 2022 08:50:00 +0300 Subject: [PATCH 05/84] misprinting fix --- tests/python/relay/test_op_level2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 1a78fba74ba9..fb8ad538e258 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2223,7 +2223,7 @@ def test_conv2d_int8_alter_dtype_vnni(): @tvm.testing.requires_skylake_avx512 -def test_conv2d_int8_alter_dtype_vnni(): +def test_conv2d_int8_alter_dtype_avx512(): # TODO(vvchernov): Is check of "vpmaddubsw" and "vpmaddwd" needed? _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", "vpaddd") From b7dff8f17c609af3ddcb9abcde30f35f06d0fe8b Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 19 Dec 2022 09:11:39 +0300 Subject: [PATCH 06/84] TODOs for further development --- src/meta_schedule/space_generator/space_generator.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 926f86cc4ff9..9ed3e7c81fc1 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -23,6 +23,7 @@ namespace meta_schedule { String GetRuleKindFromTarget(const Target& target) { if (target->kind->name == "llvm") { + // TODO(vvchernov): possibly need check target_has_avx512 static const PackedFunc* f_check_vnni = runtime::Registry::Get("tvm.topi.x86.utils.target_has_vnni"); ICHECK(f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry."; @@ -73,6 +74,8 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { Array default_sch_rules; Array default_postprocs; Map default_mutator_probs; + // TODO(vvchernov): check if need separated ScheduleRule, Postproc, Mutator + // for target with skylake-avx512 if (kind == "llvm") { default_sch_rules = ScheduleRule::DefaultLLVM(); default_postprocs = Postproc::DefaultLLVM(); From 78d5e25f44cabb51a15d1af07772a562073bff25 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 19 Dec 2022 10:25:29 +0300 Subject: [PATCH 07/84] add temporarily commented tests for skylake-avx512 due to not yet implemented schedules and postprocs for it.
add TODOs for further check and development --- tests/python/contrib/test_gemm_acc32_vnni.py | 1 + .../python/integration/test_auto_tensorize.py | 19 +++++++++++++++++++ tests/python/relay/test_pass_qnn_legalize.py | 2 ++ .../test_meta_schedule_relay_integration.py | 1 + ..._meta_schedule_schedule_rule_mlt_intrin.py | 1 + .../test_meta_schedule_trace_apply.py | 2 ++ 6 files changed, 26 insertions(+) diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index 9cec823cc58a..d9e702a3926a 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -21,6 +21,7 @@ from tvm import te import numpy as np from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake +# TODO(vvchernov): construct test here or separetely for avx512 (skylake-avx512) from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32 import pytest diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index ac2f34ed35c3..3d3a23fada94 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -284,6 +284,14 @@ def test_vnni_dense(): ) +# TODO(vvchernov): need schedule rules and postprocs for avx512 +# @tvm.testing.requires_skylake_avx512 +# def test_avx512_dense(): +# _test_dense( +# "uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_AVX512, "llvm -mcpu=skylake-avx512 -num-cores 4" +# ) + + @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_dense(): @@ -304,6 +312,14 @@ def test_vnni_conv2d(): ) +# TODO(vvchernov): need schedule rules and postprocs for avx512 +# @tvm.testing.requires_skylake_avx512 +# def test_avx512_conv2d(): +# _test_conv2d( +# "uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_AVX512, "llvm -mcpu=skylake-avx512 -num-cores 4" +# ) + + @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @tvm.testing.requires_gpu def test_dp4a_conv2d(): @@ -331,6 +347,9 @@ def test_vnni_bert_int8(): ) +# TODO(vvchernov): check BERT on skylake-avx512? + + @tvm.testing.requires_gpu @pytest.mark.skip("Slow on CI") def test_dp4a_bert_int8(): diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index a30cd1e73e3f..9272407f819d 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -137,6 +137,7 @@ def _get_mod(data_dtype, kernel_dtype): # Check transformations for platforms with fast Int8 support. ############################################################# # Check that Intel VNNI gets picked up. + # TODO(vvchernov): VNNI is not supported by skylake, cascadelake only with tvm.target.Target("llvm -mcpu=skylake-avx512"): mod = relay.transform.InferType()(mod) legalized_mod = relay.qnn.transform.Legalize()(mod) @@ -230,6 +231,7 @@ def _get_mod(data_dtype, kernel_dtype): # Check transformations for platforms with fast Int8 support. ############################################################# # Check that Intel VNNI gets picked up. 
+ # TODO(vvchernov): VNNI is not supported by skylake, cascadelake only with tvm.target.Target("llvm -mcpu=skylake-avx512"): mod = relay.transform.InferType()(mod) legalized_mod = relay.qnn.transform.Legalize()(mod) diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index d3731cfa1be8..803d6307132d 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -344,6 +344,7 @@ def _test(mod, params, target): assert "vnni" in annotations["schedule_rule"] mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128) + # TODO(vvchernov): repeat for skylake-avx512? _test(mod, params, target="llvm -mcpu=cascadelake") diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py index 54f342c3a5d8..3783eeb45fdb 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py @@ -28,6 +28,7 @@ from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +# TODO(vvchernov): check avx512 for skylake? def test_vnni_conv2d_nchwc(): @T.prim_func def conv2d_nchwc( diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index 9a62207fa261..a77b0246c1ba 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -1131,6 +1131,7 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, T_cast[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_2[ax0, ax1, ax2, ax3, ax4], "int32") +# TODO(vvchernov): construct avx512 reference module (without vnni) def get_conv2d_vnni_mod(intrin_id): @tvm.script.ir_module class Conv2dInt8_NCHWc_scheduled: @@ -2502,6 +2503,7 @@ def apply_trace(sch): verify(Conv2dInt8, apply_trace, Conv2dInt8_target, "cuda", Conv2dInt8_tensorcore_scheduled) +# TODO(vvchernov): test int8 conv2d for avx512 without VNNI def test_conv2d_int8_vnni(): def apply_trace(sch): b0 = sch.get_block(name="compile_engine_const", func_name="main") From df49fe6a6fb8a112d8601b806f0cb71a7ad8ca29 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 09:34:39 +0300 Subject: [PATCH 08/84] update int8-acc32 test for vnni and avx512 w/o it --- tests/python/contrib/test_gemm_acc32_vnni.py | 164 +++++++++---------- 1 file changed, 81 insertions(+), 83 deletions(-) diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index d9e702a3926a..95042ceba473 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -14,107 +14,105 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
-# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition import tvm import tvm.testing from tvm import te import numpy as np -from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake -# TODO(vvchernov): construct test here or separetely for avx512 (skylake-avx512) from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32 -import pytest -@tvm.testing.requires_llvm -@pytest.mark.skip("skip because feature not enabled") -def test_fc_int8_acc32(): - m = 1024 - n = 1024 - k = 1024 - +def verify_fc_int8_acc32( + m = 1024, + n = 1024, + k = 1024, + target="llvm -mcpu=cascadelake"): X = te.placeholder((m, k), name="X", dtype="uint8") - W = te.placeholder((n, k), name="W", dtype="int8") + # W = te.placeholder((n, k), name="W", dtype="int8") + + if not tvm.testing.device_enabled(target): + print("skip because %s is not enabled..." % target) + return + + dev = tvm.device(target, 0) + pc = dot_16x1x16_uint8_int8_int32() + ak = te.reduce_axis((0, k), name="k") + packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") + + t_fc = te.compute( + (m, n), + lambda i, j: te.sum( + X[i, ak].astype("int32") + * packedW[ + tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4 + ].astype("int32"), + axis=ak, + ), + name="F", + ) + t_sch = te.create_schedule(t_fc.op) + a_x, a_y = t_fc.op.axis + (a_k,) = t_fc.op.reduce_axis + + a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16) + a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32) + a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4) + a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4) + t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki) + + t_sch[t_fc].unroll(a_koi) + t_sch[t_fc].tensorize(a_yi, pc) + + t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") + t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) + + # generate the plain data + a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") + b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8") + + packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8") + # This occurs in pre_compute stage + for r_idx in range(n // 16): + for s_idx in range(16 * (k // 4)): + for t_idx in range(4): + packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][ + (s_idx // 16) * 4 + t_idx + ] + + x = tvm.nd.array(a_, dev) + w = tvm.nd.array(packW, dev) + y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev) + result = t_evaluator(x, w, y) peak = 280 print("Peak {} Gops/s".format(peak)) - memory_ops = m * k + n * k + 2 * m * n + # memory_ops = m * k + n * k + 2 * m * n gops_per_mm = 2 * m * n * k + gops_per_sec = gops_per_mm / result.mean / 1e9 + # verify the correctness + tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0) + print( + "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format( + result.mean * 1000, gops_per_sec, gops_per_sec / peak + ) + ) + t_func.export_library("tensorize_acc32.o") + + +@tvm.testing.requires_cascadelake +def test_fc_int8_acc32_vnni(): # For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target # (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the # test, we should use cascadelake setting. - def verify(target="llvm -mcpu=cascadelake"): - if not tvm.testing.device_enabled(target): - print("skip because %s is not enabled..." 
% target) - return - - dev = tvm.device(target, 0) - pc = dot_16x1x16_uint8_int8_int32_cascadelake() - ak = te.reduce_axis((0, k), name="k") - packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") - - t_fc = te.compute( - (m, n), - lambda i, j: te.sum( - X[i, ak].astype("int32") - * packedW[ - tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4 - ].astype("int32"), - axis=ak, - ), - name="F", - ) - t_sch = te.create_schedule(t_fc.op) - a_x, a_y = t_fc.op.axis - (a_k,) = t_fc.op.reduce_axis - - a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16) - a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32) - a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4) - a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4) - t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki) - - t_sch[t_fc].unroll(a_koi) - t_sch[t_fc].tensorize(a_yi, pc) - - t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) - - # generate the plain data - a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") - b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8") - - packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8") - # This occurs in pre_compute stage - for r_idx in range(n // 16): - for s_idx in range(16 * (k // 4)): - for t_idx in range(4): - packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][ - (s_idx // 16) * 4 + t_idx - ] - - x = tvm.nd.array(a_, dev) - w = tvm.nd.array(packW, dev) - y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev) - result = t_evaluator(x, w, y) - - gops_per_sec = gops_per_mm / result.mean / 1e9 - # verify the correctness - tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0) - print( - "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format( - result.mean * 1000, gops_per_sec, gops_per_sec / peak - ) - ) - t_func.export_library("tensorize_acc32.o") + verify_fc_int8_acc32(target = "llvm -mcpu=cascadelake") - verify() +@tvm.testing.requires_skylake_avx512 +def test_fc_int8_acc32_avx512(): + verify_fc_int8_acc32(target = "llvm -mcpu=skylake-avx512") -if __name__ == "__main__": - # The test requires Cascade Lake and newer Intel machines to generate the - # correct AVX512 VNNI instruction. So, disabling the test. 
- # test_fc_int8_acc32() - pass +if __name__ == "__main__": + test_fc_int8_acc32_vnni() + test_fc_int8_acc32_avx512() From d2e02e822392ce38d19f66dca7b2612f1d5712ef Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 09:50:07 +0300 Subject: [PATCH 09/84] pylint fix --- tests/python/contrib/test_gemm_acc32_vnni.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index 95042ceba473..7f1a0027811f 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -22,11 +22,7 @@ from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32 -def verify_fc_int8_acc32( - m = 1024, - n = 1024, - k = 1024, - target="llvm -mcpu=cascadelake"): +def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake"): X = te.placeholder((m, k), name="X", dtype="uint8") # W = te.placeholder((n, k), name="W", dtype="int8") @@ -75,9 +71,7 @@ def verify_fc_int8_acc32( for r_idx in range(n // 16): for s_idx in range(16 * (k // 4)): for t_idx in range(4): - packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][ - (s_idx // 16) * 4 + t_idx - ] + packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][(s_idx // 16) * 4 + t_idx] x = tvm.nd.array(a_, dev) w = tvm.nd.array(packW, dev) @@ -105,12 +99,12 @@ def test_fc_int8_acc32_vnni(): # For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target # (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the # test, we should use cascadelake setting. - verify_fc_int8_acc32(target = "llvm -mcpu=cascadelake") + verify_fc_int8_acc32() @tvm.testing.requires_skylake_avx512 def test_fc_int8_acc32_avx512(): - verify_fc_int8_acc32(target = "llvm -mcpu=skylake-avx512") + verify_fc_int8_acc32(target="llvm -mcpu=skylake-avx512") if __name__ == "__main__": From 0610ea69483931d9f2c1768a1b4baa44042777fd Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 10:15:34 +0300 Subject: [PATCH 10/84] once more pylint fix --- tests/python/integration/test_auto_tensorize.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 3d3a23fada94..d6a7475170db 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -288,7 +288,10 @@ def test_vnni_dense(): # @tvm.testing.requires_skylake_avx512 # def test_avx512_dense(): # _test_dense( -# "uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_AVX512, "llvm -mcpu=skylake-avx512 -num-cores 4" +# "uint8", +# SCH_RULES_FOR_AVX512, +# POSTPROCS_FOR_AVX512, +# "llvm -mcpu=skylake-avx512 -num-cores 4" # ) @@ -316,7 +319,10 @@ def test_vnni_conv2d(): # @tvm.testing.requires_skylake_avx512 # def test_avx512_conv2d(): # _test_conv2d( -# "uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_AVX512, "llvm -mcpu=skylake-avx512 -num-cores 4" +# "uint8", +# SCH_RULES_FOR_AVX512, +# POSTPROCS_FOR_AVX512, +# "llvm -mcpu=skylake-avx512 -num-cores 4" # ) From 751f729b727c1465f3b7afabc0bb08b5afecdf42 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 10:34:34 +0300 Subject: [PATCH 11/84] fix Feature init for skylake --- python/tvm/testing/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 1899c896d46d..fa8d1afaf7d3 
100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1055,9 +1055,8 @@ def _has_slavx512(): requires_skylake_avx512 = Feature( - # TODO(vvchernov): check name and long name - "skylake", - "x86 SkyLake", + "skylake_avx512", + "x86 SkyLake AVX512", run_time_check=lambda: _has_slavx512() and _is_intel(), ) From 36476311272b0570eb6a5486815bb8d92d39f786 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 11:43:00 +0300 Subject: [PATCH 12/84] fix test --- tests/python/contrib/test_gemm_acc32_vnni.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index 7f1a0027811f..c01f7758cb45 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -31,7 +31,10 @@ def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake" return dev = tvm.device(target, 0) - pc = dot_16x1x16_uint8_int8_int32() + # workaround for Target.current() + with tvm.target.Target(target) as target: + pc = dot_16x1x16_uint8_int8_int32() + ak = te.reduce_axis((0, k), name="k") packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") @@ -91,7 +94,7 @@ def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake" result.mean * 1000, gops_per_sec, gops_per_sec / peak ) ) - t_func.export_library("tensorize_acc32.o") + # t_func.export_library("tensorize_acc32.o") @tvm.testing.requires_cascadelake From a439103183fad9dbe22443ac1ff5f2dbfa4e1e16 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 12:01:06 +0300 Subject: [PATCH 13/84] fix intrin names for assert for skylake --- .../python/integration/test_auto_tensorize.py | 5 ++-- tests/python/relay/test_op_level1.py | 5 ++-- tests/python/relay/test_op_level10.py | 5 ++-- tests/python/relay/test_op_level2.py | 28 ++++++++++--------- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index d6a7475170db..57228562221f 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -179,9 +179,8 @@ def tune_and_test(relay_mod, data_np, weight_np, op_name, target, sch_rules, pos if "skylake-avx512" in target: asm = lib.lib.get_source("asm") - assert "vpmaddubsw" in asm - assert "vpmaddwd" in asm - assert "vpaddd" in asm + assert "pmaddubs" in asm + assert "pmaddw" in asm runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) runtime.set_input("data", data_np) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 4526bd264f31..750987e97acf 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -865,9 +865,8 @@ def test_dense_skylake_avx512(m, n, k): lib = relay.build(mod, target=target) asm = lib.lib.get_source("asm") - assert "vpmaddubsw" in asm - assert "vpmaddwd" in asm - assert "vpaddd" in asm + assert "pmaddubs" in asm + assert "pmaddw" in asm dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 7b2517fe22c7..616ca9baf87e 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -603,9 +603,8 @@ def test_batch_matmul_skylake_avx512(b, m, n, k): lib = relay.build(mod, 
target=target) asm = lib.lib.get_source("asm") - assert "vpmaddubsw" in asm - assert "vpmaddwd" in asm - assert "vpaddd" in asm + assert "pmaddubs" in asm + assert "pmaddw" in asm dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index fb8ad538e258..31e01b5e0430 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1692,12 +1692,12 @@ class TestConv2DInt8Intrinsics: @tvm.testing.fixture def fast_int8_intrinsic(self, target): if "nehalem" in target or "core-avx2" in target: - return "pmaddubs" + return ["pmaddubs"] elif "skylake-avx512" in target: # TODO(vvchernov): vpmaddubsw? vpmaddwd? vpaddd? - return "pmaddubs" + return ["pmaddubs", "pmaddw"] elif "cascadelake" in target: - return "vpdpbusd" + return ["vpdpbusd"] else: assert False, "Target should be Skylake or Cascadelake" @@ -1772,10 +1772,11 @@ def assembly( ) def test_uses_intrinsic( self, - fast_int8_intrinsic, + fast_int8_intrinsics, assembly, ): - assert fast_int8_intrinsic in assembly + for fast_int8_intrinsic in fast_int8_intrinsics: + assert fast_int8_intrinsic in assembly # For datatypes that don't have HW support, ensure that code is # generated without the fast int8 intrinsic. @@ -1783,10 +1784,11 @@ def test_uses_intrinsic( @pytest.mark.parametrize("dtypes", [("uint8", "uint8", "int32")]) def test_no_intrinsic( self, - fast_int8_intrinsic, + fast_int8_intrinsics, assembly, ): - assert fast_int8_intrinsic not in assembly + for fast_int8_intrinsic in fast_int8_intrinsics: + assert fast_int8_intrinsic not in assembly # Check that a vectorized instruction is generated for older Intel # generations, because we default to NCHWc layout. @@ -2140,7 +2142,7 @@ def get_subgraph(dtype): np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) -def _test_conv2d_int8_alter_dtype(data_dtype, target, dot_product_instr): +def _test_conv2d_int8_alter_dtype(data_dtype, target, dot_product_instrs): def get_conv2d_nchw( d_shape, w_shape, @@ -2197,7 +2199,8 @@ def get_conv2d_nchw( ): lib = relay.build(mod, target=target, params=params) - assert dot_product_instr in lib.lib.get_source("asm") + for dot_product_instr in dot_product_instrs: + assert dot_product_instr in lib.lib.get_source("asm") rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) @@ -2213,19 +2216,18 @@ def get_conv2d_nchw( @tvm.testing.requires_arm_dot def test_conv2d_int8_alter_dtype_arm(): _test_conv2d_int8_alter_dtype( - "uint8", "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod", "sdot" + "uint8", "llvm -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod", ["sdot"] ) @tvm.testing.requires_cascadelake def test_conv2d_int8_alter_dtype_vnni(): - _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", "vpdpbusd") + _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=cascadelake", ["vpdpbusd"]) @tvm.testing.requires_skylake_avx512 def test_conv2d_int8_alter_dtype_avx512(): - # TODO(vvchernov): Is check of "vpmaddubsw" and "vpmaddwd" needed? 
- _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", "vpaddd") + _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw"]) if __name__ == "__main__": From a2f15875f19e5eec614f461ed314ad8d05133df5 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 14:16:11 +0300 Subject: [PATCH 14/84] small fix --- tests/python/relay/test_op_level2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 31e01b5e0430..7bcc355feb92 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1690,7 +1690,7 @@ class TestConv2DInt8Intrinsics: ) @tvm.testing.fixture - def fast_int8_intrinsic(self, target): + def fast_int8_intrinsics(self, target): if "nehalem" in target or "core-avx2" in target: return ["pmaddubs"] elif "skylake-avx512" in target: From 877edd2651be9885e8f5efdd13945aaf1617cd6e Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 20 Dec 2022 16:25:17 +0300 Subject: [PATCH 15/84] return back fast int8 intrinsic tests --- tests/python/relay/test_op_level2.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 7bcc355feb92..c5568ef6b89e 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1690,16 +1690,13 @@ class TestConv2DInt8Intrinsics: ) @tvm.testing.fixture - def fast_int8_intrinsics(self, target): - if "nehalem" in target or "core-avx2" in target: - return ["pmaddubs"] - elif "skylake-avx512" in target: - # TODO(vvchernov): vpmaddubsw? vpmaddwd? vpaddd? - return ["pmaddubs", "pmaddw"] + def fast_int8_intrinsic(self, target): + if "nehalem" in target or "core-avx2" in target or "skylake-avx512" in target: + return "pmaddubs" elif "cascadelake" in target: - return ["vpdpbusd"] + return "vpdpbusd" else: - assert False, "Target should be Skylake or Cascadelake" + assert False, "Target should be Nehalem or core-avx2 or Skylake or Cascadelake" @tvm.testing.fixture def assembly( @@ -1772,11 +1769,10 @@ def assembly( ) def test_uses_intrinsic( self, - fast_int8_intrinsics, + fast_int8_intrinsic, assembly, ): - for fast_int8_intrinsic in fast_int8_intrinsics: - assert fast_int8_intrinsic in assembly + assert fast_int8_intrinsic in assembly # For datatypes that don't have HW support, ensure that code is # generated without the fast int8 intrinsic. @@ -1784,11 +1780,10 @@ def test_uses_intrinsic( @pytest.mark.parametrize("dtypes", [("uint8", "uint8", "int32")]) def test_no_intrinsic( self, - fast_int8_intrinsics, + fast_int8_intrinsic, assembly, ): - for fast_int8_intrinsic in fast_int8_intrinsics: - assert fast_int8_intrinsic not in assembly + assert fast_int8_intrinsic not in assembly # Check that a vectorized instruction is generated for older Intel # generations, because we default to NCHWc layout. From f25254f6da570ca4eaec1dac39113305dd481cc7 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 21 Dec 2022 09:44:14 +0300 Subject: [PATCH 16/84] test connect of dense and batch_matmul to avx512 tensorization --- python/tvm/topi/x86/dense.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index bb99a632811b..313bafdb0cc7 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -26,7 +26,7 @@ from .. 
import generic, tag from ..utils import get_const_tuple, traverse_inline -from .tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake +from .tensor_intrin import dot_16x1x16_uint8_int8_int32 from .tensor_intrin import dot_32x128x32_u8s8s32_sapphirerapids from .tensor_intrin import acc_32x32_int32_sapphirerapids from .utils import get_simd_32bit_lanes, target_has_vnni, target_has_amx @@ -361,7 +361,7 @@ def split_y(out): s[C].reorder(a_yo, a_xo, a_yi, a_ko, a_xi, a_ki) - pc = dot_16x1x16_uint8_int8_int32_cascadelake() + pc = dot_16x1x16_uint8_int8_int32() s[C].tensorize(a_xi, pc) if C == O: From d4d8bc3d54d371352f56d4551439d709e612cc87 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 21 Dec 2022 11:03:22 +0300 Subject: [PATCH 17/84] extend dense_alter_layout on avx512 (currently) instead of VNNI. some renaming vnni to int8 for the sake of clarity --- python/tvm/relay/qnn/op/legalizations.py | 2 ++ python/tvm/topi/x86/dense_alter_op.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index e84f04941988..ef368a016e0c 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -449,6 +449,7 @@ def _qnn_dense_legalize_arm_cpu(attrs, inputs, types): @qnn_conv2d_legalize.register("cpu") def _qnn_conv2d_legalize_intel_cpu(attrs, inputs, types): + # TODO(vvchernov): not only VNNI # The VNNI transformations prefer uint8 x int8 datatypes. if is_fast_int8_on_intel(): return helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay.qnn.op.conv2d) @@ -457,6 +458,7 @@ def _qnn_conv2d_legalize_intel_cpu(attrs, inputs, types): @qnn_dense_legalize.register("cpu") def _qnn_dense_legalize_intel_cpu(attrs, inputs, types): + # TODO(vvchernov): not only VNNI # The VNNI transformations prefer uint8 x int8 datatypes. if is_fast_int8_on_intel(): return helper_change_dtypes_to_uint8_int8(attrs, inputs, types, relay.qnn.op.dense) diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py index 2cb46b8291fb..a380b7fc9ff7 100644 --- a/python/tvm/topi/x86/dense_alter_op.py +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -24,14 +24,14 @@ from .dense import _default_dense_pack_config from ..utils import get_const_tuple from ..nn import dense_alter_layout -from .utils import target_has_vnni -from .utils import target_has_amx +from .utils import target_has_avx512, target_has_amx from .. import nn -def check_inst_applicable(x, y, allow_padding=False): +def check_int8_applicable(x, y, allow_padding=False): mcpu = tvm.target.Target.current().mcpu - simd_avai = target_has_vnni(mcpu) or target_has_amx(mcpu) + # TODO(vvchernov): may be also target_has_avx2 or lower? 
+ simd_avai = target_has_avx512(mcpu) or target_has_amx(mcpu) return ( simd_avai and "int8" in x.dtype @@ -49,7 +49,7 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type): M, K = get_const_tuple(data_tensor.shape) N, _ = get_const_tuple(weight_tensor.shape) - if check_inst_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8": + if check_int8_applicable(data_tensor, weight_tensor) and data_tensor.dtype == "uint8": weight_layout = "NC16n4c" return relay.nn.contrib_dense_pack(inputs[0], inputs[1], weight_layout, None, out_dtype) @@ -86,10 +86,10 @@ def _alter_dense_layout(attrs, inputs, tinfos, out_type): return None -def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False): +def int8_int8_legalize(inputs, arg_types, op, attrs, need_expand=False): """Legalizes s8, s8 -> s32 GEMM op for VNNI.""" if ( - check_inst_applicable(arg_types[0], arg_types[1], allow_padding=True) + check_int8_applicable(arg_types[0], arg_types[1], allow_padding=True) and arg_types[0].dtype == "int8" ): x, y = inputs @@ -135,7 +135,7 @@ def vnni_legalize(inputs, arg_types, op, attrs, need_expand=False): @nn.dense_legalize.register("cpu") def _dense_legalize(attrs, inputs, arg_types): """Legalizes s8, s8 -> s32 dense for VNNI.""" - return vnni_legalize(inputs, arg_types, relay.nn.dense, attrs) + return int8_int8_legalize(inputs, arg_types, relay.nn.dense, attrs) @nn.batch_matmul_legalize.register("cpu") @@ -143,4 +143,4 @@ def _batch_matmul_legalize(attrs, inputs, arg_types): """Legalizes s8, s8 -> s32 batch_matmul for VNNI.""" if attrs["transpose_a"] or not attrs["transpose_b"]: return None - return vnni_legalize(inputs, arg_types, relay.nn.batch_matmul, attrs, need_expand=True) + return int8_int8_legalize(inputs, arg_types, relay.nn.batch_matmul, attrs, need_expand=True) From 7266380f940a09542e16b32a27a34cba6fd3d3d3 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 21 Dec 2022 11:34:30 +0300 Subject: [PATCH 18/84] more renaming vnni to int8 for dense schedule, compute, strategy for the sake of clarity --- python/tvm/topi/x86/batch_matmul.py | 4 ++-- python/tvm/topi/x86/dense.py | 15 ++++++++------- tests/python/relay/test_op_level1.py | 1 + .../test_meta_schedule_vnni_integration.py | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 9f3bc2951524..6d00cc1ba98c 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -25,7 +25,7 @@ from .. 
import generic, nn from ..transform import layout_transform from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline -from .dense import dense_vnni_schedule, dense_amx_int8_schedule +from .dense import dense_int8_schedule, dense_amx_int8_schedule from .injective import schedule_injective_from_existing from .utils import target_has_vnni, target_has_amx @@ -66,7 +66,7 @@ def batch_matmul_vnni_schedule(cfg, s, C, O, layout_trans): # O: The output of the fused op # Schedule the GEMM part - s, fused_inner = dense_vnni_schedule(cfg, s, C, O, do_parallel=False) + s, fused_inner = dense_int8_schedule(cfg, s, C, O, do_parallel=False) # Parallelize over batch fused = s[O].fuse(O.op.axis[0], fused_inner) s[O].parallel(fused) diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 313bafdb0cc7..b697cf98a625 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -29,7 +29,7 @@ from .tensor_intrin import dot_16x1x16_uint8_int8_int32 from .tensor_intrin import dot_32x128x32_u8s8s32_sapphirerapids from .tensor_intrin import acc_32x32_int32_sapphirerapids -from .utils import get_simd_32bit_lanes, target_has_vnni, target_has_amx +from .utils import get_simd_32bit_lanes, target_has_avx512, target_has_amx def _schedule_dense_pack_template(cfg, s, C, O): @@ -302,8 +302,8 @@ def _callback(op): if "dense_int8" in op.tag: if target_has_amx(mcpu): dense_amx_int8_schedule(cfg, s, op.output(0), outs[0]) - elif target_has_vnni(mcpu): - dense_vnni_schedule(cfg, s, op.output(0), outs[0]) + elif target_has_avx512(mcpu): + dense_int8_schedule(cfg, s, op.output(0), outs[0]) traverse_inline(s, outs[0].op, _callback) return s @@ -315,8 +315,8 @@ def dense_int8_compute(cfg, X, packed_w, bias=None): n_o, _, n_i, _ = packed_w.shape ak = te.reduce_axis((0, k), name="k") mcpu = tvm.target.Target.current().mcpu - if target_has_vnni(mcpu): - target_attr = {"schedule_rule": "meta_schedule.x86.dense_vnni"} + if target_has_avx512(mcpu): + target_attr = {"schedule_rule": "meta_schedule.x86.dense_int8"} else: target_attr = None @@ -339,8 +339,9 @@ def dense_int8_compute(cfg, X, packed_w, bias=None): return C -def dense_vnni_schedule(cfg, s, C, O, do_parallel=True): - """Schedule dense compute using VNNI vpdpbusd instruction""" +def dense_int8_schedule(cfg, s, C, O, do_parallel=True): + """Schedule dense compute using avx512 or lower instructions + including VNNI vpdpbusd instruction if possible""" # C: The output of GEMM # O: The output of the fused op def split_y(out): diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 750987e97acf..da44d80cbace 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -846,6 +846,7 @@ def test_dense_amx_int8(): np.testing.assert_equal(out, ref) +# TODO(vvchernov): join duplicated test code for cascadelake and skylake @tvm.testing.requires_skylake_avx512 @pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) def test_dense_skylake_avx512(m, n, k): diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_vnni_integration.py index 3bbe916472f5..c37c1da9250c 100644 --- a/tests/python/unittest/test_meta_schedule_vnni_integration.py +++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py @@ -47,7 +47,7 @@ def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool: if dense_block is None: assert has_block(sch, "compute") dense_block = sch.get_block("compute") - assert 
"dense_vnni" in sch.get(dense_block).annotations["schedule_rule"] + assert "dense_int8" in sch.get(dense_block).annotations["schedule_rule"] post_blocks = sch.get_consumers(dense_block) if len(post_blocks) > 0: From 0a393b8fd36ec0f4ec42c3dbedff063072f56d0d Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 21 Dec 2022 16:19:10 +0300 Subject: [PATCH 19/84] update for batch_matmul with avx512 --- python/tvm/topi/x86/batch_matmul.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 6d00cc1ba98c..95408a924f28 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -27,10 +27,10 @@ from ..utils import get_const_tuple, get_max_power2_factor, traverse_inline from .dense import dense_int8_schedule, dense_amx_int8_schedule from .injective import schedule_injective_from_existing -from .utils import target_has_vnni, target_has_amx +from .utils import target_has_avx512, target_has_amx -@autotvm.register_topi_compute("batch_matmul_vnni.x86") +@autotvm.register_topi_compute("batch_matmul_int8.x86") def batch_matmul_int8_compute(cfg, x, y, *_): """Compute for uint8 x int8 -> int32 batch_matmul""" batch, m, k = x.shape @@ -39,8 +39,8 @@ def batch_matmul_int8_compute(cfg, x, y, *_): _, n_o, _, n_i, _ = packed_y.shape ak = te.reduce_axis((0, k), name="k") mcpu = tvm.target.Target.current().mcpu - if target_has_vnni(mcpu): - attrs_info = {"schedule_rule": "batch_matmul_vnni"} + if target_has_avx512(mcpu): + attrs_info = {"schedule_rule": "batch_matmul_int8"} else: attrs_info = None @@ -60,8 +60,9 @@ def batch_matmul_int8_compute(cfg, x, y, *_): return z -def batch_matmul_vnni_schedule(cfg, s, C, O, layout_trans): - """Schedule batch_matmul compute using VNNI vpdpbusd instruction""" +def batch_matmul_int8_schedule(cfg, s, C, O, layout_trans): + """Schedule batch_matmul compute using avx512 or lower instructions + including VNNI vpdpbusd instruction if possible""" # C: The output of batched GEMM # O: The output of the fused op @@ -228,9 +229,9 @@ def _callback(op): return s -@autotvm.register_topi_schedule("batch_matmul_vnni.x86") +@autotvm.register_topi_schedule("batch_matmul_int8.x86") def schedule_batch_matmul_int8(cfg, outs): - """Schedule for batch_matmul_vnni""" + """Schedule for batch_matmul_int8""" s = te.create_schedule([x.op for x in outs]) mcpu = tvm.target.Target.current().mcpu @@ -239,8 +240,8 @@ def _callback(op): layout_trans = op.input_tensors[1] if target_has_amx(mcpu): batch_matmul_amx_schedule(cfg, s, op.output(0), outs[0], layout_trans) - elif target_has_vnni(mcpu): - batch_matmul_vnni_schedule(cfg, s, op.output(0), outs[0], layout_trans) + elif target_has_avx512(mcpu): + batch_matmul_int8_schedule(cfg, s, op.output(0), outs[0], layout_trans) traverse_inline(s, outs[0].op, _callback) return s From 2029f836afe7eeb9ecf081226e8c1f8fc9bed648 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 22 Dec 2022 15:27:17 +0300 Subject: [PATCH 20/84] extend space generator init for avx512. 
Add Default AVX512 schedule rules --- include/tvm/meta_schedule/schedule_rule.h | 2 + .../schedule_rule/schedule_rule.cc | 45 +++++++++++++++++++ .../space_generator/space_generator.cc | 13 +++++- 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 16202e18bf95..9b8d6c64ac1c 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -292,6 +292,8 @@ class ScheduleRule : public runtime::ObjectRef { TVM_DLL static Array DefaultLLVM(); /*! \brief Create default schedule rules for x86 VNNI */ TVM_DLL static Array DefaultVNNI(); + /*! \brief Create default schedule rules for x86 AVX512 */ + TVM_DLL static Array DefaultAVX512(); /*! \brief Create default schedule rules for CUDA */ TVM_DLL static Array DefaultCUDA(); /*! \brief Create default postprocessors for CUDA with TensorCore */ diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index 113703272031..93bc3adf1b6f 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -130,6 +130,51 @@ Array ScheduleRule::DefaultVNNI() { }; } +Array ScheduleRule::DefaultAVX512() { + return { + ScheduleRule::ApplyCustomRule(), + ScheduleRule::InlineConstantScalars(), + ScheduleRule::AutoInline( + /*into_producer=*/false, + /*into_consumer=*/true, + /*inline_const_tensor=*/true, + /*disallow_if_then_else=*/true, + /*require_injective=*/true, + /*require_ordered=*/true, + /*disallow_op=*/Array{"tir.exp"}), + ScheduleRule::AddRFactor( + /*max_jobs_per_core=*/16, + /*max_innermost_factor=*/Integer(64)), + ScheduleRule::MultiLevelTilingWithIntrin( + /*intrin_name=*/"dot_16x4_avx512", + /*structure=*/"SSRSRS", + /*tile_binds=*/NullOpt, + /*max_innermost_factor=*/Integer(64), + /*vector_load_lens=*/NullOpt, + /*reuse_read=*/NullOpt, + /*reuse_write=*/ + Map{{"req", String("may")}, + {"levels", Array{1, 2}}, + {"scope", String("global")}}), + ScheduleRule::MultiLevelTiling( + /*structure=*/"SSRSRS", + /*tile_binds=*/NullOpt, + /*max_innermost_factor=*/Integer(64), + /*vector_load_lens=*/NullOpt, + /*reuse_read=*/NullOpt, + /*reuse_write=*/ + Map{{"req", String("may")}, + {"levels", Array{1, 2}}, + {"scope", String("global")}}), + ScheduleRule::ParallelizeVectorizeUnroll( + /*max_jobs_per_core=*/16, + /*max_vectorize_extent=*/64, + /*unroll_max_steps=*/Array{0, 16, 64, 512}, + /*unroll_explicit=*/true), + ScheduleRule::RandomComputeLocation(), + }; +} + Array ScheduleRule::DefaultCUDA() { return { ScheduleRule::ApplyCustomRule(), diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 9ed3e7c81fc1..abf807826151 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -23,13 +23,20 @@ namespace meta_schedule { String GetRuleKindFromTarget(const Target& target) { if (target->kind->name == "llvm") { - // TODO(vvchernov): possibly need check target_has_avx512 static const PackedFunc* f_check_vnni = runtime::Registry::Get("tvm.topi.x86.utils.target_has_vnni"); ICHECK(f_check_vnni != nullptr) << "The `target_has_vnni` func is not in tvm registry."; if (target->GetAttr("mcpu") && (*f_check_vnni)(target->GetAttr("mcpu").value())) { return "vnni"; + } else { + static const PackedFunc* f_check_avx512 = + runtime::Registry::Get("tvm.topi.x86.utils.target_has_avx512"); + 
ICHECK(f_check_avx512 != nullptr) << "The `target_has_avx512` func is not in tvm registry."; + if (target->GetAttr("mcpu") && + (*f_check_avx512)(target->GetAttr("mcpu").value())) { + return "avx512"; + } } return "llvm"; } @@ -96,6 +103,10 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { default_sch_rules = ScheduleRule::DefaultVNNI(); default_postprocs = Postproc::DefaultVNNI(); default_mutator_probs = Mutator::DefaultVNNI(); + } else if (kind == "avx512") { + default_sch_rules = ScheduleRule::DefaultAVX512(); + default_postprocs = Postproc::DefaultVNNI(); + default_mutator_probs = Mutator::DefaultVNNI(); } else if (kind == "c") { default_sch_rules = ScheduleRule::DefaultMicro(); default_postprocs = Postproc::DefaultMicro(); From 410c87b5888e72ca7bc1936d6f69cc0f99ca8af4 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 00:03:19 +0300 Subject: [PATCH 21/84] avx512 dot 16x4 intrin was implemented for MS default schedule rule --- python/tvm/tir/tensor_intrin/x86.py | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index d93167f9e614..7db0c39df478 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -67,8 +67,61 @@ def dot_product_16x4_u8i8i32_vnni( ) +@T.prim_func +def dot_product_16x4_u8i8i32_avx512( + A: T.Buffer((4,), "uint8", offset_factor=1), + B: T.Buffer((16, 4), "int8", offset_factor=1), + C: T.Buffer((16,), "int32", offset_factor=1), +) -> None: + with T.block("root"): + T.reads(C[0:16], A[0:4], B[0:16, 0:4]) + T.writes(C[0:16]) + + A_u8x4 = A.vload([0], "uint8x4") + A_i32 = T.reinterpret(A_u8x4, dtype="int32") + A_brdcst = T.broadcast(A_i32, 16) + A_u8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") + + B_i8x64 = B.vload([0, 0], dtype="int8x64") + B_i32x16 = T.reinterpret(B_i8x64, dtype="int32x16") + C_i32x16 = C.vload([0], dtype="int32x16") + + C[T.ramp(T.int32(0), 1, 16)] = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512"), + T.uint32(0), + C_i32x16, + T.broadcast(A_i32, 16), + B_i32x16, + dtype="int32x16", + ) + + Red = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512"), + T.uint32(0), + A_u8x64, + B_i8x64, + dtype="int16x32", + ) + + One = T.const(1, "int16x32") + + C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), + T.uint32(0), + Red, + One, + dtype="int32x16", + ) + + VNNI_DOT_16x4_INTRIN = "dot_16x4_vnni" TensorIntrin.register( VNNI_DOT_16x4_INTRIN, dot_product_16x4_u8i8i32_desc, dot_product_16x4_u8i8i32_vnni ) + +AVX512_DOT_16x4_INTRIN = "dot_16x4_avx512" + +TensorIntrin.register( + AVX512_DOT_16x4_INTRIN, dot_product_16x4_u8i8i32_desc, dot_product_16x4_u8i8i32_avx512 +) From a23198f640df26461c123d0c73ee0320da5b91fc Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 00:17:31 +0300 Subject: [PATCH 22/84] small fix --- python/tvm/tir/tensor_intrin/x86.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index 7db0c39df478..e758f3889add 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -103,7 +103,7 @@ def dot_product_16x4_u8i8i32_avx512( dtype="int16x32", ) - One = T.const(1, "int16x32") + One = T.allocate_const([1], "int16x32", [1]) C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( 
T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), From 1fa84f4f19724ee1885478d8d99f5ec6ef3291ad Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 00:38:53 +0300 Subject: [PATCH 23/84] update --- python/tvm/tir/tensor_intrin/x86.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index e758f3889add..be2eec086c13 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -103,13 +103,16 @@ def dot_product_16x4_u8i8i32_avx512( dtype="int16x32", ) - One = T.allocate_const([1], "int16x32", [1]) + One = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "int16", [32]) + One_int16x32 = T.reinterpret(One, dtype="int16x32") C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), T.uint32(0), Red, - One, + One_int16x32, dtype="int32x16", ) From c3c15d2f3844b448a9cff741bf7c399f79b9a8fc Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 08:56:22 +0300 Subject: [PATCH 24/84] pylint fixes --- python/tvm/tir/tensor_intrin/x86.py | 5 ++--- src/meta_schedule/space_generator/space_generator.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index be2eec086c13..bdb5221538a9 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -103,9 +103,8 @@ def dot_product_16x4_u8i8i32_avx512( dtype="int16x32", ) - One = T.allocate_const([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - "int16", [32]) + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + One = T.allocate_const(data, "int16", [32]) One_int16x32 = T.reinterpret(One, dtype="int16x32") C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index abf807826151..b580a6ab4726 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -31,7 +31,7 @@ String GetRuleKindFromTarget(const Target& target) { return "vnni"; } else { static const PackedFunc* f_check_avx512 = - runtime::Registry::Get("tvm.topi.x86.utils.target_has_avx512"); + runtime::Registry::Get("tvm.topi.x86.utils.target_has_avx512"); ICHECK(f_check_avx512 != nullptr) << "The `target_has_avx512` func is not in tvm registry."; if (target->GetAttr("mcpu") && (*f_check_avx512)(target->GetAttr("mcpu").value())) { From 582caa9fb8ffa753214d753f8ca20b94c28ba4e0 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 09:00:55 +0300 Subject: [PATCH 25/84] test workaround for const alloc in tir --- python/tvm/tir/tensor_intrin/x86.py | 8 +++++--- python/tvm/topi/x86/tensor_intrin.py | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index bdb5221538a9..12d731afb534 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=invalid-name,missing-function-docstring """Intrinsics for x86 tensorization.""" +import tvm from tvm.script import tir as T from .. 
import TensorIntrin @@ -103,9 +104,10 @@ def dot_product_16x4_u8i8i32_avx512( dtype="int16x32", ) - data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - One = T.allocate_const(data, "int16", [32]) - One_int16x32 = T.reinterpret(One, dtype="int16x32") + One_int16x32 = tvm.tir.call_intrin("int16x32", "tir.const", 1) + # data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + # One = T.allocate_const(data, "int16", [32]) + # One_int16x32 = T.reinterpret(One, dtype="int16x32") C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py index 4d4fd74ce183..3b83fecbf552 100644 --- a/python/tvm/topi/x86/tensor_intrin.py +++ b/python/tvm/topi/x86/tensor_intrin.py @@ -318,7 +318,6 @@ def _instr(index): else: # Fall back to the normal AVX512 vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32) vec_one = tvm.tir.const(1, "int16x32") - # TODO(vvchernov): vpmaddwd? pair_reduction = tvm.tir.call_llvm_pure_intrin( "int16x32", "llvm.x86.avx512.pmaddubs.w.512", @@ -326,7 +325,6 @@ def _instr(index): vec_a, vec_b, ) - # TODO(vvchernov): vpaddd? quad_reduction = tvm.tir.call_llvm_pure_intrin( "int32x16", "llvm.x86.avx512.pmaddw.d.512", From 6279ad82dc708b0f6fb4bcd78a0f9c3c5f080ae8 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 09:08:41 +0300 Subject: [PATCH 26/84] test fix (broadcasting) --- python/tvm/topi/x86/tensor_intrin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py index 3b83fecbf552..fd17551cb2cd 100644 --- a/python/tvm/topi/x86/tensor_intrin.py +++ b/python/tvm/topi/x86/tensor_intrin.py @@ -113,7 +113,8 @@ def _instr(index): a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8) - vec_ai32 = re_int32.astype(int_32xl) + vec_ai32 = tvm.tir.call_intrin(int_32xl, "tir.broadcast", re_int32, int32_lanes) + # vec_ai32 = re_int32.astype(int_32xl) vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32) vec_b = ins[1].vload([0, 0], int_8xl) vec_one = tvm.tir.const(1, int_lx32) From 5d012fe696bd455933e8819edefde5199969a13b Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 09:47:53 +0300 Subject: [PATCH 27/84] remove excess instructions from dot_product_16x4_u8i8i32_avx512 --- python/tvm/tir/tensor_intrin/x86.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index 12d731afb534..90e7a8d6a980 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -84,17 +84,6 @@ def dot_product_16x4_u8i8i32_avx512( A_u8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") B_i8x64 = B.vload([0, 0], dtype="int8x64") - B_i32x16 = T.reinterpret(B_i8x64, dtype="int32x16") - C_i32x16 = C.vload([0], dtype="int32x16") - - C[T.ramp(T.int32(0), 1, 16)] = T.call_llvm_pure_intrin( - T.llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512"), - T.uint32(0), - C_i32x16, - T.broadcast(A_i32, 16), - B_i32x16, - dtype="int32x16", - ) Red = T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512"), @@ -104,10 +93,8 @@ def dot_product_16x4_u8i8i32_avx512( dtype="int16x32", ) - One_int16x32 = tvm.tir.call_intrin("int16x32", 
"tir.const", 1) - # data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - # One = T.allocate_const(data, "int16", [32]) - # One_int16x32 = T.reinterpret(One, dtype="int16x32") + One = T.allocate_const([1], "int16", [1]) + One_int16x32 = T.broadcast(One, 32) C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), From db282f0df9e33441e683fc6756333b936d3d612b Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 10:03:48 +0300 Subject: [PATCH 28/84] pylint fix --- python/tvm/tir/tensor_intrin/x86.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index 90e7a8d6a980..87b50e4f3a47 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name,missing-function-docstring """Intrinsics for x86 tensorization.""" -import tvm from tvm.script import tir as T from .. import TensorIntrin From 40f8211af1a51449e9cf2480fb51b24750c7db87 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 10:21:57 +0300 Subject: [PATCH 29/84] skip asm check for askew weight shapes --- tests/python/relay/test_op_level1.py | 7 ++++--- tests/python/relay/test_op_level10.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index da44d80cbace..d3493af2dfae 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -865,9 +865,10 @@ def test_dense_skylake_avx512(m, n, k): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - asm = lib.lib.get_source("asm") - assert "pmaddubs" in asm - assert "pmaddw" in asm + if n%16 == 0 and k%4 == 0: + asm = lib.lib.get_source("asm") + assert "pmaddubs" in asm + assert "pmaddw" in asm dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 616ca9baf87e..b32ac765f8dc 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -602,9 +602,10 @@ def test_batch_matmul_skylake_avx512(b, m, n, k): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - asm = lib.lib.get_source("asm") - assert "pmaddubs" in asm - assert "pmaddw" in asm + if n%16 == 0 and k%4 == 0: + asm = lib.lib.get_source("asm") + assert "pmaddubs" in asm + assert "pmaddw" in asm dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) From 5d393e55d1dbca8c50ce81a998a474d78174c7e1 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 10:40:26 +0300 Subject: [PATCH 30/84] fix pylint --- tests/python/relay/test_op_level1.py | 2 +- tests/python/relay/test_op_level10.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index d3493af2dfae..9a9e0a4dc4de 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -865,7 +865,7 @@ def test_dense_skylake_avx512(m, n, k): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - if n%16 == 0 and k%4 == 0: + if n % 16 == 0 and k % 4 == 0: asm = lib.lib.get_source("asm") assert "pmaddubs" in asm assert 
"pmaddw" in asm diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index b32ac765f8dc..23e72ef01bd6 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -602,7 +602,7 @@ def test_batch_matmul_skylake_avx512(b, m, n, k): with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - if n%16 == 0 and k%4 == 0: + if n % 16 == 0 and k % 4 == 0: asm = lib.lib.get_source("asm") assert "pmaddubs" in asm assert "pmaddw" in asm From bd9fd2ef2e2e88b749d7947cba80d0cf2294a808 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 11:59:15 +0300 Subject: [PATCH 31/84] revert test fix --- python/tvm/topi/x86/tensor_intrin.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/topi/x86/tensor_intrin.py b/python/tvm/topi/x86/tensor_intrin.py index fd17551cb2cd..3b83fecbf552 100644 --- a/python/tvm/topi/x86/tensor_intrin.py +++ b/python/tvm/topi/x86/tensor_intrin.py @@ -113,8 +113,7 @@ def _instr(index): a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8) - vec_ai32 = tvm.tir.call_intrin(int_32xl, "tir.broadcast", re_int32, int32_lanes) - # vec_ai32 = re_int32.astype(int_32xl) + vec_ai32 = re_int32.astype(int_32xl) vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32) vec_b = ins[1].vload([0, 0], int_8xl) vec_one = tvm.tir.const(1, int_lx32) From 07666ecad40e0c1a85dbf196e3642a8bf11f0098 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 14:37:43 +0300 Subject: [PATCH 32/84] set number of args --- python/tvm/tir/tensor_intrin/x86.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index 87b50e4f3a47..e312c4516c14 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -86,7 +86,7 @@ def dot_product_16x4_u8i8i32_avx512( Red = T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512"), - T.uint32(0), + T.uint32(2), A_u8x64, B_i8x64, dtype="int16x32", @@ -97,7 +97,7 @@ def dot_product_16x4_u8i8i32_avx512( C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), - T.uint32(0), + T.uint32(2), Red, One_int16x32, dtype="int32x16", From 76a5e7e02527e4b90ebcb887993f1d9c9469edc5 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 15:28:10 +0300 Subject: [PATCH 33/84] test fix --- python/tvm/tir/tensor_intrin/x86.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index e312c4516c14..fc4d0b512e53 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -67,12 +67,12 @@ def dot_product_16x4_u8i8i32_vnni( ) +mem_scope="global" @T.prim_func -def dot_product_16x4_u8i8i32_avx512( - A: T.Buffer((4,), "uint8", offset_factor=1), - B: T.Buffer((16, 4), "int8", offset_factor=1), - C: T.Buffer((16,), "int32", offset_factor=1), -) -> None: +def dot_product_16x4_u8i8i32_avx512(a: T.handle, b: T.handle, c: T.handle,) -> None: + A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) + B = T.match_buffer(b, (16, 4), "int8", offset_factor=1, scope=mem_scope) + C = T.match_buffer(c, (16,), "int32", offset_factor=1, scope=mem_scope) with T.block("root"): T.reads(C[0:16], A[0:4], B[0:16, 0:4]) T.writes(C[0:16]) From 
c6548dbd98af582bda88508ff1adebeb11b7077a Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 23 Dec 2022 20:30:44 +0300 Subject: [PATCH 34/84] fix const allocation in tir for avx512 dot 16x4 --- python/tvm/tir/tensor_intrin/x86.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index fc4d0b512e53..cff8ca1ff63b 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=invalid-name,missing-function-docstring """Intrinsics for x86 tensorization.""" +import tvm from tvm.script import tir as T from .. import TensorIntrin @@ -92,8 +93,7 @@ def dot_product_16x4_u8i8i32_avx512(a: T.handle, b: T.handle, c: T.handle,) -> N dtype="int16x32", ) - One = T.allocate_const([1], "int16", [1]) - One_int16x32 = T.broadcast(One, 32) + One_int16x32 = tvm.tir.const(1, "int16x32") C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), From b1889dfc6471123c3c8823d4ec753894d94c3fae Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 26 Dec 2022 08:41:17 +0300 Subject: [PATCH 35/84] fix signature of dot_product_16x4_u8i8i32_avx512 --- python/tvm/tir/tensor_intrin/x86.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index cff8ca1ff63b..1606690cbac6 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -68,12 +68,12 @@ def dot_product_16x4_u8i8i32_vnni( ) -mem_scope="global" @T.prim_func -def dot_product_16x4_u8i8i32_avx512(a: T.handle, b: T.handle, c: T.handle,) -> None: - A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=mem_scope) - B = T.match_buffer(b, (16, 4), "int8", offset_factor=1, scope=mem_scope) - C = T.match_buffer(c, (16,), "int32", offset_factor=1, scope=mem_scope) +def dot_product_16x4_u8i8i32_avx512( + A: T.Buffer((4,), "uint8", offset_factor=1), + B: T.Buffer((16, 4), "int8", offset_factor=1), + C: T.Buffer((16,), "int32", offset_factor=1), +) -> None: with T.block("root"): T.reads(C[0:16], A[0:4], B[0:16, 0:4]) T.writes(C[0:16]) From 0b890e91493373ca11f65e026919fdabb0aeba8d Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 26 Dec 2022 08:45:14 +0300 Subject: [PATCH 36/84] use script instead of tvm.tir for const allocation --- python/tvm/tir/tensor_intrin/x86.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/tvm/tir/tensor_intrin/x86.py b/python/tvm/tir/tensor_intrin/x86.py index 1606690cbac6..c527d0d21008 100644 --- a/python/tvm/tir/tensor_intrin/x86.py +++ b/python/tvm/tir/tensor_intrin/x86.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name,missing-function-docstring """Intrinsics for x86 tensorization.""" -import tvm from tvm.script import tir as T from .. 
import TensorIntrin @@ -93,13 +92,11 @@ def dot_product_16x4_u8i8i32_avx512( dtype="int16x32", ) - One_int16x32 = tvm.tir.const(1, "int16x32") - C[T.ramp(T.int32(0), 1, 16)] += T.call_llvm_pure_intrin( T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512"), T.uint32(2), Red, - One_int16x32, + T.int16x32(1), dtype="int32x16", ) From a49963d404d1c8401cb0f78b4b2c1c58a521cdf9 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 08:18:27 +0300 Subject: [PATCH 37/84] extend auto tensorize test by skylake-avx512 target --- .../python/integration/test_auto_tensorize.py | 144 ++++++++++-------- 1 file changed, 81 insertions(+), 63 deletions(-) diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 57228562221f..8ef92bdb5368 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -29,52 +29,58 @@ from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN -SCH_RULES_FOR_VNNI = [ - ms.schedule_rule.ApplyCustomRule(), - ms.schedule_rule.AutoInline( - into_producer=False, - into_consumer=True, - inline_const_tensor=True, - disallow_if_then_else=True, - require_injective=True, - require_ordered=True, - disallow_op=["tir.exp"], - ), - ms.schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64), - ms.schedule_rule.MultiLevelTilingWithIntrin( - VNNI_INTRIN, - structure="SSRSRS", - tile_binds=None, - max_innermost_factor=64, - vector_load_lens=None, - reuse_read=None, - reuse_write=ms.schedule_rule.ReuseType( - req="may", - levels=[1, 2], - scope="global", + +def _get_schedule_rules_for_x86(intrin): + return [ + ms.schedule_rule.ApplyCustomRule(), + ms.schedule_rule.AutoInline( + into_producer=False, + into_consumer=True, + inline_const_tensor=True, + disallow_if_then_else=True, + require_injective=True, + require_ordered=True, + disallow_op=["tir.exp"], ), - ), - ms.schedule_rule.MultiLevelTiling( - structure="SSRSRS", - tile_binds=None, - max_innermost_factor=64, - vector_load_lens=None, - reuse_read=None, - reuse_write=ms.schedule_rule.ReuseType( - req="may", - levels=[1, 2], - scope="global", + ms.schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64), + ms.schedule_rule.MultiLevelTilingWithIntrin( + intrin, + structure="SSRSRS", + tile_binds=None, + max_innermost_factor=64, + vector_load_lens=None, + reuse_read=None, + reuse_write=ms.schedule_rule.ReuseType( + req="may", + levels=[1, 2], + scope="global", + ), ), - ), - ms.schedule_rule.ParallelizeVectorizeUnroll( - max_jobs_per_core=16, - max_vectorize_extent=64, - unroll_max_steps=[0, 16, 64, 512], - unroll_explicit=True, - ), - ms.schedule_rule.RandomComputeLocation(), -] + ms.schedule_rule.MultiLevelTiling( + structure="SSRSRS", + tile_binds=None, + max_innermost_factor=64, + vector_load_lens=None, + reuse_read=None, + reuse_write=ms.schedule_rule.ReuseType( + req="may", + levels=[1, 2], + scope="global", + ), + ), + ms.schedule_rule.ParallelizeVectorizeUnroll( + max_jobs_per_core=16, + max_vectorize_extent=64, + unroll_max_steps=[0, 16, 64, 512], + unroll_explicit=True, + ), + ms.schedule_rule.RandomComputeLocation(), + ] + +SCH_RULES_FOR_VNNI = _get_schedule_rules_for_x86(VNNI_INTRIN) +SCH_RULES_FOR_AVX512 = _get_schedule_rules_for_x86(AVX512_INTRIN) def 
_get_sch_rules_for_dp4a(intrin): @@ -279,19 +285,21 @@ def _test_bert_int8(relay_mod, params, input_info, target, sch_rules, postprocs) @tvm.testing.requires_cascadelake def test_vnni_dense(): _test_dense( - "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", + SCH_RULES_FOR_VNNI, + POSTPROCS_FOR_VNNI, + "llvm -mcpu=cascadelake -num-cores 4" ) -# TODO(vvchernov): need schedule rules and postprocs for avx512 -# @tvm.testing.requires_skylake_avx512 -# def test_avx512_dense(): -# _test_dense( -# "uint8", -# SCH_RULES_FOR_AVX512, -# POSTPROCS_FOR_AVX512, -# "llvm -mcpu=skylake-avx512 -num-cores 4" -# ) +@tvm.testing.requires_skylake_avx512 +def test_avx512_dense(): + _test_dense( + "uint8", + SCH_RULES_FOR_AVX512, + POSTPROCS_FOR_VNNI, + "llvm -mcpu=skylake-avx512 -num-cores 4" + ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @@ -314,15 +322,14 @@ def test_vnni_conv2d(): ) -# TODO(vvchernov): need schedule rules and postprocs for avx512 -# @tvm.testing.requires_skylake_avx512 -# def test_avx512_conv2d(): -# _test_conv2d( -# "uint8", -# SCH_RULES_FOR_AVX512, -# POSTPROCS_FOR_AVX512, -# "llvm -mcpu=skylake-avx512 -num-cores 4" -# ) +@tvm.testing.requires_skylake_avx512 +def test_avx512_conv2d(): + _test_conv2d( + "uint8", + SCH_RULES_FOR_AVX512, + POSTPROCS_FOR_VNNI, + "llvm -mcpu=skylake-avx512 -num-cores 4" + ) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @@ -352,7 +359,18 @@ def test_vnni_bert_int8(): ) -# TODO(vvchernov): check BERT on skylake-avx512? +@tvm.testing.requires_skylake_avx512 +@pytest.mark.skip_if(tvm.testing.IS_IN_CI, reason="Slow on CI") +def test_avx512_bert_int8(): + relay_mod, params, input_info = load_quantized_bert_base() + _test_bert_int8( + relay_mod, + params, + input_info, + "llvm -mcpu=skylake-avx512 -num-cores 4", + SCH_RULES_FOR_AVX512, + POSTPROCS_FOR_VNNI, + ) @tvm.testing.requires_gpu From 9bd9df1b56e6d99596b99e420ef1e9030e9faf0c Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 08:22:28 +0300 Subject: [PATCH 38/84] clean code --- tests/python/integration/test_auto_tensorize.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 8ef92bdb5368..ad94a951ff76 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -32,6 +32,9 @@ from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN +CASCADELAKE_VNNI_TARGET = "llvm -mcpu=cascadelake -num-cores 4" +SKYLAKE_AVX512_TARGET = "llvm -mcpu=skylake-avx512 -num-cores 4" + def _get_schedule_rules_for_x86(intrin): return [ ms.schedule_rule.ApplyCustomRule(), @@ -288,7 +291,7 @@ def test_vnni_dense(): "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, - "llvm -mcpu=cascadelake -num-cores 4" + CASCADELAKE_VNNI_TARGET ) @@ -298,7 +301,7 @@ def test_avx512_dense(): "uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, - "llvm -mcpu=skylake-avx512 -num-cores 4" + SKYLAKE_AVX512_TARGET ) @@ -318,7 +321,7 @@ def test_dp4a_dense(): @tvm.testing.requires_cascadelake def test_vnni_conv2d(): _test_conv2d( - "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, "llvm -mcpu=cascadelake -num-cores 4" + "uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET ) @@ -328,7 +331,7 @@ def test_avx512_conv2d(): "uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, - "llvm 
-mcpu=skylake-avx512 -num-cores 4" + SKYLAKE_AVX512_TARGET ) @@ -353,7 +356,7 @@ def test_vnni_bert_int8(): relay_mod, params, input_info, - "llvm -mcpu=cascadelake -num-cores 4", + CASCADELAKE_VNNI_TARGET, SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, ) @@ -367,7 +370,7 @@ def test_avx512_bert_int8(): relay_mod, params, input_info, - "llvm -mcpu=skylake-avx512 -num-cores 4", + SKYLAKE_AVX512_TARGET, SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, ) From dbca3090f08d15b996af78d026218d082c76f908 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 08:35:32 +0300 Subject: [PATCH 39/84] update test_op_level1, resolve TODO --- tests/python/relay/test_op_level1.py | 57 +++++++--------------------- 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 9a9e0a4dc4de..1824ce79f6ab 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -760,9 +760,7 @@ def test_bitserial_dense(): assert yy.checked_type == relay.TensorType((m, 32), "int16") -@tvm.testing.requires_cascadelake -@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) -def test_dense_vnni(m, n, k): +def dense_x86_test(m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd"]): data_shape = (m, k) weight_shape = (n, k) @@ -774,14 +772,15 @@ def test_dense_vnni(m, n, k): out = relay.nn.bias_add(dense, bias) mod = tvm.IRModule.from_expr(out) - target = "llvm -mcpu=cascadelake" with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - asm = lib.lib.get_source("asm") - assert "vpdpbusd" in asm + # TODO(vvchernov): needs for avx512 arch, can be extended + if n % 16 == 0 and k % 4 == 0: + asm = lib.lib.get_source("asm") + for intrin in intrins: + assert intrin in asm - dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype) @@ -846,46 +845,16 @@ def test_dense_amx_int8(): np.testing.assert_equal(out, ref) -# TODO(vvchernov): join duplicated test code for cascadelake and skylake -@tvm.testing.requires_skylake_avx512 +@tvm.testing.requires_cascadelake @pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) -def test_dense_skylake_avx512(m, n, k): - data_shape = (m, k) - weight_shape = (n, k) - - for data_dtype in ["uint8", "int8"]: - data = relay.var("data", shape=data_shape, dtype=data_dtype) - weight = relay.var("weight", shape=weight_shape, dtype="int8") - bias = relay.var("bias", shape=(weight_shape[0],), dtype="int32") - dense = relay.nn.dense(data, weight, out_dtype="int32") - out = relay.nn.bias_add(dense, bias) - mod = tvm.IRModule.from_expr(out) - - target = "llvm -mcpu=skylake-avx512" - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target) - - if n % 16 == 0 and k % 4 == 0: - asm = lib.lib.get_source("asm") - assert "pmaddubs" in asm - assert "pmaddw" in asm - - dev = tvm.device(target, 0) - runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - - a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype) - b = np.random.uniform(1, 10, size=weight_shape).astype("int8") - c = np.random.uniform(1, 10, size=(weight_shape[0],)).astype("int32") - - runtime.set_input("data", a) - runtime.set_input("weight", b) - runtime.set_input("bias", c) - runtime.run() +def test_dense_vnni(m, n, k): + dense_x86_test(m, n, k) - out = runtime.get_output(0).numpy() - ref = np.dot(a.astype("int32"), 
b.transpose().astype("int32")) + c - np.testing.assert_equal(out, ref) +@tvm.testing.requires_skylake_avx512 +@pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) +def test_dense_skylake_avx512(m, n, k): + dense_x86_test(m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"]) @pytest.mark.skip("Requires GFX10 AMDGPU") From 0e78ee81062bedee1d4979f9776c0eaa611c3aee Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 08:38:51 +0300 Subject: [PATCH 40/84] small update test_op_level2 --- tests/python/relay/test_op_level2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index c5568ef6b89e..c0dfe562c112 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2222,7 +2222,7 @@ def test_conv2d_int8_alter_dtype_vnni(): @tvm.testing.requires_skylake_avx512 def test_conv2d_int8_alter_dtype_avx512(): - _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw"]) + _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"]) if __name__ == "__main__": From 99e8d4649f70508349dd5b05bae2c9c7ea3e0986 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 08:47:48 +0300 Subject: [PATCH 41/84] update test_op_level10, resolve TODO --- tests/python/relay/test_op_level10.py | 72 ++++++++------------------- 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 23e72ef01bd6..881b511af676 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -473,16 +473,7 @@ def test_batch_matmul(executor_kind): verify_batch_matmul_with_inputs(executor_kind, x, x, x_np, x_np, (10, 27, 27)) -@tvm.testing.requires_cascadelake -@pytest.mark.parametrize( - "b,m,n,k", - [ - (16, 32, 128, 96), - (16, 32, 128, 97), - (16, 32, 129, 96), - ], -) -def test_batch_matmul_vnni(b, m, n, k): +def batch_matmul_x86_test(b, m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd"]): x_shape = (b, m, k) y_shape = (b, n, k) z_shape = (b, m, n) @@ -495,14 +486,15 @@ def test_batch_matmul_vnni(b, m, n, k): out = bmm + z mod = tvm.IRModule.from_expr(out) - target = "llvm -mcpu=cascadelake" with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target=target) - asm = lib.lib.get_source("asm") - assert "vpdpbusd" in asm + # TODO(vvchernov): needs for avx512 arch, can be extended + if n % 16 == 0 and k % 4 == 0: + asm = lib.lib.get_source("asm") + for intrin in intrins: + assert intrin in asm - dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype) @@ -575,7 +567,7 @@ def test_batch_matmul_amx(b, m, n, k): np.testing.assert_equal(out, ref) -@tvm.testing.requires_skylake_avx512 +@tvm.testing.requires_cascadelake @pytest.mark.parametrize( "b,m,n,k", [ @@ -584,45 +576,21 @@ def test_batch_matmul_amx(b, m, n, k): (16, 32, 129, 96), ], ) -def test_batch_matmul_skylake_avx512(b, m, n, k): - x_shape = (b, m, k) - y_shape = (b, n, k) - z_shape = (b, m, n) - - # TODO(vvchernov): join duplicate code with cascadelake - for lhs_dtype in ["uint8", "int8"]: - x = relay.var("x", shape=x_shape, dtype=lhs_dtype) - y = relay.var("y", shape=y_shape, dtype="int8") - z = relay.var("z", shape=z_shape, dtype="int32") - bmm = relay.nn.batch_matmul(x, y, 
out_dtype="int32") - out = bmm + z - mod = tvm.IRModule.from_expr(out) - - target = "llvm -mcpu=skylake-avx512" - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target) - - if n % 16 == 0 and k % 4 == 0: - asm = lib.lib.get_source("asm") - assert "pmaddubs" in asm - assert "pmaddw" in asm - - dev = tvm.device(target, 0) - runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - - x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype) - y_np = np.random.uniform(1, 10, size=y_shape).astype("int8") - z_np = np.random.uniform(1, 10, size=z_shape).astype("int32") - - runtime.set_input("x", x_np) - runtime.set_input("y", y_np) - runtime.set_input("z", z_np) - runtime.run() +def test_batch_matmul_vnni(b, m, n, k): + batch_matmul_x86_test(b, m, n, k) - out = runtime.get_output(0).numpy() - ref = tvm.topi.testing.batch_matmul(x_np, y_np, out_dtype="int32") + z_np - np.testing.assert_equal(out, ref) +@tvm.testing.requires_skylake_avx512 +@pytest.mark.parametrize( + "b,m,n,k", + [ + (16, 32, 128, 96), + (16, 32, 128, 97), + (16, 32, 129, 96), + ], +) +def test_batch_matmul_skylake_avx512(b, m, n, k): + batch_matmul_x86_test(b, m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"]) @pytest.mark.skip("Requires GFX10 AMDGPU") From bfe7424a787ec8a691bac56ad3dc4ecbd8fe203d Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 09:00:07 +0300 Subject: [PATCH 42/84] update qnn legalize pass test, resolve TODOs --- tests/python/relay/test_pass_qnn_legalize.py | 28 ++++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index 9272407f819d..c64b30a2128b 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -136,12 +136,12 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check that Intel VNNI gets picked up. - # TODO(vvchernov): VNNI is not supported by skylake, cascadelake only - with tvm.target.Target("llvm -mcpu=skylake-avx512"): - mod = relay.transform.InferType()(mod) - legalized_mod = relay.qnn.transform.Legalize()(mod) - assert "cast" in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() + # Check that Intel AVX512 (with or w/o VNNI) gets picked up. + for target in ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"]: + with tvm.target.Target(target): + mod = relay.transform.InferType()(mod) + legalized_mod = relay.qnn.transform.Legalize()(mod) + assert "cast" in legalized_mod.astext() and "qnn.conv2d" in legalized_mod.astext() # Since same dtype, there should not be any transformation with tvm.target.Target( @@ -168,7 +168,7 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check no transformation for Intel VNNI. + # Check no transformation for Intel AVX512. 
with tvm.target.Target("llvm -mcpu=skylake-avx512"): mod = relay.transform.InferType()(mod) legalized_mod = relay.qnn.transform.Legalize()(mod) @@ -230,12 +230,12 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check that Intel VNNI gets picked up. - # TODO(vvchernov): VNNI is not supported by skylake, cascadelake only - with tvm.target.Target("llvm -mcpu=skylake-avx512"): - mod = relay.transform.InferType()(mod) - legalized_mod = relay.qnn.transform.Legalize()(mod) - assert "cast" in legalized_mod.astext() and "qnn.dense" in legalized_mod.astext() + # Check that Intel AVX512 (with or w/o VNNI) gets picked up. + for target in ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"]: + with tvm.target.Target(target): + mod = relay.transform.InferType()(mod) + legalized_mod = relay.qnn.transform.Legalize()(mod) + assert "cast" in legalized_mod.astext() and "qnn.dense" in legalized_mod.astext() # Since same dtype, there should not be any transformation with tvm.target.Target( @@ -262,7 +262,7 @@ def _get_mod(data_dtype, kernel_dtype): ############################################################# # Check transformations for platforms with fast Int8 support. ############################################################# - # Check no transformation for Intel VNNI. + # Check no transformation for Intel AVX512. with tvm.target.Target("llvm -mcpu=skylake-avx512"): mod = relay.transform.InferType()(mod) legalized_mod = relay.qnn.transform.Legalize()(mod) From 4211c0fd9cd545771bc73f3770608c589d8856b2 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 09:05:53 +0300 Subject: [PATCH 43/84] pylint fixes --- .../python/integration/test_auto_tensorize.py | 27 +++++-------------- tests/python/relay/test_op_level2.py | 4 ++- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index ad94a951ff76..9d29a74943d2 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ -35,6 +35,7 @@ CASCADELAKE_VNNI_TARGET = "llvm -mcpu=cascadelake -num-cores 4" SKYLAKE_AVX512_TARGET = "llvm -mcpu=skylake-avx512 -num-cores 4" + def _get_schedule_rules_for_x86(intrin): return [ ms.schedule_rule.ApplyCustomRule(), @@ -82,6 +83,7 @@ def _get_schedule_rules_for_x86(intrin): ms.schedule_rule.RandomComputeLocation(), ] + SCH_RULES_FOR_VNNI = _get_schedule_rules_for_x86(VNNI_INTRIN) SCH_RULES_FOR_AVX512 = _get_schedule_rules_for_x86(AVX512_INTRIN) @@ -287,22 +289,12 @@ def _test_bert_int8(relay_mod, params, input_info, target, sch_rules, postprocs) @tvm.testing.requires_cascadelake def test_vnni_dense(): - _test_dense( - "uint8", - SCH_RULES_FOR_VNNI, - POSTPROCS_FOR_VNNI, - CASCADELAKE_VNNI_TARGET - ) + _test_dense("uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET) @tvm.testing.requires_skylake_avx512 def test_avx512_dense(): - _test_dense( - "uint8", - SCH_RULES_FOR_AVX512, - POSTPROCS_FOR_VNNI, - SKYLAKE_AVX512_TARGET - ) + _test_dense("uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, SKYLAKE_AVX512_TARGET) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") @@ -320,19 +312,12 @@ def test_dp4a_dense(): @tvm.testing.requires_cascadelake def test_vnni_conv2d(): - _test_conv2d( - "uint8", SCH_RULES_FOR_VNNI, 
POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET - ) + _test_conv2d("uint8", SCH_RULES_FOR_VNNI, POSTPROCS_FOR_VNNI, CASCADELAKE_VNNI_TARGET) @tvm.testing.requires_skylake_avx512 def test_avx512_conv2d(): - _test_conv2d( - "uint8", - SCH_RULES_FOR_AVX512, - POSTPROCS_FOR_VNNI, - SKYLAKE_AVX512_TARGET - ) + _test_conv2d("uint8", SCH_RULES_FOR_AVX512, POSTPROCS_FOR_VNNI, SKYLAKE_AVX512_TARGET) @pytest.mark.skip("Only tested locally on sm_86 (for cuda) which is not supported by CI") diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index c0dfe562c112..1e4a11abf986 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2222,7 +2222,9 @@ def test_conv2d_int8_alter_dtype_vnni(): @tvm.testing.requires_skylake_avx512 def test_conv2d_int8_alter_dtype_avx512(): - _test_conv2d_int8_alter_dtype("int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"]) + _test_conv2d_int8_alter_dtype( + "int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"] + ) if __name__ == "__main__": From bd240521727492ab5382363ad3e724950073582c Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 13:07:36 +0300 Subject: [PATCH 44/84] update ms test for avx512 --- .../test_meta_schedule_relay_integration.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_relay_integration.py b/tests/python/unittest/test_meta_schedule_relay_integration.py index 803d6307132d..795890de083e 100644 --- a/tests/python/unittest/test_meta_schedule_relay_integration.py +++ b/tests/python/unittest/test_meta_schedule_relay_integration.py @@ -316,9 +316,8 @@ def traverse(t): assert t.task_name in expected_task_names, t.task_name -@pytest.mark.skip("Too slow on CI") -def extract_task_qbert(): - def _test(mod, params, target): +def extract_task_qbert(target, sch_rule_tag): + def _test(mod, params, target, sch_rule_tag): extracted_tasks = ms.relay_integration.extract_tasks(mod, target, params) tune_tasks = list( filter( @@ -341,11 +340,20 @@ def _test(mod, params, target): annotations = sch.get(block).annotations assert "schedule_rule" in annotations - assert "vnni" in annotations["schedule_rule"] + assert sch_rule_tag in annotations["schedule_rule"] mod, params, _ = load_quantized_bert_base(batch_size=1, seq_len=128) - # TODO(vvchernov): repeat for skylse-avx512? 
- _test(mod, params, target="llvm -mcpu=cascadelake") + _test(mod, params, target=target, sch_rule_tag=sch_rule_tag) + + +@pytest.mark.skip("Too slow on CI") +def extract_task_qbert_vnni(): + extract_task_qbert("llvm -mcpu=cascadelake", "vnni") + + +@pytest.mark.skip("Too slow on CI") +def extract_task_qbert_avx512(): + extract_task_qbert("llvm -mcpu=skylake-avx512", "avx512") @tvm.testing.skip_if_32bit(reason="Apparently the LLVM version on i386 image is too old") From 52abeb04bc0db76b0c613ce658d573a8d812f1b5 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 13:42:43 +0300 Subject: [PATCH 45/84] update more ms test for avx512 --- ..._meta_schedule_schedule_rule_mlt_intrin.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py index 3783eeb45fdb..4667626f1706 100644 --- a/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py +++ b/tests/python/unittest/test_meta_schedule_schedule_rule_mlt_intrin.py @@ -26,10 +26,10 @@ from tvm.target import Target from tvm.tir.tensor_intrin.arm_cpu import DP4A_INTRIN from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN -# TODO(vvchernov): check avx512 for skylake? -def test_vnni_conv2d_nchwc(): +def test_x86_conv2d_nchwc(intrin=VNNI_INTRIN, target="llvm -mcpu=cascadelake -num-cores=4"): @T.prim_func def conv2d_nchwc( placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], @@ -69,7 +69,7 @@ def conv2d_nchwc( # fmt: off @T.prim_func - def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + def x86_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32") for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1): @@ -87,7 +87,7 @@ def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) - T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + T.block_attr({"meta_schedule.auto_tensorize":intrin}) with T.init(): for i4_1 in T.serial(16): with T.block("conv2d_NCHWc_int8_init"): @@ -114,7 +114,7 @@ def vnni_conv2d_nchwc_0(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4] @T.prim_func - def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + def x86_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: 
T.func_attr({"global_symbol": "main", "tir.noalias": True}) conv2d_NCHWc_int8_global = T.alloc_buffer([1, 16, 56, 56, 16], dtype="int32") for i0_0, i1_0, i2_0, i3_0, i4_0_0 in T.grid(1, 8, 28, 56, 1): @@ -132,7 +132,7 @@ def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8_global[n, oc_chunk, oh, ow, 0 : 16]) - T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + T.block_attr({"meta_schedule.auto_tensorize":intrin}) with T.init(): for i4_1 in T.serial(16): with T.block("conv2d_NCHWc_int8_init"): @@ -159,7 +159,7 @@ def vnni_conv2d_nchwc_1(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac conv2d_NCHWc_int8[v0, v1, v2, v3, v4] = conv2d_NCHWc_int8_global[v0, v1, v2, v3, v4] @T.prim_func - def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: + def x86_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"]) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) for i0_0, i1_0, i2_0, i3_0, i4_0_0, i0_1, i1_1, i2_1, i3_1, i4_0_1, i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 8, 28, 56, 1, 1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1): with T.block("conv2d_NCHWc_int8_o"): @@ -175,7 +175,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ic_s_inner_o = T.axis.reduce(1, i9_0_1 + i9_0_0) T.reads(placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) - T.block_attr({"meta_schedule.auto_tensorize":"dot_16x4_vnni"}) + T.block_attr({"meta_schedule.auto_tensorize":intrin}) with T.init(): for i4_1 in T.serial(16): with T.block("conv2d_NCHWc_int8_init"): @@ -229,7 +229,6 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac ] mod = conv2d_nchwc - target = Target("llvm -mcpu=cascadelake -num-cores=4") actual = generate_design_space( kind="llvm", mod=mod, @@ -237,7 +236,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac types=None, sch_rules=[ ms.schedule_rule.MultiLevelTilingWithIntrin( - VNNI_INTRIN, + intrin, structure="SSRSRS", tile_binds=None, max_innermost_factor=64, @@ -250,7 +249,7 @@ def vnni_conv2d_nchwc_2(placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], plac check_sketches( mod, sketches=actual, - expected_mods=[vnni_conv2d_nchwc_0, vnni_conv2d_nchwc_1, vnni_conv2d_nchwc_2], + expected_mods=[x86_conv2d_nchwc_0, x86_conv2d_nchwc_1, x86_conv2d_nchwc_2], expected_decisions=[decision_0, decision_1, decision_2], ) @@ -418,7 +417,8 @@ def test_dp4a_dense_no_tensorize_2(): if __name__ == "__main__": - test_vnni_conv2d_nchwc() + test_x86_conv2d_nchwc() + test_x86_conv2d_nchwc(AVX512_INTRIN, "llvm -mcpu=skylake-avx512 -num-cores=4") test_dp4a_dense() test_dp4a_dense_no_tensorize_1() test_dp4a_dense_no_tensorize_2() From 315a9476107433dfe81c6866be91ccaabac1ba16 Mon Sep 17 
00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 16:19:15 +0300 Subject: [PATCH 46/84] try to fix i386 CI tests --- python/tvm/testing/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index fa8d1afaf7d3..19669cd60cf4 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -1029,6 +1029,9 @@ def _has_vnni(): # check avx512 intrinsic groups for SkyLake X def _has_slavx512(): + # Check LLVM support + llvm_version = tvm.target.codegen.llvm_version_major() + is_llvm_support = llvm_version >= 8 arch = platform.machine() # Only linux is supported for now. if arch == "x86_64" and sys.platform.startswith("linux"): @@ -1041,7 +1044,7 @@ def _has_slavx512(): and "avx512dq" in ctx and "avx512vl" in ctx ) - return check + return check and is_llvm_support return False From b3f4749203a4a42c1ea031ba32a6cf174723d7f4 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 17:05:14 +0300 Subject: [PATCH 47/84] fix intrin name for check --- tests/python/relay/test_op_level1.py | 2 +- tests/python/relay/test_op_level10.py | 2 +- tests/python/relay/test_op_level2.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 1824ce79f6ab..737bedc90a9d 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -854,7 +854,7 @@ def test_dense_vnni(m, n, k): @tvm.testing.requires_skylake_avx512 @pytest.mark.parametrize("m,n,k", [(32, 128, 96), (32, 128, 97)]) def test_dense_skylake_avx512(m, n, k): - dense_x86_test(m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"]) + dense_x86_test(m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"]) @pytest.mark.skip("Requires GFX10 AMDGPU") diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 881b511af676..b703468e0148 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -590,7 +590,7 @@ def test_batch_matmul_vnni(b, m, n, k): ], ) def test_batch_matmul_skylake_avx512(b, m, n, k): - batch_matmul_x86_test(b, m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"]) + batch_matmul_x86_test(b, m, n, k, "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"]) @pytest.mark.skip("Requires GFX10 AMDGPU") diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 1e4a11abf986..90276eb0c8e7 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2223,7 +2223,7 @@ def test_conv2d_int8_alter_dtype_vnni(): @tvm.testing.requires_skylake_avx512 def test_conv2d_int8_alter_dtype_avx512(): _test_conv2d_int8_alter_dtype( - "int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "pmaddd"] + "int8", "llvm -mcpu=skylake-avx512", ["pmaddubs", "pmaddw", "vpaddd"] ) From 8fc39dc322f85add2ec94d73be38e0486a3aaae5 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 17:09:26 +0300 Subject: [PATCH 48/84] skip test due to model downloading issue --- tests/python/integration/test_auto_tensorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/integration/test_auto_tensorize.py b/tests/python/integration/test_auto_tensorize.py index 9d29a74943d2..70b2b875c124 100644 --- a/tests/python/integration/test_auto_tensorize.py +++ b/tests/python/integration/test_auto_tensorize.py @@ 
-348,7 +348,7 @@ def test_vnni_bert_int8(): @tvm.testing.requires_skylake_avx512 -@pytest.mark.skip_if(tvm.testing.IS_IN_CI, reason="Slow on CI") +@pytest.mark.skip("Due to quantized BERT download issue") def test_avx512_bert_int8(): relay_mod, params, input_info = load_quantized_bert_base() _test_bert_int8( From eb97d6da1552d3bac2daa49dfbe2ec1515cedafa Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 29 Dec 2022 17:13:20 +0300 Subject: [PATCH 49/84] fix test failure --- tests/python/relay/test_op_level1.py | 1 + tests/python/relay/test_op_level10.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 737bedc90a9d..0549f4f2fbcc 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -781,6 +781,7 @@ def dense_x86_test(m, n, k, target="llvm -mcpu=cascadelake", intrins=["vpdpbusd" for intrin in intrins: assert intrin in asm + dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) a = np.random.uniform(1, 10, size=data_shape).astype(data_dtype) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index b703468e0148..ed044989ac18 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -495,6 +495,7 @@ def batch_matmul_x86_test(b, m, n, k, target="llvm -mcpu=cascadelake", intrins=[ for intrin in intrins: assert intrin in asm + dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) x_np = np.random.uniform(1, 10, size=x_shape).astype(lhs_dtype) From 3f3647623863487a5d6bca9c56aba7ef94285219 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Fri, 30 Dec 2022 11:26:24 +0300 Subject: [PATCH 50/84] use ORT for conv2d check --- tests/python/relay/test_op_level2.py | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 90276eb0c8e7..9a69e4127183 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -30,6 +30,9 @@ from tvm.relay.testing import run_infer_type from tvm.topi.cuda.conv3d_winograd import _infer_tile_size +from onnx import helper, mapping +import onnxruntime + executor_kind = tvm.testing.parameter("graph", "vm") @@ -2159,6 +2162,55 @@ def get_conv2d_nchw( out_dtype=out_dtype, ) + def verify_by_ort(x_data, w_data, b_data, out): + def get_onnx_model(x_shape, w_shape, out_shape): + x_dtype = "int8" + w_dtype = "int8" + b_dtype = "int32" + x_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(x_dtype)] + w_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(w_dtype)] + b_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(b_dtype)] + + y_dtype = "int32" + y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(y_dtype)] + + + input_nodes = [ + helper.make_tensor_value_info("x", x_proto_type, list(x_shape)), + helper.make_tensor_value_info("w", w_proto_type, list(w_shape)), + helper.make_tensor_value_info("b", b_proto_type, (w_shape[0],)), + ] + input_names = [ + "x", + "w", + "b", + ] + + node = helper.make_node( + "Conv", + inputs=input_names, + outputs=["y"], + ) + + graph = helper.make_graph( + [node], + "ort_conv2d_test", + inputs=input_nodes, + outputs=[helper.make_tensor_value_info("y", y_proto_type, list(out_shape))], + ) + model = helper.make_model(graph, producer_name="ort_conv2d_test") + return model + + onnx_model = get_onnx_model(x_data.shape, 
w_data.shape, out.shape) + ort_exec = onnxruntime.backend.prepare(onnx_model.SerializeToString(), "CPU") + ort_out = ort_exec.run([x_data, w_data, b_data]) + # Unpack output if there's only a single value. + if len(ort_out) == 1: + ort_out = ort_out[0] + if len(out) == 1: + out = out[0] + np.testing.assert_equal(out, ort_out) + I, O, H, W = 64, 64, 56, 56 kH = kW = 3 @@ -2205,6 +2257,9 @@ def get_conv2d_nchw( out = rt_mod.get_output(0).numpy() + verify_by_ort(data_np, weight_np, bias_np, out) + verify_by_ort(data_np, weight_np, bias_np, ref) + np.testing.assert_equal(out, ref) From 68fb495d0921f533d9b2eca5c6ff009cc0b04f14 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sat, 7 Jan 2023 10:15:20 +0300 Subject: [PATCH 51/84] lint fix after rebasing --- tests/python/relay/test_op_level2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 9a69e4127183..862309468cb2 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2174,7 +2174,6 @@ def get_onnx_model(x_shape, w_shape, out_shape): y_dtype = "int32" y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(y_dtype)] - input_nodes = [ helper.make_tensor_value_info("x", x_proto_type, list(x_shape)), helper.make_tensor_value_info("w", w_proto_type, list(w_shape)), From 61271dff85546d27c21844151fa2de47477f350f Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sat, 7 Jan 2023 12:12:00 +0300 Subject: [PATCH 52/84] comment ORT part of test --- tests/python/relay/test_op_level2.py | 102 +++++++++++++-------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 862309468cb2..c36cf03c3911 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -30,8 +30,8 @@ from tvm.relay.testing import run_infer_type from tvm.topi.cuda.conv3d_winograd import _infer_tile_size -from onnx import helper, mapping -import onnxruntime +# from onnx import helper, mapping +# import onnxruntime executor_kind = tvm.testing.parameter("graph", "vm") @@ -2162,53 +2162,53 @@ def get_conv2d_nchw( out_dtype=out_dtype, ) - def verify_by_ort(x_data, w_data, b_data, out): - def get_onnx_model(x_shape, w_shape, out_shape): - x_dtype = "int8" - w_dtype = "int8" - b_dtype = "int32" - x_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(x_dtype)] - w_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(w_dtype)] - b_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(b_dtype)] - - y_dtype = "int32" - y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(y_dtype)] - - input_nodes = [ - helper.make_tensor_value_info("x", x_proto_type, list(x_shape)), - helper.make_tensor_value_info("w", w_proto_type, list(w_shape)), - helper.make_tensor_value_info("b", b_proto_type, (w_shape[0],)), - ] - input_names = [ - "x", - "w", - "b", - ] - - node = helper.make_node( - "Conv", - inputs=input_names, - outputs=["y"], - ) - - graph = helper.make_graph( - [node], - "ort_conv2d_test", - inputs=input_nodes, - outputs=[helper.make_tensor_value_info("y", y_proto_type, list(out_shape))], - ) - model = helper.make_model(graph, producer_name="ort_conv2d_test") - return model - - onnx_model = get_onnx_model(x_data.shape, w_data.shape, out.shape) - ort_exec = onnxruntime.backend.prepare(onnx_model.SerializeToString(), "CPU") - ort_out = ort_exec.run([x_data, w_data, b_data]) - # Unpack output if there's only a single value. 
- if len(ort_out) == 1: - ort_out = ort_out[0] - if len(out) == 1: - out = out[0] - np.testing.assert_equal(out, ort_out) + # def verify_by_ort(x_data, w_data, b_data, out): + # def get_onnx_model(x_shape, w_shape, out_shape): + # x_dtype = "int8" + # w_dtype = "int8" + # b_dtype = "int32" + # x_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(x_dtype)] + # w_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(w_dtype)] + # b_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(b_dtype)] + + # y_dtype = "int32" + # y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(y_dtype)] + + # input_nodes = [ + # helper.make_tensor_value_info("x", x_proto_type, list(x_shape)), + # helper.make_tensor_value_info("w", w_proto_type, list(w_shape)), + # helper.make_tensor_value_info("b", b_proto_type, (w_shape[0],)), + # ] + # input_names = [ + # "x", + # "w", + # "b", + # ] + + # node = helper.make_node( + # "Conv", + # inputs=input_names, + # outputs=["y"], + # ) + + # graph = helper.make_graph( + # [node], + # "ort_conv2d_test", + # inputs=input_nodes, + # outputs=[helper.make_tensor_value_info("y", y_proto_type, list(out_shape))], + # ) + # model = helper.make_model(graph, producer_name="ort_conv2d_test") + # return model + + # onnx_model = get_onnx_model(x_data.shape, w_data.shape, out.shape) + # ort_exec = onnxruntime.backend.prepare(onnx_model.SerializeToString(), "CPU") + # ort_out = ort_exec.run([x_data, w_data, b_data]) + # # Unpack output if there's only a single value. + # if len(ort_out) == 1: + # ort_out = ort_out[0] + # if len(out) == 1: + # out = out[0] + # np.testing.assert_equal(out, ort_out) I, O, H, W = 64, 64, 56, 56 kH = kW = 3 @@ -2256,8 +2256,8 @@ def get_onnx_model(x_shape, w_shape, out_shape): out = rt_mod.get_output(0).numpy() - verify_by_ort(data_np, weight_np, bias_np, out) - verify_by_ort(data_np, weight_np, bias_np, ref) + # verify_by_ort(data_np, weight_np, bias_np, out) + # verify_by_ort(data_np, weight_np, bias_np, ref) np.testing.assert_equal(out, ref) From c5a88a1ca668bda3a05842876252fdcf22ac34f2 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 11:27:23 +0300 Subject: [PATCH 53/84] extend tests tir schedule analysis and transform for avx512. 
unify test classes --- .../unittest/test_tir_schedule_analysis.py | 74 +-------- .../unittest/test_tir_schedule_transform.py | 157 +++--------------- .../unittest/tir_schedule_test_utils.py | 143 ++++++++++++++++ 3 files changed, 168 insertions(+), 206 deletions(-) create mode 100644 tests/python/unittest/tir_schedule_test_utils.py diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index e0667da6fe92..65b3917341cf 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -40,6 +40,8 @@ from tvm.meta_schedule.testing import te_workload from tvm.te import create_prim_func +from .tir_schedule_test_utils import DenseTIRModule, Conv2dNCHWcTIRModule + def _make_vars(*args: str) -> List[Var]: return [Var(arg, dtype="int32") for arg in args] @@ -145,70 +147,6 @@ def test_suggest_index_map_winograd(): assert inverse_index_map.is_equivalent_to(expected_inverse_index_map) -@tvm.script.ir_module -class DenseVNNIModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2 in T.grid(1024, 1024, 1024): - with T.block("compute"): - i, j, k = T.axis.remap("SSR", [i0, i1, i2]) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) - - -@tvm.script.ir_module -class Conv2dNCHWcVNNIModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): - with T.block("conv2d_NCHWc_int8"): - ( - n, - oc_chunk, - oh, - ow, - oc_block, - kh, - kw, - ic_outer, - ic_f_inner, - ic_s_inner, - ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) - - def collect_loops(prim_func): loops = [] @@ -222,8 +160,8 @@ def callback(node): return loops -def test_get_tensorize_loop_mapping_dense_vnni(): - s = Schedule(DenseVNNIModule) +def test_get_tensorize_loop_mapping_dense_16x4(): + s = Schedule(DenseTIRModule) block = s.get_block("compute") info = get_tensorize_loop_mapping(s, block, dot_product_16x4_u8i8i32_desc) @@ -240,8 +178,8 @@ def test_get_tensorize_loop_mapping_dense_vnni(): assert s.get(desc_loop_to_sref[desc_loops[1]]) == s.get(loop_k) -def 
test_get_tensorize_loop_mapping_conv2d_nchwc_vnni(): - s = Schedule(Conv2dNCHWcVNNIModule) +def test_get_tensorize_loop_mapping_conv2d_nchwc_16x4(): + s = Schedule(Conv2dNCHWcTIRModule) block = s.get_block("conv2d_NCHWc_int8") info = get_tensorize_loop_mapping(s, block, dot_product_16x4_u8i8i32_desc) diff --git a/tests/python/unittest/test_tir_schedule_transform.py b/tests/python/unittest/test_tir_schedule_transform.py index e812587e6676..777be4e20295 100644 --- a/tests/python/unittest/test_tir_schedule_transform.py +++ b/tests/python/unittest/test_tir_schedule_transform.py @@ -15,161 +15,42 @@ # specific language governing permissions and limitations # under the License. import tvm -from tvm.script import tir as T from tvm.tir import Schedule from tvm.tir.schedule.transform import tile_with_tensor_intrin -from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN, AVX512_DOT_16x4_INTRIN +from .tir_schedule_test_utils import ( + DenseTIRModule, + DenseTIRModuleTiled, + Conv2dNCHWcTIRModule, + Conv2dNCHWcTIRModuleTiled, +) -@tvm.script.ir_module -class DenseVNNIModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2 in T.grid(1024, 1024, 1024): - with T.block("compute"): - i, j, k = T.axis.remap("SSR", [i0, i1, i2]) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) - -@tvm.script.ir_module -class DenseVNNIModuleTiled: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0, i1_0, i2_0, i1_1, i2_1 in T.grid(1024, 64, 256, 16, 4): - with T.block("compute"): - i = T.axis.spatial(1024, i0) - j = T.axis.spatial(1024, i1_0 * 16 + i1_1) - k = T.axis.reduce(1024, i2_0 * 4 + i2_1) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) - - -@tvm.script.ir_module -class Conv2dNCHWcVNNIModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): - with T.block("conv2d_NCHWc_int8"): - ( - n, - oc_chunk, - oh, - ow, - oc_block, - kh, - kw, - ic_outer, - ic_f_inner, - ic_s_inner, - ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - 
with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) - - -@tvm.script.ir_module -class Conv2dNCHWcVNNIModuleTiled: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0, i1, i2, i3, i4_0, i5, i6, i7, i8, i9_0, i4_1, i9_1 in T.grid( - 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 - ): - with T.block("conv2d_NCHWc_int8"): - n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) - oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) - kh, kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) - ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) - - -def test_tile_with_tensor_intrin_dense_vnni(): - s = Schedule(DenseVNNIModule) +def test_tile_with_tensor_intrin_dense(intrin=VNNI_DOT_16x4_INTRIN): + s = Schedule(DenseTIRModule) block = s.get_block("compute") - tiled_loop = tile_with_tensor_intrin(s, block, VNNI_DOT_16x4_INTRIN) + tiled_loop = tile_with_tensor_intrin(s, block, intrin) _, _, _, i1_1, _ = s.get_loops(block) assert s.get(tiled_loop) == s.get(i1_1) - tvm.ir.assert_structural_equal(s.mod, DenseVNNIModuleTiled) + tvm.ir.assert_structural_equal(s.mod, DenseTIRModuleTiled) -def test_tile_with_tensor_intrin_conv2d_nchwc_vnni(): - s = Schedule(Conv2dNCHWcVNNIModule) +def test_tile_with_tensor_intrin_conv2d_nchwc(intrin=VNNI_DOT_16x4_INTRIN): + s = Schedule(Conv2dNCHWcTIRModule) block = s.get_block("conv2d_NCHWc_int8") - tiled_loop = tile_with_tensor_intrin(s, block, VNNI_DOT_16x4_INTRIN) + tiled_loop = tile_with_tensor_intrin(s, block, intrin) tiled_loops = s.get_loops(block) assert len(tiled_loops) == 12 assert s.get(tiled_loop) == s.get(tiled_loops[-2]) - tvm.ir.assert_structural_equal(s.mod, Conv2dNCHWcVNNIModuleTiled) + tvm.ir.assert_structural_equal(s.mod, Conv2dNCHWcTIRModuleTiled) if __name__ == "__main__": - test_tile_with_tensor_intrin_dense_vnni() - test_tile_with_tensor_intrin_conv2d_nchwc_vnni() + test_tile_with_tensor_intrin_dense() + test_tile_with_tensor_intrin_dense(AVX512_DOT_16x4_INTRIN) + test_tile_with_tensor_intrin_conv2d_nchwc() + test_tile_with_tensor_intrin_conv2d_nchwc(AVX512_DOT_16x4_INTRIN) diff --git a/tests/python/unittest/tir_schedule_test_utils.py b/tests/python/unittest/tir_schedule_test_utils.py new file mode 100644 index 000000000000..a922ae57187a --- /dev/null +++ b/tests/python/unittest/tir_schedule_test_utils.py @@ -0,0 +1,143 @@ +# Licensed to the 
Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm.script import tir as T + + +@tvm.script.ir_module +class DenseTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2 in T.grid(1024, 1024, 1024): + with T.block("compute"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) + + +@tvm.script.ir_module +class DenseTIRModuleTiled: + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1_0, i2_0, i1_1, i2_1 in T.grid(1024, 64, 256, 16, 4): + with T.block("compute"): + i = T.axis.spatial(1024, i0) + j = T.axis.spatial(1024, i1_0 * 16 + i1_1) + k = T.axis.reduce(1024, i2_0 * 4 + i2_1) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) + + +@tvm.script.ir_module +class Conv2dNCHWcTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): + with T.block("conv2d_NCHWc_int8"): + ( + n, + oc_chunk, + oh, + ow, + oc_block, + kh, + kw, + ic_outer, + ic_f_inner, + ic_s_inner, + ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" + ) 
* T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", + ) + + +@tvm.script.ir_module +class Conv2dNCHWcTIRModuleTiled: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1, i2, i3, i4_0, i5, i6, i7, i8, i9_0, i4_1, i9_1 in T.grid( + 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 + ): + with T.block("conv2d_NCHWc_int8"): + n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) + oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) + kh, kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) + ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", + ) From f8875552a8899a69bd919361271b434bf189d054 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 11:44:10 +0300 Subject: [PATCH 54/84] extend test tir schedule tensorize for avx512 --- .../python/unittest/test_tir_schedule_tensorize.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_tir_schedule_tensorize.py b/tests/python/unittest/test_tir_schedule_tensorize.py index fc0bdc146c88..4847f261a32c 100644 --- a/tests/python/unittest/test_tir_schedule_tensorize.py +++ b/tests/python/unittest/test_tir_schedule_tensorize.py @@ -29,7 +29,7 @@ ARM_DOT_4x4_i8_SDOT_INTRIN, ) from tvm.tir.tensor_intrin.rocm import AMDGPU_SDOT4_INTRIN -from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN, AVX512_DOT_16x4_INTRIN from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN, VDMPY_i16i16i32_INTRIN # fmt: off @@ -557,7 +557,7 @@ def get_matmul_packed(m, n, k, lhs_type, rhs_dtype="int8"): return te.create_prim_func([X, W, matmul]) -def test_tensorize_vnni(): +def tensorize_16x4_test(intrin=VNNI_DOT_16x4_INTRIN): m, n, k = 128, 128, 128 func = get_matmul_packed(m, n, k, "uint8") @@ -572,11 +572,19 @@ def test_tensorize_vnni(): sch.reorder(ko, ji, ki) sch.decompose_reduction(block, ko) - sch.tensorize(ji, VNNI_DOT_16x4_INTRIN) + sch.tensorize(ji, intrin) verify_trace_roundtrip(sch=sch, mod=func) +def test_tensorize_vnni(): + tensorize_16x4_test() + + +def test_tensorize_avx512(): + tensorize_16x4_test(AVX512_DOT_16x4_INTRIN) + + def test_tensorize_arm_dot(): m, n, k = 128, 128, 128 From bfdb2c25c00e1d3c0b30cc89e81f5cc71631d990 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 12:18:08 +0300 Subject: [PATCH 55/84] extend test meta schedule vnni integration for avx512 --- .../test_meta_schedule_vnni_integration.py | 46 ++++++++++++++----- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py 
b/tests/python/unittest/test_meta_schedule_vnni_integration.py index c37c1da9250c..15a3efbfedbe 100644 --- a/tests/python/unittest/test_meta_schedule_vnni_integration.py +++ b/tests/python/unittest/test_meta_schedule_vnni_integration.py @@ -28,6 +28,7 @@ from tvm.tir.schedule import BlockRV, Schedule from tvm.tir.schedule.analysis import has_block from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN logging.basicConfig( format="%(asctime)s.%(msecs)03d %(levelname)s %(message)s", @@ -36,9 +37,9 @@ logging.getLogger("tvm.meta_schedule").setLevel(logging.DEBUG) -def _schedule_dense(m: Optional[int], do_tune: bool): +def _schedule_dense(m: Optional[int], do_tune: bool, intrin=VNNI_INTRIN): """Manually schedule a dense block, created from TE compute op via CreatePrimFunc, - using VNNI instruction. + using VNNI or AVX512 instructions. """ def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool: @@ -90,7 +91,7 @@ def schedule_fn(sch, dense_block: Optional[BlockRV] = None) -> bool: dec = sch.decompose_reduction(dense_block, a_ko) init_loop = sch.get_loops(dec)[-1] sch.vectorize(init_loop) - sch.tensorize(a_xi, VNNI_INTRIN) + sch.tensorize(a_xi, intrin) return True return schedule_fn @@ -135,10 +136,8 @@ def f_check(lib, dev): return relay_mod, params, f_check -@tvm.testing.requires_cascadelake -def test_vnni_schedule_fn_database(): +def schedule_16x4_dense_fn_database(target, intrin): m, n, k = 1024, 1024, 1024 - target = tvm.target.Target("llvm -mcpu=cascadelake -num-cores 4") dev = tvm.cpu(0) relay_mod, params, f_check = _relay_dense(m, n, k) @@ -146,6 +145,7 @@ def test_vnni_schedule_fn_database(): _schedule_dense( m=m, do_tune=False, + intrin=intrin, ) ), tvm.transform.PassContext( opt_level=3, @@ -167,7 +167,18 @@ def test_vnni_schedule_fn_database(): @tvm.testing.requires_cascadelake -def test_vnni_schedule_fn_tune(): +def test_vnni_schedule_fn_database(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4") + schedule_16x4_dense_fn_database(target, VNNI_INTRIN) + + +@tvm.testing.requires_skylake_avx512 +def test_avx512_schedule_fn_database(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-512 -num-cores=4") + schedule_16x4_dense_fn_database(target, AVX512_INTRIN) + + +def schedule_16x4_dense_fn_tune(target, intrin, tag="meta_schedule.x86.dense_vnni"): # pylint: disable=W0105 """ We can inject and apply a custom TIR scheduling to a TE compute of interest, using @@ -191,14 +202,13 @@ def test_vnni_schedule_fn_tune(): The relevant code is in `src/meta_schedule/space_generator/apply_custom_rule.cc`. 
""" - def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV): - _schedule_dense(m=None, do_tune=True)(sch, dense_block) + def schedule_rule_dense_16x4(sch: Schedule, dense_block: BlockRV): + _schedule_dense(m=None, do_tune=True, intrin=intrin)(sch, dense_block) return [sch] - register_func("meta_schedule.x86.dense_vnni", schedule_rule_dense_vnni) + register_func(tag, schedule_rule_dense_16x4) m, n, k = 1024, 1024, 1024 - target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4") dev = tvm.cpu(0) relay_mod, params, f_check = _relay_dense(m, n, k) @@ -247,6 +257,20 @@ def schedule_rule_dense_vnni(sch: Schedule, dense_block: BlockRV): f_check(lib, dev) +@tvm.testing.requires_cascadelake +def test_vnni_schedule_fn_tune(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=cascadelake -num-cores=4") + schedule_16x4_dense_fn_tune(target, VNNI_INTRIN) + + +@tvm.testing.requires_skylake_avx512 +def test_avx512_schedule_fn_tune(): + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4") + schedule_16x4_dense_fn_tune(target, AVX512_INTRIN, "meta_schedule.x86.dense_avx512") + + if __name__ == """__main__""": test_vnni_schedule_fn_database() + test_avx512_schedule_fn_database() test_vnni_schedule_fn_tune() + test_avx512_schedule_fn_tune() From de26b94829dcaa971fe88781189d0d199107dade Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 12:19:28 +0300 Subject: [PATCH 56/84] rename test file --- ...vnni_integration.py => test_meta_schedule_16x4_integration.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/python/unittest/{test_meta_schedule_vnni_integration.py => test_meta_schedule_16x4_integration.py} (100%) diff --git a/tests/python/unittest/test_meta_schedule_vnni_integration.py b/tests/python/unittest/test_meta_schedule_16x4_integration.py similarity index 100% rename from tests/python/unittest/test_meta_schedule_vnni_integration.py rename to tests/python/unittest/test_meta_schedule_16x4_integration.py From 5059fd627001444bc6f19dade638b20db1bc77ae Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 12:55:03 +0300 Subject: [PATCH 57/84] pylint fix --- .../unittest/tir_schedule_test_utils.py | 222 +++++++++--------- 1 file changed, 113 insertions(+), 109 deletions(-) diff --git a/tests/python/unittest/tir_schedule_test_utils.py b/tests/python/unittest/tir_schedule_test_utils.py index a922ae57187a..e32321bc0f3b 100644 --- a/tests/python/unittest/tir_schedule_test_utils.py +++ b/tests/python/unittest/tir_schedule_test_utils.py @@ -20,124 +20,128 @@ @tvm.script.ir_module class DenseTIRModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2 in T.grid(1024, 1024, 1024): - with T.block("compute"): - i, j, k = T.axis.remap("SSR", [i0, i1, i2]) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + T.func_attr({"global_symbol": 
"main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2 in T.grid(1024, 1024, 1024): + with T.block("compute"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) @tvm.script.ir_module class DenseTIRModuleTiled: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0, i1_0, i2_0, i1_1, i2_1 in T.grid(1024, 64, 256, 16, 4): - with T.block("compute"): - i = T.axis.spatial(1024, i0) - j = T.axis.spatial(1024, i1_0 * 16 + i1_1) - k = T.axis.reduce(1024, i2_0 * 4 + i2_1) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1_0, i2_0, i1_1, i2_1 in T.grid(1024, 64, 256, 16, 4): + with T.block("compute"): + i = T.axis.spatial(1024, i0) + j = T.axis.spatial(1024, i1_0 * 16 + i1_1) + k = T.axis.reduce(1024, i2_0 * 4 + i2_1) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) @tvm.script.ir_module class Conv2dNCHWcTIRModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): - with T.block("conv2d_NCHWc_int8"): - ( - n, - oc_chunk, - oh, - ow, - oc_block, - kh, - kw, - ic_outer, - ic_f_inner, - ic_s_inner, - ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", - ) + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + 
) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): + with T.block("conv2d_NCHWc_int8"): + ( + n, + oc_chunk, + oh, + ow, + oc_block, + kh, + kw, + ic_outer, + ic_f_inner, + ic_s_inner, + ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) @tvm.script.ir_module class Conv2dNCHWcTIRModuleTiled: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0, i1, i2, i3, i4_0, i5, i6, i7, i8, i9_0, i4_1, i9_1 in T.grid( - 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 - ): - with T.block("conv2d_NCHWc_int8"): - n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) - oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) - kh, kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) - ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], "int32" - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], "int32", - ) + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1, i2, i3, i4_0, i5, i6, i7, i8, i9_0, i4_1, i9_1 in T.grid( + 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 + ): + with T.block("conv2d_NCHWc_int8"): + n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) + oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) + kh, kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) + ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow 
+ kw, ic_f_inner * 4 + ic_s_inner], + "int32", + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) From 206d45826f89978c83ec68abf6dfa8be58f1e356 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 15:34:55 +0300 Subject: [PATCH 58/84] tag fix --- .../unittest/test_meta_schedule_16x4_integration.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_16x4_integration.py b/tests/python/unittest/test_meta_schedule_16x4_integration.py index 15a3efbfedbe..f5c11f14ec9a 100644 --- a/tests/python/unittest/test_meta_schedule_16x4_integration.py +++ b/tests/python/unittest/test_meta_schedule_16x4_integration.py @@ -178,21 +178,21 @@ def test_avx512_schedule_fn_database(): schedule_16x4_dense_fn_database(target, AVX512_INTRIN) -def schedule_16x4_dense_fn_tune(target, intrin, tag="meta_schedule.x86.dense_vnni"): +def schedule_16x4_dense_fn_tune(target, intrin): # pylint: disable=W0105 """ We can inject and apply a custom TIR scheduling to a TE compute of interest, using the "schedule_rule" annotation. For example, in topi/x86/dense.py we have the following - declaration for int8 dense targeting the VNNI instruction. + declaration for int8 dense targeting the VNNI or AVX512 instructions. C = te.compute( ... - attrs={"schedule_rule": "meta_schedule.x86.dense_vnni"}, + attrs={"schedule_rule": "meta_schedule.x86.dense_int8"}, ) When the MetaSchedule encounters a TensorIR block with the "schedule_rule" annotation, it looks up the packed func registry for a function that is associated with the given schedule - rule key ("meta_schedule.x86.dense_vnni" in this example). The signature of such custom + rule key ("meta_schedule.x86.dense_int8" in this example). The signature of such custom schedule functions must be (tir.schedule.Schedule, tir.schedule.BlockRV) -> [tir.schedule.Schedule]. 
@@ -206,7 +206,7 @@ def schedule_rule_dense_16x4(sch: Schedule, dense_block: BlockRV): _schedule_dense(m=None, do_tune=True, intrin=intrin)(sch, dense_block) return [sch] - register_func(tag, schedule_rule_dense_16x4) + register_func("meta_schedule.x86.dense_int8", schedule_rule_dense_16x4) m, n, k = 1024, 1024, 1024 dev = tvm.cpu(0) @@ -266,7 +266,7 @@ def test_vnni_schedule_fn_tune(): @tvm.testing.requires_skylake_avx512 def test_avx512_schedule_fn_tune(): target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4") - schedule_16x4_dense_fn_tune(target, AVX512_INTRIN, "meta_schedule.x86.dense_avx512") + schedule_16x4_dense_fn_tune(target, AVX512_INTRIN) if __name__ == """__main__""": From e66b0e570eb191abd87c84c6c1fdcfeac66f4b0f Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 19:09:05 +0300 Subject: [PATCH 59/84] update test meta schedule trace apply with avx512 --- .../test_meta_schedule_trace_apply.py | 509 ++++++++++-------- 1 file changed, 271 insertions(+), 238 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index a77b0246c1ba..f99929cbf5c7 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -25,6 +25,9 @@ from tvm.target import Target from tvm.target.codegen import llvm_lookup_intrinsic_id +from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN +from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN + # fmt: off @tvm.script.ir_module @@ -1131,12 +1134,34 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, T_cast[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_2[ax0, ax1, ax2, ax3, ax4], "int32") -# TODO(vvchernov): construct avx512 reference module (without vnni) -def get_conv2d_vnni_mod(intrin_id): +def get_conv2d_16x4_mod(intrin): @tvm.script.ir_module class Conv2dInt8_NCHWc_scheduled: @T.prim_func def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None: + def calculate_with_intrin(A, B, C): + if intrin == "vnni": + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i8x64: T.int8x64 = B[0, 0:64] + B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") + C_i32x16: T.int32x16 = C[0:16] + vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") + C[0:16] = T.call_llvm_pure_intrin(T.uint32(vnni_id), T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + elif intrin == "avx512": + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + A_brdcst: T.int32x16 = T.broadcast(A_i32, 16) + A_u8x64: T.uint8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") + + B_i8x64: T.int8x64 = B[0, 0:64] + + avx512_id_1 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512") + Red: T.int16x32 = T.call_llvm_pure_intrin(avx512_id_1, T.uint32(2), A_u8x64, B_i8x64, dtype="int16x32") + + avx512_id_2 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512") + C[0:16] += T.call_llvm_pure_intrin(avx512_id_2, T.uint32(2), Red, T.int16x32(1), dtype="int32x16") + # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) # body @@ -1176,12 +1201,7 @@ 
def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1) C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x64: T.int8x64 = B[0, 0:64] - B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") - C_i32x16: T.int32x16 = C[0:16] - C[0:16] = T.call_llvm_pure_intrin(T.uint32(intrin_id), T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + calculate_with_intrin(A, B, C) for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): for ax4_fused in T.vectorized(16): with T.block("T_cast_8"): @@ -2503,243 +2523,256 @@ def apply_trace(sch): verify(Conv2dInt8, apply_trace, Conv2dInt8_target, "cuda", Conv2dInt8_tensorcore_scheduled) -# TODO(vvchernov): test int8 conv2d foravx512 without VNNI +def apply_trace_16x4(sch, intrin): + b0 = sch.get_block(name="compile_engine_const", func_name="main") + b1 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") + b2 = sch.get_block(name="T_add", func_name="main") + b3 = sch.get_block(name="T_cast", func_name="main") + b4 = sch.get_block(name="T_multiply", func_name="main") + b5 = sch.get_block(name="compile_engine_const_1", func_name="main") + b6 = sch.get_block(name="T_add_1", func_name="main") + b7 = sch.get_block(name="T_floor", func_name="main") + b8 = sch.get_block(name="T_cast_1", func_name="main") + b9 = sch.get_block(name="compute", func_name="main") + b10 = sch.get_block(name="T_cast_2", func_name="main") + b11 = sch.get_block(name="T_cast_3", func_name="main") + b12 = sch.get_block(name="T_subtract", func_name="main") + b13 = sch.get_block(name="T_multiply_1", func_name="main") + b14 = sch.get_block(name="compile_engine_const_2", func_name="main") + b15 = sch.get_block(name="T_add_2", func_name="main") + b16 = sch.get_block(name="T_floor_1", func_name="main") + b17 = sch.get_block(name="T_cast_4", func_name="main") + b18 = sch.get_block(name="T_add_3", func_name="main") + b19 = sch.get_block(name="compute_1", func_name="main") + b20 = sch.get_block(name="T_cast_5", func_name="main") + b21 = sch.get_block(name="root", func_name="main") + sch.compute_inline(block=b20) + sch.compute_inline(block=b19) + sch.compute_inline(block=b18) + sch.compute_inline(block=b17) + sch.compute_inline(block=b16) + sch.compute_inline(block=b15) + sch.compute_inline(block=b14) + sch.compute_inline(block=b13) + sch.compute_inline(block=b12) + sch.compute_inline(block=b11) + sch.compute_inline(block=b10) + sch.compute_inline(block=b9) + sch.compute_inline(block=b8) + sch.compute_inline(block=b7) + sch.compute_inline(block=b6) + sch.compute_inline(block=b5) + sch.compute_inline(block=b4) + sch.compute_inline(block=b3) + sch.compute_inline(block=b2) + sch.compute_inline(block=b0) + sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") + l22, l23, l24, l25, l26, l27, l28, l29, l30, l31 = sch.get_loops(block=b1) + l32, l33 = sch.split(loop=l31, factors=[None, 4], preserve_unit_iters=True) + l34, l35 = sch.split(loop=l26, factors=[None, 16], preserve_unit_iters=True) + l36, l37, l38, l39, l40, l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b1) + sch.reorder(l42, l43, l44, l45, l46, l35, l33) + b48 = 
sch.blockize(loop=l35) + sch.annotate(block_or_loop=b48, ann_key="meta_schedule.auto_tensorize", ann_val=intrin) + l49, l50, l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b48) + v59, v60, v61, v62 = sch.sample_perfect_tile( + loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] + ) + l63, l64, l65, l66 = sch.split( + loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True + ) + v67, v68, v69, v70 = sch.sample_perfect_tile( + loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1] + ) + l71, l72, l73, l74 = sch.split( + loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True + ) + v75, v76, v77, v78 = sch.sample_perfect_tile( + loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1] + ) + l79, l80, l81, l82 = sch.split( + loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True + ) + v83, v84, v85, v86 = sch.sample_perfect_tile( + loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7] + ) + l87, l88, l89, l90 = sch.split( + loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True + ) + v91, v92, v93, v94 = sch.sample_perfect_tile( + loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] + ) + l95, l96, l97, l98 = sch.split( + loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True + ) + v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1]) + l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True) + v103, v104 = sch.sample_perfect_tile( + loop=l55, n=2, max_innermost_factor=64, decision=[1, 1] + ) + l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True) + v107, v108 = sch.sample_perfect_tile( + loop=l56, n=2, max_innermost_factor=64, decision=[4, 8] + ) + l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True) + v111, v112 = sch.sample_perfect_tile( + loop=l57, n=2, max_innermost_factor=64, decision=[4, 1] + ) + l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True) + v115, v116 = sch.sample_perfect_tile( + loop=l58, n=2, max_innermost_factor=64, decision=[1, 1] + ) + l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True) + sch.reorder( + l63, + l71, + l79, + l87, + l95, + l64, + l72, + l80, + l88, + l96, + l101, + l105, + l109, + l113, + l117, + l65, + l73, + l81, + l89, + l97, + l102, + l106, + l110, + l114, + l118, + l66, + l74, + l82, + l90, + l98, + ) + (b119,) = sch.get_consumers(block=b48) + sch.reverse_compute_at(block=b119, loop=l96, preserve_unit_loops=True, index=-1) + sch.annotate(block_or_loop=b21, ann_key="meta_schedule.parallel", ann_val=96) + sch.annotate(block_or_loop=b21, ann_key="meta_schedule.vectorize", ann_val=64) + v120 = sch.sample_categorical( + candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=2 + ) + sch.annotate(block_or_loop=b21, ann_key="meta_schedule.unroll_explicit", ann_val=v120) + sch.enter_postproc() + b121 = sch.get_block(name="root", func_name="main") + sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.parallel") + sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.vectorize") + sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") + b122, b123 = sch.get_child_blocks(b121) + ( + l124, + l125, + l126, + l127, + l128, + l129, + l130, + l131, + l132, + l133, + l134, + l135, + l136, + l137, + l138, + l139, + l140, + l141, + l142, + l143, + l144, + l145, + l146, + l147, + l148, + l149, + l150, + l151, + l152, + l153, + ) = sch.get_loops(block=b122) 
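+    # Post-processing of the replayed trace: fuse the first seven loops of the
+    # compute block into a single parallel loop, vectorize the cast epilogue,
+    # then decompose the reduction and tensorize the update block with the
+    # chosen 16x4 dot-product intrinsic (VNNI or plain AVX512).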
+ l154 = sch.fuse(l124, l125, l126, l127, l128, l129, l130, preserve_unit_iters=True) + sch.parallel(loop=l154) + sch.annotate(block_or_loop=l154, ann_key="pragma_auto_unroll_max_step", ann_val=64) + sch.annotate(block_or_loop=l154, ann_key="pragma_unroll_explicit", ann_val=1) + l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b123) + l164 = sch.fuse(l163, preserve_unit_iters=True) + sch.vectorize(loop=l164) + sch.annotate(block_or_loop=l155, ann_key="pragma_auto_unroll_max_step", ann_val=64) + sch.annotate(block_or_loop=l155, ann_key="pragma_unroll_explicit", ann_val=1) + b165 = sch.get_block(name="conv2d_NCHWc_int8_o", func_name="main") + ( + l166, + l167, + l168, + l169, + l170, + l171, + l172, + l173, + l174, + l175, + l176, + l177, + l178, + l179, + l180, + l181, + l182, + l183, + l184, + l185, + l186, + l187, + l188, + l189, + ) = sch.get_loops(block=b165) + b190 = sch.decompose_reduction(block=b165, loop=l170) + sch.unannotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize") + sch.annotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize", ann_val="") + b191 = sch.get_block(name="conv2d_NCHWc_int8_o_init", func_name="main") + sch.unannotate(block_or_loop=b191, ann_key="meta_schedule.auto_tensorize") + (b192,) = sch.get_child_blocks(b191) + (l193,) = sch.get_loops(block=b192) + sch.vectorize(loop=l193) + b194 = sch.get_block(name="conv2d_NCHWc_int8_o_update", func_name="main") + sch.unannotate(block_or_loop=b194, ann_key="meta_schedule.auto_tensorize") + sch.tensorize(block_or_loop=b194, tensor_intrin=intrin) + + def test_conv2d_int8_vnni(): def apply_trace(sch): - b0 = sch.get_block(name="compile_engine_const", func_name="main") - b1 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") - b2 = sch.get_block(name="T_add", func_name="main") - b3 = sch.get_block(name="T_cast", func_name="main") - b4 = sch.get_block(name="T_multiply", func_name="main") - b5 = sch.get_block(name="compile_engine_const_1", func_name="main") - b6 = sch.get_block(name="T_add_1", func_name="main") - b7 = sch.get_block(name="T_floor", func_name="main") - b8 = sch.get_block(name="T_cast_1", func_name="main") - b9 = sch.get_block(name="compute", func_name="main") - b10 = sch.get_block(name="T_cast_2", func_name="main") - b11 = sch.get_block(name="T_cast_3", func_name="main") - b12 = sch.get_block(name="T_subtract", func_name="main") - b13 = sch.get_block(name="T_multiply_1", func_name="main") - b14 = sch.get_block(name="compile_engine_const_2", func_name="main") - b15 = sch.get_block(name="T_add_2", func_name="main") - b16 = sch.get_block(name="T_floor_1", func_name="main") - b17 = sch.get_block(name="T_cast_4", func_name="main") - b18 = sch.get_block(name="T_add_3", func_name="main") - b19 = sch.get_block(name="compute_1", func_name="main") - b20 = sch.get_block(name="T_cast_5", func_name="main") - b21 = sch.get_block(name="root", func_name="main") - sch.compute_inline(block=b20) - sch.compute_inline(block=b19) - sch.compute_inline(block=b18) - sch.compute_inline(block=b17) - sch.compute_inline(block=b16) - sch.compute_inline(block=b15) - sch.compute_inline(block=b14) - sch.compute_inline(block=b13) - sch.compute_inline(block=b12) - sch.compute_inline(block=b11) - sch.compute_inline(block=b10) - sch.compute_inline(block=b9) - sch.compute_inline(block=b8) - sch.compute_inline(block=b7) - sch.compute_inline(block=b6) - sch.compute_inline(block=b5) - sch.compute_inline(block=b4) - sch.compute_inline(block=b3) - sch.compute_inline(block=b2) - 
sch.compute_inline(block=b0) - sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") - l22, l23, l24, l25, l26, l27, l28, l29, l30, l31 = sch.get_loops(block=b1) - l32, l33 = sch.split(loop=l31, factors=[None, 4], preserve_unit_iters=True) - l34, l35 = sch.split(loop=l26, factors=[None, 16], preserve_unit_iters=True) - l36, l37, l38, l39, l40, l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b1) - sch.reorder(l42, l43, l44, l45, l46, l35, l33) - b48 = sch.blockize(loop=l35) - sch.annotate( - block_or_loop=b48, ann_key="meta_schedule.auto_tensorize", ann_val="dot_16x4_vnni" - ) - l49, l50, l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b48) - v59, v60, v61, v62 = sch.sample_perfect_tile( - loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] - ) - l63, l64, l65, l66 = sch.split( - loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True - ) - v67, v68, v69, v70 = sch.sample_perfect_tile( - loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1] - ) - l71, l72, l73, l74 = sch.split( - loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True - ) - v75, v76, v77, v78 = sch.sample_perfect_tile( - loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1] - ) - l79, l80, l81, l82 = sch.split( - loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True - ) - v83, v84, v85, v86 = sch.sample_perfect_tile( - loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7] - ) - l87, l88, l89, l90 = sch.split( - loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True - ) - v91, v92, v93, v94 = sch.sample_perfect_tile( - loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] - ) - l95, l96, l97, l98 = sch.split( - loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True - ) - v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1]) - l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True) - v103, v104 = sch.sample_perfect_tile( - loop=l55, n=2, max_innermost_factor=64, decision=[1, 1] - ) - l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True) - v107, v108 = sch.sample_perfect_tile( - loop=l56, n=2, max_innermost_factor=64, decision=[4, 8] - ) - l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True) - v111, v112 = sch.sample_perfect_tile( - loop=l57, n=2, max_innermost_factor=64, decision=[4, 1] - ) - l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True) - v115, v116 = sch.sample_perfect_tile( - loop=l58, n=2, max_innermost_factor=64, decision=[1, 1] - ) - l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True) - sch.reorder( - l63, - l71, - l79, - l87, - l95, - l64, - l72, - l80, - l88, - l96, - l101, - l105, - l109, - l113, - l117, - l65, - l73, - l81, - l89, - l97, - l102, - l106, - l110, - l114, - l118, - l66, - l74, - l82, - l90, - l98, - ) - (b119,) = sch.get_consumers(block=b48) - sch.reverse_compute_at(block=b119, loop=l96, preserve_unit_loops=True, index=-1) - sch.annotate(block_or_loop=b21, ann_key="meta_schedule.parallel", ann_val=96) - sch.annotate(block_or_loop=b21, ann_key="meta_schedule.vectorize", ann_val=64) - v120 = sch.sample_categorical( - candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=2 - ) - sch.annotate(block_or_loop=b21, ann_key="meta_schedule.unroll_explicit", ann_val=v120) - sch.enter_postproc() - b121 = sch.get_block(name="root", func_name="main") - 
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.parallel") - sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.vectorize") - sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") - b122, b123 = sch.get_child_blocks(b121) - ( - l124, - l125, - l126, - l127, - l128, - l129, - l130, - l131, - l132, - l133, - l134, - l135, - l136, - l137, - l138, - l139, - l140, - l141, - l142, - l143, - l144, - l145, - l146, - l147, - l148, - l149, - l150, - l151, - l152, - l153, - ) = sch.get_loops(block=b122) - l154 = sch.fuse(l124, l125, l126, l127, l128, l129, l130, preserve_unit_iters=True) - sch.parallel(loop=l154) - sch.annotate(block_or_loop=l154, ann_key="pragma_auto_unroll_max_step", ann_val=64) - sch.annotate(block_or_loop=l154, ann_key="pragma_unroll_explicit", ann_val=1) - l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b123) - l164 = sch.fuse(l163, preserve_unit_iters=True) - sch.vectorize(loop=l164) - sch.annotate(block_or_loop=l155, ann_key="pragma_auto_unroll_max_step", ann_val=64) - sch.annotate(block_or_loop=l155, ann_key="pragma_unroll_explicit", ann_val=1) - b165 = sch.get_block(name="conv2d_NCHWc_int8_o", func_name="main") - ( - l166, - l167, - l168, - l169, - l170, - l171, - l172, - l173, - l174, - l175, - l176, - l177, - l178, - l179, - l180, - l181, - l182, - l183, - l184, - l185, - l186, - l187, - l188, - l189, - ) = sch.get_loops(block=b165) - b190 = sch.decompose_reduction(block=b165, loop=l170) - sch.unannotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize") - sch.annotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize", ann_val="") - b191 = sch.get_block(name="conv2d_NCHWc_int8_o_init", func_name="main") - sch.unannotate(block_or_loop=b191, ann_key="meta_schedule.auto_tensorize") - (b192,) = sch.get_child_blocks(b191) - (l193,) = sch.get_loops(block=b192) - sch.vectorize(loop=l193) - b194 = sch.get_block(name="conv2d_NCHWc_int8_o_update", func_name="main") - sch.unannotate(block_or_loop=b194, ann_key="meta_schedule.auto_tensorize") - sch.tensorize(block_or_loop=b194, tensor_intrin="dot_16x4_vnni") - - vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") + return apply_trace_16x4(sch, VNNI_INTRIN) + verify( Conv2dInt8_NCHWc, apply_trace, Conv2dInt8_NCHWc_target, "llvm -mcpu=cascadelake", - get_conv2d_vnni_mod(vnni_id), + get_conv2d_16x4_mod("vnni"), + ) + + +def test_conv2d_int8_vavx512(): + def apply_trace(sch): + return apply_trace_16x4(sch, AVX512_INTRIN) + + verify( + Conv2dInt8_NCHWc, + apply_trace, + Conv2dInt8_NCHWc_target, + "llvm -mcpu=skylake-avx512", + get_conv2d_16x4_mod("avx512"), ) From b56bb45078238e4e213ce8aeed21dfc5cb2eb40d Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 19:17:30 +0300 Subject: [PATCH 60/84] rollback test class unifying in utils --- .../unittest/test_tir_schedule_analysis.py | 65 +++++++- .../unittest/test_tir_schedule_transform.py | 135 +++++++++++++++- .../unittest/tir_schedule_test_utils.py | 147 ------------------ 3 files changed, 193 insertions(+), 154 deletions(-) delete mode 100644 tests/python/unittest/tir_schedule_test_utils.py diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index 65b3917341cf..b7d1fe2b1b52 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -40,7 +40,70 @@ from tvm.meta_schedule.testing import te_workload from tvm.te import create_prim_func -from 
.tir_schedule_test_utils import DenseTIRModule, Conv2dNCHWcTIRModule + +@tvm.script.ir_module +class DenseTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2 in T.grid(1024, 1024, 1024): + with T.block("compute"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) + + +@tvm.script.ir_module +class Conv2dNCHWcTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): + with T.block("conv2d_NCHWc_int8"): + ( + n, + oc_chunk, + oh, + ow, + oc_block, + kh, + kw, + ic_outer, + ic_f_inner, + ic_s_inner, + ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) def _make_vars(*args: str) -> List[Var]: diff --git a/tests/python/unittest/test_tir_schedule_transform.py b/tests/python/unittest/test_tir_schedule_transform.py index 777be4e20295..c068385f0a46 100644 --- a/tests/python/unittest/test_tir_schedule_transform.py +++ b/tests/python/unittest/test_tir_schedule_transform.py @@ -15,16 +15,139 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm.script import tir as T from tvm.tir import Schedule from tvm.tir.schedule.transform import tile_with_tensor_intrin from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN, AVX512_DOT_16x4_INTRIN -from .tir_schedule_test_utils import ( - DenseTIRModule, - DenseTIRModuleTiled, - Conv2dNCHWcTIRModule, - Conv2dNCHWcTIRModuleTiled, -) + +@tvm.script.ir_module +class DenseTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2 in T.grid(1024, 1024, 1024): + with T.block("compute"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) + + +@tvm.script.ir_module +class DenseTIRModuleTiled: + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0, i1_0, i2_0, i1_1, i2_1 in T.grid(1024, 64, 256, 16, 4): + with T.block("compute"): + i = T.axis.spatial(1024, i0) + j = T.axis.spatial(1024, i1_0 * 16 + i1_1) + k = T.axis.reduce(1024, i2_0 * 4 + i2_1) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) + + +@tvm.script.ir_module +class Conv2dNCHWcTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): + with T.block("conv2d_NCHWc_int8"): + ( + n, + oc_chunk, + oh, + ow, + oc_block, + kh, + kw, + ic_outer, + ic_f_inner, + ic_s_inner, + ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) + + +@tvm.script.ir_module +class Conv2dNCHWcTIRModuleTiled: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) 
+ # body + # with T.block("root") + for i0, i1, i2, i3, i4_0, i5, i6, i7, i8, i9_0, i4_1, i9_1 in T.grid( + 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 + ): + with T.block("conv2d_NCHWc_int8"): + n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) + oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) + kh, kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) + ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) def test_tile_with_tensor_intrin_dense(intrin=VNNI_DOT_16x4_INTRIN): diff --git a/tests/python/unittest/tir_schedule_test_utils.py b/tests/python/unittest/tir_schedule_test_utils.py deleted file mode 100644 index e32321bc0f3b..000000000000 --- a/tests/python/unittest/tir_schedule_test_utils.py +++ /dev/null @@ -1,147 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -from tvm.script import tir as T - - -@tvm.script.ir_module -class DenseTIRModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2 in T.grid(1024, 1024, 1024): - with T.block("compute"): - i, j, k = T.axis.remap("SSR", [i0, i1, i2]) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) - - -@tvm.script.ir_module -class DenseTIRModuleTiled: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0, i1_0, i2_0, i1_1, i2_1 in T.grid(1024, 64, 256, 16, 4): - with T.block("compute"): - i = T.axis.spatial(1024, i0) - j = T.axis.spatial(1024, i1_0 * 16 + i1_1) - k = T.axis.reduce(1024, i2_0 * 4 + i2_1) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) - - -@tvm.script.ir_module -class Conv2dNCHWcTIRModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): - with T.block("conv2d_NCHWc_int8"): - ( - n, - oc_chunk, - oh, - ow, - oc_block, - kh, - kw, - ic_outer, - ic_f_inner, - ic_s_inner, - ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - "int32", - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) - - -@tvm.script.ir_module -class Conv2dNCHWcTIRModuleTiled: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0, i1, i2, i3, i4_0, i5, i6, i7, i8, i9_0, i4_1, i9_1 in T.grid( - 1, 16, 56, 56, 1, 1, 1, 4, 4, 1, 16, 4 - ): - with T.block("conv2d_NCHWc_int8"): - n, oc_chunk, oh, ow = T.axis.remap("SSSS", [i0, i1, i2, i3]) - oc_block = T.axis.spatial(16, i4_0 * 16 + i4_1) - kh, 
kw, ic_outer, ic_f_inner = T.axis.remap("RRRR", [i5, i6, i7, i8]) - ic_s_inner = T.axis.reduce(4, i9_0 * 4 + i9_1) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - "int32", - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) From 8255f1f9d20abe0a647b80c3555545481a0ae94f Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 8 Jan 2023 19:41:00 +0300 Subject: [PATCH 61/84] pylint fixes --- .../test_meta_schedule_trace_apply.py | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index f99929cbf5c7..f70956d3cb59 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -2578,50 +2578,32 @@ def apply_trace_16x4(sch, intrin): v59, v60, v61, v62 = sch.sample_perfect_tile( loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] ) - l63, l64, l65, l66 = sch.split( - loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True - ) + l63, l64, l65, l66 = sch.split(loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True) v67, v68, v69, v70 = sch.sample_perfect_tile( loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1] ) - l71, l72, l73, l74 = sch.split( - loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True - ) + l71, l72, l73, l74 = sch.split(loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True) v75, v76, v77, v78 = sch.sample_perfect_tile( loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1] ) - l79, l80, l81, l82 = sch.split( - loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True - ) + l79, l80, l81, l82 = sch.split(loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True) v83, v84, v85, v86 = sch.sample_perfect_tile( loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7] ) - l87, l88, l89, l90 = sch.split( - loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True - ) + l87, l88, l89, l90 = sch.split(loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True) v91, v92, v93, v94 = sch.sample_perfect_tile( loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] ) - l95, l96, l97, l98 = sch.split( - loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True - ) + l95, l96, l97, l98 = sch.split(loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True) v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1]) l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True) - v103, v104 = sch.sample_perfect_tile( - loop=l55, n=2, max_innermost_factor=64, decision=[1, 1] - ) + v103, v104 = sch.sample_perfect_tile(loop=l55, n=2, max_innermost_factor=64, decision=[1, 1]) l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True) - v107, v108 = sch.sample_perfect_tile( - loop=l56, n=2, max_innermost_factor=64, decision=[4, 8] - ) + v107, v108 = sch.sample_perfect_tile(loop=l56, n=2, 
max_innermost_factor=64, decision=[4, 8]) l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True) - v111, v112 = sch.sample_perfect_tile( - loop=l57, n=2, max_innermost_factor=64, decision=[4, 1] - ) + v111, v112 = sch.sample_perfect_tile(loop=l57, n=2, max_innermost_factor=64, decision=[4, 1]) l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True) - v115, v116 = sch.sample_perfect_tile( - loop=l58, n=2, max_innermost_factor=64, decision=[1, 1] - ) + v115, v116 = sch.sample_perfect_tile(loop=l58, n=2, max_innermost_factor=64, decision=[1, 1]) l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True) sch.reorder( l63, From fea930c0cfa7ff9a54dccadb947102304c027f6a Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 9 Jan 2023 08:37:22 +0300 Subject: [PATCH 62/84] separate TIRs for scheduled conv2d for vnni and avx512 --- .../test_meta_schedule_trace_apply.py | 104 +++++++++++++----- 1 file changed, 79 insertions(+), 25 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index f70956d3cb59..0432c2f23834 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -1136,32 +1136,70 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, def get_conv2d_16x4_mod(intrin): @tvm.script.ir_module - class Conv2dInt8_NCHWc_scheduled: + class Conv2dInt8_NCHWc_VNNI_scheduled: @T.prim_func def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None: - def calculate_with_intrin(A, B, C): - if intrin == "vnni": - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - B_i8x64: T.int8x64 = B[0, 0:64] - B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") - C_i32x16: T.int32x16 = C[0:16] - vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") - C[0:16] = T.call_llvm_pure_intrin(T.uint32(vnni_id), T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") - elif intrin == "avx512": - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - A_brdcst: T.int32x16 = T.broadcast(A_i32, 16) - A_u8x64: T.uint8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") - - B_i8x64: T.int8x64 = B[0, 0:64] - - avx512_id_1 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512") - Red: T.int16x32 = T.call_llvm_pure_intrin(avx512_id_1, T.uint32(2), A_u8x64, B_i8x64, dtype="int16x32") - - avx512_id_2 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512") - C[0:16] += T.call_llvm_pure_intrin(avx512_id_2, T.uint32(2), Red, T.int16x32(1), dtype="int32x16") + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + conv2d_NCHWc_int8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32") + for i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused in T.parallel(128, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): + for i2_1, i3_1, i4_0_1 in T.grid(7, 1, 1): + for i0_2_init, i1_2_init, i2_2_init, i3_2_init, i4_0_2_init, i0_3_init, i1_3_init, i2_3_init, i3_3_init, i4_0_3_init in T.grid(1, 1, 1, 1, 1, 1, 1, 1, 7, 1): + with 
T.block("conv2d_NCHWc_int8_o_init"): + n = T.axis.spatial(1, i0_3_init + i0_2_init) + oc_chunk = T.axis.spatial(128, i1_2_init + i1_3_init + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) + oh = T.axis.spatial(7, i2_1 + i2_2_init + i2_3_init) + ow = T.axis.spatial(7, i3_1 * 7 + i3_2_init * 7 + i3_3_init) + oc_block_o = T.axis.spatial(1, i4_0_3_init + i4_0_1 + i4_0_2_init) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) + for i4_1 in T.vectorized(16): + with T.block("conv2d_NCHWc_int8_init"): + oc_block_i_init = T.axis.spatial(16, i4_1) + T.reads() + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 + for i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 7, 1): + with T.block("conv2d_NCHWc_int8_o_update"): + n = T.axis.spatial(1, i0_3 + i0_2) + oc_chunk = T.axis.spatial(128, i1_2 + i1_3 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) + oh = T.axis.spatial(7, i2_1 + i2_2 + i2_3) + ow = T.axis.spatial(7, i3_1 * 7 + i3_2 * 7 + i3_3) + oc_block_o = T.axis.spatial(1, i4_0_3 + i4_0_1 + i4_0_2) + kh = T.axis.reduce(1, i5_0 + i5_1) + kw = T.axis.reduce(1, i6_1 + i6_0) + ic_outer = T.axis.reduce(32, i7_0 * 8 + i7_1) + ic_f_inner = T.axis.reduce(4, i8_1 + i8_0) + ic_s_inner_o = T.axis.reduce(1, i9_0_0 + i9_0_1) + T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) + A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) + B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1) + C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1) + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i8x64: T.int8x64 = B[0, 0:64] + B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") + C_i32x16: T.int32x16 = C[0:16] + vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") + C[0:16] = T.call_llvm_pure_intrin(T.uint32(vnni_id), T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): + for ax4_fused in T.vectorized(16): + with T.block("T_cast_8"): + ax0_1 = T.axis.spatial(1, ax0) + ax1_1 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32 + ax1) + ax2_1 = T.axis.spatial(7, i2_1 + ax2) + ax3_1, ax4 = T.axis.remap("SS", [ax3, ax4_fused]) + T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4], p2[ax0_1, ax1_1, 0, 0, ax4], p3[ax0_1, ax1_1, 0, 0, ax4], p4[0], p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) + T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = T.cast(T.max(T.min(T.cast(T.max(T.min(T.cast(T.floor(T.float32(0.95489668846130371) * (T.cast(T.cast(T.max(T.min(T.cast(T.floor(T.cast(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + p2[ax0_1, ax1_1, 0, 0, ax4], "float32") * p3[ax0_1, ax1_1, 0, 0, ax4] + T.float32(65.5), dtype="float32"), "int32"), 255), 0), 
"uint8"), "float32") - p4[0]) + T.float32(0.5), dtype="float32"), "int32") + T.cast(T.floor(T.float32(0.71245479583740234) * T.cast(p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4], "float32") + T.float32(0.5), dtype="float32"), "int32"), 255), 0), "uint8"), T.uint8(255)), T.uint8(0)), "int32") + @tvm.script.ir_module + class Conv2dInt8_NCHWc_AVX512_scheduled: + @T.prim_func + def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) # body @@ -1201,7 +1239,18 @@ def calculate_with_intrin(A, B, C): A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1) C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1) - calculate_with_intrin(A, B, C) + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + A_brdcst: T.int32x16 = T.broadcast(A_i32, 16) + A_u8x64: T.uint8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") + + B_i8x64: T.int8x64 = B[0, 0:64] + + avx512_id_1 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512") + Red: T.int16x32 = T.call_llvm_pure_intrin(avx512_id_1, T.uint32(2), A_u8x64, B_i8x64, dtype="int16x32") + + avx512_id_2 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512") + C[0:16] += T.call_llvm_pure_intrin(avx512_id_2, T.uint32(2), Red, T.int16x32(1), dtype="int32x16") for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): for ax4_fused in T.vectorized(16): with T.block("T_cast_8"): @@ -1213,7 +1262,12 @@ def calculate_with_intrin(A, B, C): T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = T.cast(T.max(T.min(T.cast(T.max(T.min(T.cast(T.floor(T.float32(0.95489668846130371) * (T.cast(T.cast(T.max(T.min(T.cast(T.floor(T.cast(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + p2[ax0_1, ax1_1, 0, 0, ax4], "float32") * p3[ax0_1, ax1_1, 0, 0, ax4] + T.float32(65.5), dtype="float32"), "int32"), 255), 0), "uint8"), "float32") - p4[0]) + T.float32(0.5), dtype="float32"), "int32") + T.cast(T.floor(T.float32(0.71245479583740234) * T.cast(p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4], "float32") + T.float32(0.5), dtype="float32"), "int32"), 255), 0), "uint8"), T.uint8(255)), T.uint8(0)), "int32") - return Conv2dInt8_NCHWc_scheduled + if intrin == "vnni": + return Conv2dInt8_NCHWc_VNNI_scheduled + elif intrin == "avx512": + return Conv2dInt8_NCHWc_AVX512_scheduled + else: + raise NotImplementedError("VNNI and AVX512 are supported only. 
\"", intrin, "\" is not supported") @tvm.script.ir_module From 9e5f6eefb627a0fcfadfb55d45584d8462cbf1b9 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Mon, 9 Jan 2023 08:46:10 +0300 Subject: [PATCH 63/84] fix registering issue in test --- tests/python/unittest/test_meta_schedule_16x4_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_16x4_integration.py b/tests/python/unittest/test_meta_schedule_16x4_integration.py index f5c11f14ec9a..e432cdd22268 100644 --- a/tests/python/unittest/test_meta_schedule_16x4_integration.py +++ b/tests/python/unittest/test_meta_schedule_16x4_integration.py @@ -206,7 +206,7 @@ def schedule_rule_dense_16x4(sch: Schedule, dense_block: BlockRV): _schedule_dense(m=None, do_tune=True, intrin=intrin)(sch, dense_block) return [sch] - register_func("meta_schedule.x86.dense_int8", schedule_rule_dense_16x4) + register_func("meta_schedule.x86.dense_int8", schedule_rule_dense_16x4, override=True) m, n, k = 1024, 1024, 1024 dev = tvm.cpu(0) From ea093b05f9a0ae4c2401b79cd3a35a384aef019f Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 10 Jan 2023 08:54:41 +0300 Subject: [PATCH 64/84] update conv+bias onnx model for intermediate test --- tests/python/relay/test_op_level2.py | 51 ++++++++++++++++++---------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index c36cf03c3911..d5079009d327 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -30,9 +30,6 @@ from tvm.relay.testing import run_infer_type from tvm.topi.cuda.conv3d_winograd import _infer_tile_size -# from onnx import helper, mapping -# import onnxruntime - executor_kind = tvm.testing.parameter("graph", "vm") @@ -2162,9 +2159,12 @@ def get_conv2d_nchw( out_dtype=out_dtype, ) - # def verify_by_ort(x_data, w_data, b_data, out): - # def get_onnx_model(x_shape, w_shape, out_shape): - # x_dtype = "int8" + # def verify_by_ort(x_data, w_data, b_data, data_dtype, out): + # from onnx import helper, mapping, TensorProto + # from onnxruntime import backend as ort_bk + + # def get_onnx_model(data_dtype, x_shape, w_shape, b_shape, out_shape): + # x_dtype = data_dtype # w_dtype = "int8" # b_dtype = "int32" # x_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(x_dtype)] @@ -2177,31 +2177,46 @@ def get_conv2d_nchw( # input_nodes = [ # helper.make_tensor_value_info("x", x_proto_type, list(x_shape)), # helper.make_tensor_value_info("w", w_proto_type, list(w_shape)), - # helper.make_tensor_value_info("b", b_proto_type, (w_shape[0],)), + # helper.make_tensor_value_info("B", b_proto_type, list(b_shape)), + # ] + # initializer = [ + # helper.make_tensor("x_scale", TensorProto.FLOAT, [], [1.]), + # helper.make_tensor("x_zero_point", x_proto_type, [], [0]), + # helper.make_tensor("w_scale", TensorProto.FLOAT, [], [1.]), + # helper.make_tensor("w_zero_point", w_proto_type, [], [0]), + # helper.make_tensor("y_scale", TensorProto.FLOAT, [], [1.]), + # helper.make_tensor("y_zero_point", y_proto_type, [], [0]), # ] # input_names = [ # "x", + # "x_scale", + # "x_zero_point", # "w", - # "b", + # "w_scale", + # "w_zero_point", + # "y_scale", + # "y_zero_point", + # "B" # ] - # node = helper.make_node( - # "Conv", + # node_conv = helper.make_node( + # "QLinearConv", # inputs=input_names, # outputs=["y"], # ) # graph = helper.make_graph( - # [node], - # "ort_conv2d_test", + # [node_conv], + # "ort_conv2d_int8_test", # 
inputs=input_nodes, + # initializer=initializer, # outputs=[helper.make_tensor_value_info("y", y_proto_type, list(out_shape))], # ) - # model = helper.make_model(graph, producer_name="ort_conv2d_test") + # model = helper.make_model(graph, producer_name="ort_conv2d_int8_test") # return model - # onnx_model = get_onnx_model(x_data.shape, w_data.shape, out.shape) - # ort_exec = onnxruntime.backend.prepare(onnx_model.SerializeToString(), "CPU") + # onnx_model = get_onnx_model(data_dtype, x_data.shape, w_data.shape, b_data.shape, out.shape) + # ort_exec = ort_bk.prepare(onnx_model.SerializeToString(), "CPU") # ort_out = ort_exec.run([x_data, w_data, b_data]) # # Unpack output if there's only a single value. # if len(ort_out) == 1: @@ -2256,8 +2271,10 @@ def get_conv2d_nchw( out = rt_mod.get_output(0).numpy() - # verify_by_ort(data_np, weight_np, bias_np, out) - # verify_by_ort(data_np, weight_np, bias_np, ref) + # print("COMPARE ORT and OUT") + # verify_by_ort(data_np, weight_np, bias_np, data_dtype, out) + # print("COMPARE ORT and REF") + # verify_by_ort(data_np, weight_np, bias_np, data_dtype, ref) np.testing.assert_equal(out, ref) From 293033d049b3ec8de4f38b3974839e38fd94914a Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 11 Jan 2023 17:19:57 +0300 Subject: [PATCH 65/84] fix int16 overflow --- tests/python/relay/test_op_level2.py | 73 +--------------------------- 1 file changed, 1 insertion(+), 72 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index d5079009d327..41910254a0ad 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2159,73 +2159,7 @@ def get_conv2d_nchw( out_dtype=out_dtype, ) - # def verify_by_ort(x_data, w_data, b_data, data_dtype, out): - # from onnx import helper, mapping, TensorProto - # from onnxruntime import backend as ort_bk - - # def get_onnx_model(data_dtype, x_shape, w_shape, b_shape, out_shape): - # x_dtype = data_dtype - # w_dtype = "int8" - # b_dtype = "int32" - # x_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(x_dtype)] - # w_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(w_dtype)] - # b_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(b_dtype)] - - # y_dtype = "int32" - # y_proto_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(y_dtype)] - - # input_nodes = [ - # helper.make_tensor_value_info("x", x_proto_type, list(x_shape)), - # helper.make_tensor_value_info("w", w_proto_type, list(w_shape)), - # helper.make_tensor_value_info("B", b_proto_type, list(b_shape)), - # ] - # initializer = [ - # helper.make_tensor("x_scale", TensorProto.FLOAT, [], [1.]), - # helper.make_tensor("x_zero_point", x_proto_type, [], [0]), - # helper.make_tensor("w_scale", TensorProto.FLOAT, [], [1.]), - # helper.make_tensor("w_zero_point", w_proto_type, [], [0]), - # helper.make_tensor("y_scale", TensorProto.FLOAT, [], [1.]), - # helper.make_tensor("y_zero_point", y_proto_type, [], [0]), - # ] - # input_names = [ - # "x", - # "x_scale", - # "x_zero_point", - # "w", - # "w_scale", - # "w_zero_point", - # "y_scale", - # "y_zero_point", - # "B" - # ] - - # node_conv = helper.make_node( - # "QLinearConv", - # inputs=input_names, - # outputs=["y"], - # ) - - # graph = helper.make_graph( - # [node_conv], - # "ort_conv2d_int8_test", - # inputs=input_nodes, - # initializer=initializer, - # outputs=[helper.make_tensor_value_info("y", y_proto_type, list(out_shape))], - # ) - # model = helper.make_model(graph, producer_name="ort_conv2d_int8_test") - # return model - - # 
onnx_model = get_onnx_model(data_dtype, x_data.shape, w_data.shape, b_data.shape, out.shape) - # ort_exec = ort_bk.prepare(onnx_model.SerializeToString(), "CPU") - # ort_out = ort_exec.run([x_data, w_data, b_data]) - # # Unpack output if there's only a single value. - # if len(ort_out) == 1: - # ort_out = ort_out[0] - # if len(out) == 1: - # out = out[0] - # np.testing.assert_equal(out, ort_out) - - I, O, H, W = 64, 64, 56, 56 + I, O, H, W = 1, 1, 56, 56 kH = kW = 3 data_shape = (1, I, H, W) @@ -2271,11 +2205,6 @@ def get_conv2d_nchw( out = rt_mod.get_output(0).numpy() - # print("COMPARE ORT and OUT") - # verify_by_ort(data_np, weight_np, bias_np, data_dtype, out) - # print("COMPARE ORT and REF") - # verify_by_ort(data_np, weight_np, bias_np, data_dtype, ref) - np.testing.assert_equal(out, ref) From 915fad708224f6ff05afa7200535c070a5b8d4cc Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 11 Jan 2023 17:28:59 +0300 Subject: [PATCH 66/84] fix int16 overflow for dense test --- .../unittest/test_meta_schedule_16x4_integration.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_16x4_integration.py b/tests/python/unittest/test_meta_schedule_16x4_integration.py index e432cdd22268..0d9a789f5377 100644 --- a/tests/python/unittest/test_meta_schedule_16x4_integration.py +++ b/tests/python/unittest/test_meta_schedule_16x4_integration.py @@ -136,8 +136,7 @@ def f_check(lib, dev): return relay_mod, params, f_check -def schedule_16x4_dense_fn_database(target, intrin): - m, n, k = 1024, 1024, 1024 +def schedule_16x4_dense_fn_database(target, intrin, m=1024, n=1024, k=1024): dev = tvm.cpu(0) relay_mod, params, f_check = _relay_dense(m, n, k) @@ -175,10 +174,10 @@ def test_vnni_schedule_fn_database(): @tvm.testing.requires_skylake_avx512 def test_avx512_schedule_fn_database(): target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-512 -num-cores=4") - schedule_16x4_dense_fn_database(target, AVX512_INTRIN) + schedule_16x4_dense_fn_database(target, AVX512_INTRIN, 16, 16, 16) -def schedule_16x4_dense_fn_tune(target, intrin): +def schedule_16x4_dense_fn_tune(target, intrin, m=1024, n=1024, k=1024): # pylint: disable=W0105 """ We can inject and apply a custom TIR scheduling to a TE compute of interest, using @@ -208,7 +207,6 @@ def schedule_rule_dense_16x4(sch: Schedule, dense_block: BlockRV): register_func("meta_schedule.x86.dense_int8", schedule_rule_dense_16x4, override=True) - m, n, k = 1024, 1024, 1024 dev = tvm.cpu(0) relay_mod, params, f_check = _relay_dense(m, n, k) @@ -266,7 +264,7 @@ def test_vnni_schedule_fn_tune(): @tvm.testing.requires_skylake_avx512 def test_avx512_schedule_fn_tune(): target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4") - schedule_16x4_dense_fn_tune(target, AVX512_INTRIN) + schedule_16x4_dense_fn_tune(target, AVX512_INTRIN, 16, 16, 16) if __name__ == """__main__""": From a289d4b982476ca4c864fd60e01a4abfd48d772e Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 11 Jan 2023 21:14:25 +0300 Subject: [PATCH 67/84] update input data for test of dense --- .../python/unittest/test_meta_schedule_16x4_integration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_16x4_integration.py b/tests/python/unittest/test_meta_schedule_16x4_integration.py index 0d9a789f5377..723072a2e6ef 100644 --- a/tests/python/unittest/test_meta_schedule_16x4_integration.py +++ 
b/tests/python/unittest/test_meta_schedule_16x4_integration.py @@ -110,10 +110,10 @@ def _relay_dense(m, n, k): out_dtype="int32", ) relay_mod = tvm.IRModule.from_expr(out) - data = np.random.uniform(1, 10, size=(m, k)).astype("uint8") + data = np.random.randint(0, 5, size=(m, k), dtype="uint8") params = { - "weight": np.random.uniform(1, 10, size=(n, k)).astype("int8"), - "bias": np.random.uniform(1, 10, size=(n,)).astype("int32"), + "weight": np.random.randint(0, 5, size=(n, k), dtype="int8"), + "bias": np.random.randint(0, 5, size=(n,), dtype="int32"), } def f_check(lib, dev): From e6ea691167411fc05ca88aaac8705cd7d34542a9 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 12 Jan 2023 11:34:08 +0300 Subject: [PATCH 68/84] small rollback --- tests/python/unittest/test_meta_schedule_trace_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index 0432c2f23834..335a6cd0d817 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -1184,7 +1184,7 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") C_i32x16: T.int32x16 = C[0:16] vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") - C[0:16] = T.call_llvm_pure_intrin(T.uint32(vnni_id), T.uint32(3), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + C[0:16] = T.call_llvm_pure_intrin(T.uint32(vnni_id), T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): for ax4_fused in T.vectorized(16): with T.block("T_cast_8"): From 300c66df62a5081a65e8683d1dc54cd449e83729 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 12 Jan 2023 15:55:14 +0300 Subject: [PATCH 69/84] fix misprinting --- tests/python/unittest/test_meta_schedule_16x4_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_16x4_integration.py b/tests/python/unittest/test_meta_schedule_16x4_integration.py index 723072a2e6ef..6dc72d69336f 100644 --- a/tests/python/unittest/test_meta_schedule_16x4_integration.py +++ b/tests/python/unittest/test_meta_schedule_16x4_integration.py @@ -173,7 +173,7 @@ def test_vnni_schedule_fn_database(): @tvm.testing.requires_skylake_avx512 def test_avx512_schedule_fn_database(): - target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-512 -num-cores=4") + target = tvm.target.Target("llvm -keys=x86,cpu -mcpu=skylake-avx512 -num-cores=4") schedule_16x4_dense_fn_database(target, AVX512_INTRIN, 16, 16, 16) From 59bf956c857f218e123d2b1ecb8bd990948b04a0 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Thu, 12 Jan 2023 18:12:20 +0300 Subject: [PATCH 70/84] fix --- tests/python/relay/test_op_level2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 41910254a0ad..f7cfc81fb2d3 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -2159,7 +2159,7 @@ def get_conv2d_nchw( out_dtype=out_dtype, ) - I, O, H, W = 1, 1, 56, 56 + I, O, H, W = 64, 64, 56, 56 kH = kW = 3 data_shape = (1, I, H, W) @@ -2168,16 +2168,16 @@ def get_conv2d_nchw( bias = relay.var("bias", shape=bias_shape, dtype="int32") bias_np = np.random.randint(low=-127, high=128, 
size=bias_shape).astype("int32") - weight_np = np.random.uniform(-128, 127, size=weight_shape).astype("int8") + weight_np = np.random.uniform(-32, 32, size=weight_shape).astype("int8") conv2d = get_conv2d_nchw(data_shape, weight_shape, data_dtype) bias_add = relay.add(conv2d, bias) mod = tvm.IRModule.from_expr(bias_add) if data_dtype == "uint8": - data_np = np.random.uniform(0, 255, size=data_shape).astype("uint8") + data_np = np.random.uniform(0, 64, size=data_shape).astype("uint8") else: - data_np = np.random.uniform(-128, 127, size=data_shape).astype("int8") + data_np = np.random.uniform(-32, 32, size=data_shape).astype("int8") params = {"weight": weight_np, "bias": bias_np} From c550370488ae34ddf253ac210001f595bed44c09 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Sun, 15 Jan 2023 08:16:31 +0300 Subject: [PATCH 71/84] restart CI From b69946a5ed211134a8981cea3b2f0499053b2701 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 09:24:24 +0300 Subject: [PATCH 72/84] DefaultVNNI was renamed to DefaultLLVM for mutator --- include/tvm/meta_schedule/mutator.h | 2 -- src/meta_schedule/mutator/mutator.cc | 2 -- src/meta_schedule/space_generator/space_generator.cc | 4 ++-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/tvm/meta_schedule/mutator.h b/include/tvm/meta_schedule/mutator.h index 498b2797ada5..1560c00f3907 100644 --- a/include/tvm/meta_schedule/mutator.h +++ b/include/tvm/meta_schedule/mutator.h @@ -131,8 +131,6 @@ class Mutator : public runtime::ObjectRef { FApply f_apply, FClone f_clone, FAsString f_as_string); /*! \brief Create default mutators for LLVM */ TVM_DLL static Map DefaultLLVM(); - /*! \brief Create default mutators for x86 VNNI */ - TVM_DLL static Map DefaultVNNI(); /*! \brief Create default mutators for CUDA */ TVM_DLL static Map DefaultCUDA(); /*! 
\brief Create default mutators for CUDA with TensorCore */ diff --git a/src/meta_schedule/mutator/mutator.cc b/src/meta_schedule/mutator/mutator.cc index 3cf43e11260e..ddc2d73590f9 100644 --- a/src/meta_schedule/mutator/mutator.cc +++ b/src/meta_schedule/mutator/mutator.cc @@ -59,8 +59,6 @@ Map Mutator::DefaultLLVM() { {Mutator::MutateParallel(/*max_jobs_per_core=*/16), FloatImm(DataType::Float(64), 0.02)}}; } -Map Mutator::DefaultVNNI() { return Mutator::DefaultLLVM(); } - Map Mutator::DefaultCUDA() { return Map{ {Mutator::MutateTileSize(), FloatImm(DataType::Float(64), 0.9)}, diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index b580a6ab4726..c0b692690e23 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -102,11 +102,11 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { } else if (kind == "vnni") { default_sch_rules = ScheduleRule::DefaultVNNI(); default_postprocs = Postproc::DefaultVNNI(); - default_mutator_probs = Mutator::DefaultVNNI(); + default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "avx512") { default_sch_rules = ScheduleRule::DefaultAVX512(); default_postprocs = Postproc::DefaultVNNI(); - default_mutator_probs = Mutator::DefaultVNNI(); + default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "c") { default_sch_rules = ScheduleRule::DefaultMicro(); default_postprocs = Postproc::DefaultMicro(); From 9c6054b21d0003a8a93dc06a864c1bdc589e4121 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 09:31:56 +0300 Subject: [PATCH 73/84] rename test file for the sake of clarity --- ..._16x4_integration.py => test_meta_schedule_cpu_dot_product.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/python/unittest/{test_meta_schedule_16x4_integration.py => test_meta_schedule_cpu_dot_product.py} (100%) diff --git a/tests/python/unittest/test_meta_schedule_16x4_integration.py b/tests/python/unittest/test_meta_schedule_cpu_dot_product.py similarity index 100% rename from tests/python/unittest/test_meta_schedule_16x4_integration.py rename to tests/python/unittest/test_meta_schedule_cpu_dot_product.py From 76d9affd8b0018fa1dc9d09592690c75ed645df9 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 09:39:44 +0300 Subject: [PATCH 74/84] DefaultVNNI was renamed to DefaultCPUTensorization for postproc --- include/tvm/meta_schedule/postproc.h | 4 ++-- src/meta_schedule/postproc/postproc.cc | 2 +- src/meta_schedule/space_generator/space_generator.cc | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/tvm/meta_schedule/postproc.h b/include/tvm/meta_schedule/postproc.h index 06fa086c4bca..85fb9003e87f 100644 --- a/include/tvm/meta_schedule/postproc.h +++ b/include/tvm/meta_schedule/postproc.h @@ -163,8 +163,8 @@ class Postproc : public runtime::ObjectRef { TVM_DLL static Postproc RewriteLayout(); /*! \brief Create default postprocessors for LLVM */ TVM_DLL static Array DefaultLLVM(); - /*! \brief Create default postprocessors for x86 VNNI */ - TVM_DLL static Array DefaultVNNI(); + /*! \brief Create default postprocessors for x86 (AVX512 and VNNI) */ + TVM_DLL static Array DefaultCPUTensorization(); /*! \brief Create default postprocessors for CUDA */ TVM_DLL static Array DefaultCUDA(); /*! 
\brief Create default postprocessors for CUDA with TensorCore */ diff --git a/src/meta_schedule/postproc/postproc.cc b/src/meta_schedule/postproc/postproc.cc index 7730e4372fa9..bcd0cef4dd69 100644 --- a/src/meta_schedule/postproc/postproc.cc +++ b/src/meta_schedule/postproc/postproc.cc @@ -59,7 +59,7 @@ Array Postproc::DefaultLLVM() { }; } -Array Postproc::DefaultVNNI() { +Array Postproc::DefaultCPUTensorization() { return Array{ Postproc::DisallowDynamicLoop(), Postproc::RewriteParallelVectorizeUnroll(), Postproc::RewriteReductionBlock(), Postproc::RewriteTensorize(/*vectorize_init_loop=*/true), diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index c0b692690e23..8d0bb00c201d 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -101,11 +101,11 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { default_mutator_probs = Mutator::DefaultHexagon(); } else if (kind == "vnni") { default_sch_rules = ScheduleRule::DefaultVNNI(); - default_postprocs = Postproc::DefaultVNNI(); + default_postprocs = Postproc::DefaultCPUTensorization(); default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "avx512") { default_sch_rules = ScheduleRule::DefaultAVX512(); - default_postprocs = Postproc::DefaultVNNI(); + default_postprocs = Postproc::DefaultCPUTensorization(); default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "c") { default_sch_rules = ScheduleRule::DefaultMicro(); From db7960e24086403a769ca22771983a9cedf60d0d Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 09:52:13 +0300 Subject: [PATCH 75/84] remove resolved TODO --- src/meta_schedule/space_generator/space_generator.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 8d0bb00c201d..0fffbd367be6 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -81,7 +81,6 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { Array default_sch_rules; Array default_postprocs; Map default_mutator_probs; - // TODO(vvchernov): check if need separated ScheduleRule, Postproc, Mutator // for target with skylake-avx512 if (kind == "llvm") { default_sch_rules = ScheduleRule::DefaultLLVM(); From b7d3f8b0871e65588a1b76799e12d4a8d8748101 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 10:24:07 +0300 Subject: [PATCH 76/84] DefaultVNNI and AVX512 for ScheduleRule were unified --- include/tvm/meta_schedule/schedule_rule.h | 6 +-- .../schedule_rule/schedule_rule.cc | 51 ++----------------- .../space_generator/space_generator.cc | 4 +- 3 files changed, 9 insertions(+), 52 deletions(-) diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 9b8d6c64ac1c..031a4543c2ef 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -290,10 +290,8 @@ class ScheduleRule : public runtime::ObjectRef { /*! \brief Create default schedule rules for LLVM */ TVM_DLL static Array DefaultLLVM(); - /*! \brief Create default schedule rules for x86 VNNI */ - TVM_DLL static Array DefaultVNNI(); - /*! \brief Create default schedule rules for x86 AVX512 */ - TVM_DLL static Array DefaultAVX512(); + /*! 
\brief Create default schedule rules for x86 (AVX512 and VNNI) */ + TVM_DLL static Array DefaultX86(const std::string& type); /*! \brief Create default schedule rules for CUDA */ TVM_DLL static Array DefaultCUDA(); /*! \brief Create default postprocessors for CUDA with TensorCore */ diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index 93bc3adf1b6f..85a951074c25 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -85,52 +85,11 @@ Array ScheduleRule::DefaultLLVM() { }; } -Array ScheduleRule::DefaultVNNI() { - return { - ScheduleRule::ApplyCustomRule(), - ScheduleRule::InlineConstantScalars(), - ScheduleRule::AutoInline( - /*into_producer=*/false, - /*into_consumer=*/true, - /*inline_const_tensor=*/true, - /*disallow_if_then_else=*/true, - /*require_injective=*/true, - /*require_ordered=*/true, - /*disallow_op=*/Array{"tir.exp"}), - ScheduleRule::AddRFactor( - /*max_jobs_per_core=*/16, - /*max_innermost_factor=*/Integer(64)), - ScheduleRule::MultiLevelTilingWithIntrin( - /*intrin_name=*/"dot_16x4_vnni", - /*structure=*/"SSRSRS", - /*tile_binds=*/NullOpt, - /*max_innermost_factor=*/Integer(64), - /*vector_load_lens=*/NullOpt, - /*reuse_read=*/NullOpt, - /*reuse_write=*/ - Map{{"req", String("may")}, - {"levels", Array{1, 2}}, - {"scope", String("global")}}), - ScheduleRule::MultiLevelTiling( - /*structure=*/"SSRSRS", - /*tile_binds=*/NullOpt, - /*max_innermost_factor=*/Integer(64), - /*vector_load_lens=*/NullOpt, - /*reuse_read=*/NullOpt, - /*reuse_write=*/ - Map{{"req", String("may")}, - {"levels", Array{1, 2}}, - {"scope", String("global")}}), - ScheduleRule::ParallelizeVectorizeUnroll( - /*max_jobs_per_core=*/16, - /*max_vectorize_extent=*/64, - /*unroll_max_steps=*/Array{0, 16, 64, 512}, - /*unroll_explicit=*/true), - ScheduleRule::RandomComputeLocation(), +Array DefaultX86(const String& type) { + const static Map intrins = { + {"vnni", "dot_16x4_vnni"}, + {"avx512", "dot_16x4_avx512"} }; -} - -Array ScheduleRule::DefaultAVX512() { return { ScheduleRule::ApplyCustomRule(), ScheduleRule::InlineConstantScalars(), @@ -146,7 +105,7 @@ Array ScheduleRule::DefaultAVX512() { /*max_jobs_per_core=*/16, /*max_innermost_factor=*/Integer(64)), ScheduleRule::MultiLevelTilingWithIntrin( - /*intrin_name=*/"dot_16x4_avx512", + /*intrin_name=*/intrins[type], /*structure=*/"SSRSRS", /*tile_binds=*/NullOpt, /*max_innermost_factor=*/Integer(64), diff --git a/src/meta_schedule/space_generator/space_generator.cc b/src/meta_schedule/space_generator/space_generator.cc index 0fffbd367be6..2ce8d8fa1103 100644 --- a/src/meta_schedule/space_generator/space_generator.cc +++ b/src/meta_schedule/space_generator/space_generator.cc @@ -99,11 +99,11 @@ void SpaceGeneratorNode::InitializeWithTuneContext(const TuneContext& context) { default_postprocs = Postproc::DefaultHexagon(); default_mutator_probs = Mutator::DefaultHexagon(); } else if (kind == "vnni") { - default_sch_rules = ScheduleRule::DefaultVNNI(); + default_sch_rules = ScheduleRule::DefaultX86("vnni"); default_postprocs = Postproc::DefaultCPUTensorization(); default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "avx512") { - default_sch_rules = ScheduleRule::DefaultAVX512(); + default_sch_rules = ScheduleRule::DefaultX86("avx512"); default_postprocs = Postproc::DefaultCPUTensorization(); default_mutator_probs = Mutator::DefaultLLVM(); } else if (kind == "c") { From 8c9e403d3e9274b501c51b82b85577dc19e55e3d Mon Sep 17 
00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 10:32:41 +0300 Subject: [PATCH 77/84] replace code to upstream with initial version --- .../unittest/test_tir_schedule_analysis.py | 130 +++++++++--------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/tests/python/unittest/test_tir_schedule_analysis.py b/tests/python/unittest/test_tir_schedule_analysis.py index b7d1fe2b1b52..38bd4bba1418 100644 --- a/tests/python/unittest/test_tir_schedule_analysis.py +++ b/tests/python/unittest/test_tir_schedule_analysis.py @@ -41,71 +41,6 @@ from tvm.te import create_prim_func -@tvm.script.ir_module -class DenseTIRModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1024, 1024), "uint8"], - placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], - compute: T.Buffer[(1024, 1024), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - with T.block("root"): - T.reads() - T.writes() - for i0, i1, i2 in T.grid(1024, 1024, 1024): - with T.block("compute"): - i, j, k = T.axis.remap("SSR", [i0, i1, i2]) - T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) - T.writes(compute[i, j]) - with T.init(): - compute[i, j] = 0 - compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( - placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" - ) - - -@tvm.script.ir_module -class Conv2dNCHWcTIRModule: - @T.prim_func - def main( - placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], - placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], - conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], - ) -> None: - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): - with T.block("conv2d_NCHWc_int8"): - ( - n, - oc_chunk, - oh, - ow, - oc_block, - kh, - kw, - ic_outer, - ic_f_inner, - ic_s_inner, - ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) - T.reads( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - ) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) - with T.init(): - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ - n, oc_chunk, oh, ow, oc_block - ] + T.cast( - placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], - "int32", - ) * T.cast( - placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], - "int32", - ) - - def _make_vars(*args: str) -> List[Var]: return [Var(arg, dtype="int32") for arg in args] @@ -210,6 +145,71 @@ def test_suggest_index_map_winograd(): assert inverse_index_map.is_equivalent_to(expected_inverse_index_map) +@tvm.script.ir_module +class DenseTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1024, 1024), "uint8"], + placeholder_1: T.Buffer[(64, 256, 16, 4), "int8"], + compute: T.Buffer[(1024, 1024), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + with T.block("root"): + T.reads() + T.writes() + for i0, i1, i2 in T.grid(1024, 1024, 1024): + with T.block("compute"): + i, j, k = T.axis.remap("SSR", [i0, i1, i2]) + T.reads(placeholder[i, k], placeholder_1[j // 16, k // 4, j % 16, k % 4]) + T.writes(compute[i, j]) + with T.init(): + compute[i, j] = 0 + compute[i, j] = compute[i, j] + T.cast(placeholder[i, k], "int32") * T.cast( + placeholder_1[j // 16, k // 4, j % 16, k % 4], "int32" + ) + + 
+@tvm.script.ir_module +class Conv2dNCHWcTIRModule: + @T.prim_func + def main( + placeholder: T.Buffer[(1, 4, 56, 56, 16), "uint8"], + placeholder_1: T.Buffer[(16, 4, 1, 1, 4, 16, 4), "int8"], + conv2d_NCHWc_int8: T.Buffer[(1, 16, 56, 56, 16), "int32"], + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + for i0, i1, i2, i3, i4, i5, i6, i7, i8, i9 in T.grid(1, 16, 56, 56, 16, 1, 1, 4, 4, 4): + with T.block("conv2d_NCHWc_int8"): + ( + n, + oc_chunk, + oh, + ow, + oc_block, + kh, + kw, + ic_outer, + ic_f_inner, + ic_s_inner, + ) = T.axis.remap("SSSSSRRRRR", [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]) + T.reads( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + ) + T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block]) + with T.init(): + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = 0 + conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block] = conv2d_NCHWc_int8[ + n, oc_chunk, oh, ow, oc_block + ] + T.cast( + placeholder[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 + ic_s_inner], + "int32", + ) * T.cast( + placeholder_1[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner], + "int32", + ) + + def collect_loops(prim_func): loops = [] From f8794c9c2de5266a878ad273fd6e0dedc6253e7f Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 10:41:33 +0300 Subject: [PATCH 78/84] fix arg type --- include/tvm/meta_schedule/schedule_rule.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tvm/meta_schedule/schedule_rule.h b/include/tvm/meta_schedule/schedule_rule.h index 031a4543c2ef..7995d1fceeb6 100644 --- a/include/tvm/meta_schedule/schedule_rule.h +++ b/include/tvm/meta_schedule/schedule_rule.h @@ -291,7 +291,7 @@ class ScheduleRule : public runtime::ObjectRef { /*! \brief Create default schedule rules for LLVM */ TVM_DLL static Array DefaultLLVM(); /*! \brief Create default schedule rules for x86 (AVX512 and VNNI) */ - TVM_DLL static Array DefaultX86(const std::string& type); + TVM_DLL static Array DefaultX86(const String& type); /*! \brief Create default schedule rules for CUDA */ TVM_DLL static Array DefaultCUDA(); /*! 
\brief Create default postprocessors for CUDA with TensorCore */ From cddd1a15d4c818b0462f55c653e6a5f7aacbc853 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 10:59:17 +0300 Subject: [PATCH 79/84] lint fix --- src/meta_schedule/schedule_rule/schedule_rule.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index 85a951074c25..eea837ccaccf 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -86,7 +86,7 @@ Array ScheduleRule::DefaultLLVM() { } Array DefaultX86(const String& type) { - const static Map intrins = { + static const Map intrins = { {"vnni", "dot_16x4_vnni"}, {"avx512", "dot_16x4_avx512"} }; From 84b780dc264d42c23415cba9e24aba3d84115e88 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 11:04:59 +0300 Subject: [PATCH 80/84] small fix --- src/meta_schedule/schedule_rule/schedule_rule.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index eea837ccaccf..53e515a48d30 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -85,7 +85,7 @@ Array ScheduleRule::DefaultLLVM() { }; } -Array DefaultX86(const String& type) { +Array ScheduleRule::DefaultX86(const String& type) { static const Map intrins = { {"vnni", "dot_16x4_vnni"}, {"avx512", "dot_16x4_avx512"} From 2d772f6f4d18cc3242a5ceb93e99861fabc34906 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 12:08:24 +0300 Subject: [PATCH 81/84] lint fix --- src/meta_schedule/schedule_rule/schedule_rule.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/meta_schedule/schedule_rule/schedule_rule.cc b/src/meta_schedule/schedule_rule/schedule_rule.cc index 53e515a48d30..e25f0b12210d 100644 --- a/src/meta_schedule/schedule_rule/schedule_rule.cc +++ b/src/meta_schedule/schedule_rule/schedule_rule.cc @@ -86,10 +86,8 @@ Array ScheduleRule::DefaultLLVM() { } Array ScheduleRule::DefaultX86(const String& type) { - static const Map intrins = { - {"vnni", "dot_16x4_vnni"}, - {"avx512", "dot_16x4_avx512"} - }; + static const Map intrins = {{"vnni", "dot_16x4_vnni"}, + {"avx512", "dot_16x4_avx512"}}; return { ScheduleRule::ApplyCustomRule(), ScheduleRule::InlineConstantScalars(), From d2343ab3f731b96f223ffce558b3720724b8ec2d Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 14:13:19 +0300 Subject: [PATCH 82/84] fix misprinting --- tests/python/unittest/test_meta_schedule_trace_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index 335a6cd0d817..a12194d3adc7 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -2799,7 +2799,7 @@ def apply_trace(sch): ) -def test_conv2d_int8_vavx512(): +def test_conv2d_int8_avx512(): def apply_trace(sch): return apply_trace_16x4(sch, AVX512_INTRIN) From 18c361056b8725c6d0c0268e8d61941469de16a4 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 14:28:19 +0300 Subject: [PATCH 83/84] rollback trace apply test for avx512 (reviewer remark) --- .../test_meta_schedule_trace_apply.py | 515 ++++++++---------- 1 file changed, 213 
insertions(+), 302 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index a12194d3adc7..79e362a628b4 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -26,7 +26,6 @@ from tvm.target.codegen import llvm_lookup_intrinsic_id from tvm.tir.tensor_intrin.x86 import VNNI_DOT_16x4_INTRIN as VNNI_INTRIN -from tvm.tir.tensor_intrin.x86 import AVX512_DOT_16x4_INTRIN as AVX512_INTRIN # fmt: off @@ -1134,9 +1133,9 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, T_cast[ax0, ax1, ax2, ax3, ax4] = T.cast(compute_2[ax0, ax1, ax2, ax3, ax4], "int32") -def get_conv2d_16x4_mod(intrin): +def get_conv2d_vnni_mod(intrin_id): @tvm.script.ir_module - class Conv2dInt8_NCHWc_VNNI_scheduled: + class Conv2dInt8_NCHWc_scheduled: @T.prim_func def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None: # function attr dict @@ -1183,8 +1182,7 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, B_i8x64: T.int8x64 = B[0, 0:64] B_i32x16: T.int32x16 = T.reinterpret(B_i8x64, dtype="int32x16") C_i32x16: T.int32x16 = C[0:16] - vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") - C[0:16] = T.call_llvm_pure_intrin(T.uint32(vnni_id), T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") + C[0:16] = T.call_llvm_pure_intrin(T.uint32(intrin_id), T.uint32(0), C_i32x16, T.broadcast(A_i32, 16), B_i32x16, dtype="int32x16") for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): for ax4_fused in T.vectorized(16): with T.block("T_cast_8"): @@ -1196,78 +1194,7 @@ def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = T.cast(T.max(T.min(T.cast(T.max(T.min(T.cast(T.floor(T.float32(0.95489668846130371) * (T.cast(T.cast(T.max(T.min(T.cast(T.floor(T.cast(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + p2[ax0_1, ax1_1, 0, 0, ax4], "float32") * p3[ax0_1, ax1_1, 0, 0, ax4] + T.float32(65.5), dtype="float32"), "int32"), 255), 0), "uint8"), "float32") - p4[0]) + T.float32(0.5), dtype="float32"), "int32") + T.cast(T.floor(T.float32(0.71245479583740234) * T.cast(p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4], "float32") + T.float32(0.5), dtype="float32"), "int32"), 255), 0), "uint8"), T.uint8(255)), T.uint8(0)), "int32") - @tvm.script.ir_module - class Conv2dInt8_NCHWc_AVX512_scheduled: - @T.prim_func - def main(p0: T.Buffer[(1, 32, 7, 7, 16), "uint8"], p1: T.Buffer[(128, 32, 1, 1, 4, 16, 4), "int8"], p2: T.Buffer[(1, 128, 1, 1, 16), "int32"], p3: T.Buffer[(1, 128, 1, 1, 16), "float32"], p4: T.Buffer[1, "float32"], p5: T.Buffer[(1, 128, 7, 7, 16), "uint8"], T_cast: T.Buffer[(1, 128, 7, 7, 16), "int32"]) -> None: - # function attr dict - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - conv2d_NCHWc_int8 = T.alloc_buffer([1, 128, 7, 7, 16], dtype="int32") - for i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused in T.parallel(128, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): - for i2_1, i3_1, i4_0_1 in T.grid(7, 1, 1): - for i0_2_init, i1_2_init, i2_2_init, 
i3_2_init, i4_0_2_init, i0_3_init, i1_3_init, i2_3_init, i3_3_init, i4_0_3_init in T.grid(1, 1, 1, 1, 1, 1, 1, 1, 7, 1): - with T.block("conv2d_NCHWc_int8_o_init"): - n = T.axis.spatial(1, i0_3_init + i0_2_init) - oc_chunk = T.axis.spatial(128, i1_2_init + i1_3_init + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) - oh = T.axis.spatial(7, i2_1 + i2_2_init + i2_3_init) - ow = T.axis.spatial(7, i3_1 * 7 + i3_2_init * 7 + i3_3_init) - oc_block_o = T.axis.spatial(1, i4_0_3_init + i4_0_1 + i4_0_2_init) - T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) - for i4_1 in T.vectorized(16): - with T.block("conv2d_NCHWc_int8_init"): - oc_block_i_init = T.axis.spatial(16, i4_1) - T.reads() - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init]) - conv2d_NCHWc_int8[n, oc_chunk, oh, ow, oc_block_i_init] = 0 - for i5_0, i6_0, i7_0, i8_0, i9_0_0, i0_2, i1_2, i2_2, i3_2, i4_0_2, i5_1, i6_1, i7_1, i8_1, i9_0_1, i0_3, i1_3, i2_3, i3_3, i4_0_3 in T.grid(1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 7, 1): - with T.block("conv2d_NCHWc_int8_o_update"): - n = T.axis.spatial(1, i0_3 + i0_2) - oc_chunk = T.axis.spatial(128, i1_2 + i1_3 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32) - oh = T.axis.spatial(7, i2_1 + i2_2 + i2_3) - ow = T.axis.spatial(7, i3_1 * 7 + i3_2 * 7 + i3_3) - oc_block_o = T.axis.spatial(1, i4_0_3 + i4_0_1 + i4_0_2) - kh = T.axis.reduce(1, i5_0 + i5_1) - kw = T.axis.reduce(1, i6_1 + i6_0) - ic_outer = T.axis.reduce(32, i7_0 * 8 + i7_1) - ic_f_inner = T.axis.reduce(4, i8_1 + i8_0) - ic_s_inner_o = T.axis.reduce(1, i9_0_0 + i9_0_1) - T.reads(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4]) - T.writes(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16]) - A = T.match_buffer(p0[n, ic_outer, oh + kh, ow + kw, ic_f_inner * 4 : ic_f_inner * 4 + 4], [4], dtype="uint8", offset_factor=1) - B = T.match_buffer(p1[oc_chunk, ic_outer, kh, kw, ic_f_inner, 0 : 16, 0 : 4], [16, 4], dtype="int8", offset_factor=1) - C = T.match_buffer(conv2d_NCHWc_int8[n, oc_chunk, oh, ow, 0 : 16], [16], dtype="int32", offset_factor=1) - A_u8x4: T.uint8x4 = A[0:4] - A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") - A_brdcst: T.int32x16 = T.broadcast(A_i32, 16) - A_u8x64: T.uint8x64 = T.reinterpret(A_brdcst, dtype="uint8x64") - - B_i8x64: T.int8x64 = B[0, 0:64] - - avx512_id_1 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddubs.w.512") - Red: T.int16x32 = T.call_llvm_pure_intrin(avx512_id_1, T.uint32(2), A_u8x64, B_i8x64, dtype="int16x32") - - avx512_id_2 = T.llvm_lookup_intrinsic_id("llvm.x86.avx512.pmaddw.d.512") - C[0:16] += T.call_llvm_pure_intrin(avx512_id_2, T.uint32(2), Red, T.int16x32(1), dtype="int32x16") - for ax0, ax1, ax2, ax3 in T.grid(1, 1, 1, 7): - for ax4_fused in T.vectorized(16): - with T.block("T_cast_8"): - ax0_1 = T.axis.spatial(1, ax0) - ax1_1 = T.axis.spatial(128, i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused // 32 * 32 + i0_0_i1_0_i2_0_i3_0_i4_0_0_i0_1_i1_1_fused % 32 + ax1) - ax2_1 = T.axis.spatial(7, i2_1 + ax2) - ax3_1, ax4 = T.axis.remap("SS", [ax3, ax4_fused]) - T.reads(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4], p2[ax0_1, ax1_1, 0, 0, ax4], p3[ax0_1, ax1_1, 0, 0, ax4], p4[0], p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T.writes(T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4]) - T_cast[ax0_1, ax1_1, ax2_1, ax3_1, ax4] = 
T.cast(T.max(T.min(T.cast(T.max(T.min(T.cast(T.floor(T.float32(0.95489668846130371) * (T.cast(T.cast(T.max(T.min(T.cast(T.floor(T.cast(conv2d_NCHWc_int8[ax0_1, ax1_1, ax2_1, ax3_1, ax4] + p2[ax0_1, ax1_1, 0, 0, ax4], "float32") * p3[ax0_1, ax1_1, 0, 0, ax4] + T.float32(65.5), dtype="float32"), "int32"), 255), 0), "uint8"), "float32") - p4[0]) + T.float32(0.5), dtype="float32"), "int32") + T.cast(T.floor(T.float32(0.71245479583740234) * T.cast(p5[ax0_1, ax1_1, ax2_1, ax3_1, ax4], "float32") + T.float32(0.5), dtype="float32"), "int32"), 255), 0), "uint8"), T.uint8(255)), T.uint8(0)), "int32") - - if intrin == "vnni": - return Conv2dInt8_NCHWc_VNNI_scheduled - elif intrin == "avx512": - return Conv2dInt8_NCHWc_AVX512_scheduled - else: - raise NotImplementedError("VNNI and AVX512 are supported only. \"", intrin, "\" is not supported") + return Conv2dInt8_NCHWc_scheduled @tvm.script.ir_module @@ -2577,238 +2504,222 @@ def apply_trace(sch): verify(Conv2dInt8, apply_trace, Conv2dInt8_target, "cuda", Conv2dInt8_tensorcore_scheduled) -def apply_trace_16x4(sch, intrin): - b0 = sch.get_block(name="compile_engine_const", func_name="main") - b1 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") - b2 = sch.get_block(name="T_add", func_name="main") - b3 = sch.get_block(name="T_cast", func_name="main") - b4 = sch.get_block(name="T_multiply", func_name="main") - b5 = sch.get_block(name="compile_engine_const_1", func_name="main") - b6 = sch.get_block(name="T_add_1", func_name="main") - b7 = sch.get_block(name="T_floor", func_name="main") - b8 = sch.get_block(name="T_cast_1", func_name="main") - b9 = sch.get_block(name="compute", func_name="main") - b10 = sch.get_block(name="T_cast_2", func_name="main") - b11 = sch.get_block(name="T_cast_3", func_name="main") - b12 = sch.get_block(name="T_subtract", func_name="main") - b13 = sch.get_block(name="T_multiply_1", func_name="main") - b14 = sch.get_block(name="compile_engine_const_2", func_name="main") - b15 = sch.get_block(name="T_add_2", func_name="main") - b16 = sch.get_block(name="T_floor_1", func_name="main") - b17 = sch.get_block(name="T_cast_4", func_name="main") - b18 = sch.get_block(name="T_add_3", func_name="main") - b19 = sch.get_block(name="compute_1", func_name="main") - b20 = sch.get_block(name="T_cast_5", func_name="main") - b21 = sch.get_block(name="root", func_name="main") - sch.compute_inline(block=b20) - sch.compute_inline(block=b19) - sch.compute_inline(block=b18) - sch.compute_inline(block=b17) - sch.compute_inline(block=b16) - sch.compute_inline(block=b15) - sch.compute_inline(block=b14) - sch.compute_inline(block=b13) - sch.compute_inline(block=b12) - sch.compute_inline(block=b11) - sch.compute_inline(block=b10) - sch.compute_inline(block=b9) - sch.compute_inline(block=b8) - sch.compute_inline(block=b7) - sch.compute_inline(block=b6) - sch.compute_inline(block=b5) - sch.compute_inline(block=b4) - sch.compute_inline(block=b3) - sch.compute_inline(block=b2) - sch.compute_inline(block=b0) - sch.annotate(block_or_loop=b1, ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") - l22, l23, l24, l25, l26, l27, l28, l29, l30, l31 = sch.get_loops(block=b1) - l32, l33 = sch.split(loop=l31, factors=[None, 4], preserve_unit_iters=True) - l34, l35 = sch.split(loop=l26, factors=[None, 16], preserve_unit_iters=True) - l36, l37, l38, l39, l40, l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b1) - sch.reorder(l42, l43, l44, l45, l46, l35, l33) - b48 = sch.blockize(loop=l35) - sch.annotate(block_or_loop=b48, 
ann_key="meta_schedule.auto_tensorize", ann_val=intrin) - l49, l50, l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b48) - v59, v60, v61, v62 = sch.sample_perfect_tile( - loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] - ) - l63, l64, l65, l66 = sch.split(loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True) - v67, v68, v69, v70 = sch.sample_perfect_tile( - loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1] - ) - l71, l72, l73, l74 = sch.split(loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True) - v75, v76, v77, v78 = sch.sample_perfect_tile( - loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1] - ) - l79, l80, l81, l82 = sch.split(loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True) - v83, v84, v85, v86 = sch.sample_perfect_tile( - loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7] - ) - l87, l88, l89, l90 = sch.split(loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True) - v91, v92, v93, v94 = sch.sample_perfect_tile( - loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] - ) - l95, l96, l97, l98 = sch.split(loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True) - v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1]) - l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True) - v103, v104 = sch.sample_perfect_tile(loop=l55, n=2, max_innermost_factor=64, decision=[1, 1]) - l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True) - v107, v108 = sch.sample_perfect_tile(loop=l56, n=2, max_innermost_factor=64, decision=[4, 8]) - l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True) - v111, v112 = sch.sample_perfect_tile(loop=l57, n=2, max_innermost_factor=64, decision=[4, 1]) - l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True) - v115, v116 = sch.sample_perfect_tile(loop=l58, n=2, max_innermost_factor=64, decision=[1, 1]) - l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True) - sch.reorder( - l63, - l71, - l79, - l87, - l95, - l64, - l72, - l80, - l88, - l96, - l101, - l105, - l109, - l113, - l117, - l65, - l73, - l81, - l89, - l97, - l102, - l106, - l110, - l114, - l118, - l66, - l74, - l82, - l90, - l98, - ) - (b119,) = sch.get_consumers(block=b48) - sch.reverse_compute_at(block=b119, loop=l96, preserve_unit_loops=True, index=-1) - sch.annotate(block_or_loop=b21, ann_key="meta_schedule.parallel", ann_val=96) - sch.annotate(block_or_loop=b21, ann_key="meta_schedule.vectorize", ann_val=64) - v120 = sch.sample_categorical( - candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=2 - ) - sch.annotate(block_or_loop=b21, ann_key="meta_schedule.unroll_explicit", ann_val=v120) - sch.enter_postproc() - b121 = sch.get_block(name="root", func_name="main") - sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.parallel") - sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.vectorize") - sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") - b122, b123 = sch.get_child_blocks(b121) - ( - l124, - l125, - l126, - l127, - l128, - l129, - l130, - l131, - l132, - l133, - l134, - l135, - l136, - l137, - l138, - l139, - l140, - l141, - l142, - l143, - l144, - l145, - l146, - l147, - l148, - l149, - l150, - l151, - l152, - l153, - ) = sch.get_loops(block=b122) - l154 = sch.fuse(l124, l125, l126, l127, l128, l129, l130, preserve_unit_iters=True) - sch.parallel(loop=l154) 
- sch.annotate(block_or_loop=l154, ann_key="pragma_auto_unroll_max_step", ann_val=64) - sch.annotate(block_or_loop=l154, ann_key="pragma_unroll_explicit", ann_val=1) - l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b123) - l164 = sch.fuse(l163, preserve_unit_iters=True) - sch.vectorize(loop=l164) - sch.annotate(block_or_loop=l155, ann_key="pragma_auto_unroll_max_step", ann_val=64) - sch.annotate(block_or_loop=l155, ann_key="pragma_unroll_explicit", ann_val=1) - b165 = sch.get_block(name="conv2d_NCHWc_int8_o", func_name="main") - ( - l166, - l167, - l168, - l169, - l170, - l171, - l172, - l173, - l174, - l175, - l176, - l177, - l178, - l179, - l180, - l181, - l182, - l183, - l184, - l185, - l186, - l187, - l188, - l189, - ) = sch.get_loops(block=b165) - b190 = sch.decompose_reduction(block=b165, loop=l170) - sch.unannotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize") - sch.annotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize", ann_val="") - b191 = sch.get_block(name="conv2d_NCHWc_int8_o_init", func_name="main") - sch.unannotate(block_or_loop=b191, ann_key="meta_schedule.auto_tensorize") - (b192,) = sch.get_child_blocks(b191) - (l193,) = sch.get_loops(block=b192) - sch.vectorize(loop=l193) - b194 = sch.get_block(name="conv2d_NCHWc_int8_o_update", func_name="main") - sch.unannotate(block_or_loop=b194, ann_key="meta_schedule.auto_tensorize") - sch.tensorize(block_or_loop=b194, tensor_intrin=intrin) - - def test_conv2d_int8_vnni(): def apply_trace(sch): - return apply_trace_16x4(sch, VNNI_INTRIN) - + b0 = sch.get_block(name="compile_engine_const", func_name="main") + b1 = sch.get_block(name="conv2d_NCHWc_int8", func_name="main") + b2 = sch.get_block(name="T_add", func_name="main") + b3 = sch.get_block(name="T_cast", func_name="main") + b4 = sch.get_block(name="T_multiply", func_name="main") + b5 = sch.get_block(name="compile_engine_const_1", func_name="main") + b6 = sch.get_block(name="T_add_1", func_name="main") + b7 = sch.get_block(name="T_floor", func_name="main") + b8 = sch.get_block(name="T_cast_1", func_name="main") + b9 = sch.get_block(name="compute", func_name="main") + b10 = sch.get_block(name="T_cast_2", func_name="main") + b11 = sch.get_block(name="T_cast_3", func_name="main") + b12 = sch.get_block(name="T_subtract", func_name="main") + b13 = sch.get_block(name="T_multiply_1", func_name="main") + b14 = sch.get_block(name="compile_engine_const_2", func_name="main") + b15 = sch.get_block(name="T_add_2", func_name="main") + b16 = sch.get_block(name="T_floor_1", func_name="main") + b17 = sch.get_block(name="T_cast_4", func_name="main") + b18 = sch.get_block(name="T_add_3", func_name="main") + b19 = sch.get_block(name="compute_1", func_name="main") + b20 = sch.get_block(name="T_cast_5", func_name="main") + b21 = sch.get_block(name="root", func_name="main") + sch.compute_inline(block=b20) + sch.compute_inline(block=b19) + sch.compute_inline(block=b18) + sch.compute_inline(block=b17) + sch.compute_inline(block=b16) + sch.compute_inline(block=b15) + sch.compute_inline(block=b14) + sch.compute_inline(block=b13) + sch.compute_inline(block=b12) + sch.compute_inline(block=b11) + sch.compute_inline(block=b10) + sch.compute_inline(block=b9) + sch.compute_inline(block=b8) + sch.compute_inline(block=b7) + sch.compute_inline(block=b6) + sch.compute_inline(block=b5) + sch.compute_inline(block=b4) + sch.compute_inline(block=b3) + sch.compute_inline(block=b2) + sch.compute_inline(block=b0) + sch.annotate(block_or_loop=b1, 
ann_key="meta_schedule.tiling_structure", ann_val="SSRSRS") + l22, l23, l24, l25, l26, l27, l28, l29, l30, l31 = sch.get_loops(block=b1) + l32, l33 = sch.split(loop=l31, factors=[None, 4], preserve_unit_iters=True) + l34, l35 = sch.split(loop=l26, factors=[None, 16], preserve_unit_iters=True) + l36, l37, l38, l39, l40, l41, l42, l43, l44, l45, l46, l47 = sch.get_loops(block=b1) + sch.reorder(l42, l43, l44, l45, l46, l35, l33) + b48 = sch.blockize(loop=l35) + sch.annotate(block_or_loop=b48, ann_key="meta_schedule.auto_tensorize", ann_val=VNNI_INTRIN) + l49, l50, l51, l52, l53, l54, l55, l56, l57, l58 = sch.get_loops(block=b48) + v59, v60, v61, v62 = sch.sample_perfect_tile( + loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] + ) + l63, l64, l65, l66 = sch.split(loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True) + v67, v68, v69, v70 = sch.sample_perfect_tile( + loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1] + ) + l71, l72, l73, l74 = sch.split(loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True) + v75, v76, v77, v78 = sch.sample_perfect_tile( + loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1] + ) + l79, l80, l81, l82 = sch.split(loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True) + v83, v84, v85, v86 = sch.sample_perfect_tile( + loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7] + ) + l87, l88, l89, l90 = sch.split(loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True) + v91, v92, v93, v94 = sch.sample_perfect_tile( + loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] + ) + l95, l96, l97, l98 = sch.split(loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True) + v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1]) + l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True) + v103, v104 = sch.sample_perfect_tile(loop=l55, n=2, max_innermost_factor=64, decision=[1, 1]) + l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True) + v107, v108 = sch.sample_perfect_tile(loop=l56, n=2, max_innermost_factor=64, decision=[4, 8]) + l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True) + v111, v112 = sch.sample_perfect_tile(loop=l57, n=2, max_innermost_factor=64, decision=[4, 1]) + l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True) + v115, v116 = sch.sample_perfect_tile(loop=l58, n=2, max_innermost_factor=64, decision=[1, 1]) + l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True) + sch.reorder( + l63, + l71, + l79, + l87, + l95, + l64, + l72, + l80, + l88, + l96, + l101, + l105, + l109, + l113, + l117, + l65, + l73, + l81, + l89, + l97, + l102, + l106, + l110, + l114, + l118, + l66, + l74, + l82, + l90, + l98, + ) + (b119,) = sch.get_consumers(block=b48) + sch.reverse_compute_at(block=b119, loop=l96, preserve_unit_loops=True, index=-1) + sch.annotate(block_or_loop=b21, ann_key="meta_schedule.parallel", ann_val=96) + sch.annotate(block_or_loop=b21, ann_key="meta_schedule.vectorize", ann_val=64) + v120 = sch.sample_categorical( + candidates=[0, 16, 64, 512], probs=[0.25, 0.25, 0.25, 0.25], decision=2 + ) + sch.annotate(block_or_loop=b21, ann_key="meta_schedule.unroll_explicit", ann_val=v120) + sch.enter_postproc() + b121 = sch.get_block(name="root", func_name="main") + sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.parallel") + sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.vectorize") + 
sch.unannotate(block_or_loop=b121, ann_key="meta_schedule.unroll_explicit") + b122, b123 = sch.get_child_blocks(b121) + ( + l124, + l125, + l126, + l127, + l128, + l129, + l130, + l131, + l132, + l133, + l134, + l135, + l136, + l137, + l138, + l139, + l140, + l141, + l142, + l143, + l144, + l145, + l146, + l147, + l148, + l149, + l150, + l151, + l152, + l153, + ) = sch.get_loops(block=b122) + l154 = sch.fuse(l124, l125, l126, l127, l128, l129, l130, preserve_unit_iters=True) + sch.parallel(loop=l154) + sch.annotate(block_or_loop=l154, ann_key="pragma_auto_unroll_max_step", ann_val=64) + sch.annotate(block_or_loop=l154, ann_key="pragma_unroll_explicit", ann_val=1) + l155, l156, l157, l158, l159, l160, l161, l162, l163 = sch.get_loops(block=b123) + l164 = sch.fuse(l163, preserve_unit_iters=True) + sch.vectorize(loop=l164) + sch.annotate(block_or_loop=l155, ann_key="pragma_auto_unroll_max_step", ann_val=64) + sch.annotate(block_or_loop=l155, ann_key="pragma_unroll_explicit", ann_val=1) + b165 = sch.get_block(name="conv2d_NCHWc_int8_o", func_name="main") + ( + l166, + l167, + l168, + l169, + l170, + l171, + l172, + l173, + l174, + l175, + l176, + l177, + l178, + l179, + l180, + l181, + l182, + l183, + l184, + l185, + l186, + l187, + l188, + l189, + ) = sch.get_loops(block=b165) + b190 = sch.decompose_reduction(block=b165, loop=l170) + sch.unannotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize") + sch.annotate(block_or_loop=b190, ann_key="meta_schedule.auto_tensorize", ann_val="") + b191 = sch.get_block(name="conv2d_NCHWc_int8_o_init", func_name="main") + sch.unannotate(block_or_loop=b191, ann_key="meta_schedule.auto_tensorize") + (b192,) = sch.get_child_blocks(b191) + (l193,) = sch.get_loops(block=b192) + sch.vectorize(loop=l193) + b194 = sch.get_block(name="conv2d_NCHWc_int8_o_update", func_name="main") + sch.unannotate(block_or_loop=b194, ann_key="meta_schedule.auto_tensorize") + sch.tensorize(block_or_loop=b194, tensor_intrin=VNNI_INTRIN) + + vnni_id = llvm_lookup_intrinsic_id("llvm.x86.avx512.vpdpbusd.512") verify( Conv2dInt8_NCHWc, apply_trace, Conv2dInt8_NCHWc_target, "llvm -mcpu=cascadelake", - get_conv2d_16x4_mod("vnni"), - ) - - -def test_conv2d_int8_avx512(): - def apply_trace(sch): - return apply_trace_16x4(sch, AVX512_INTRIN) - - verify( - Conv2dInt8_NCHWc, - apply_trace, - Conv2dInt8_NCHWc_target, - "llvm -mcpu=skylake-avx512", - get_conv2d_16x4_mod("avx512"), + get_conv2d_vnni_mod(vnni_id), ) From c06ee201999ea26b87647d907508a253415fd498 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Tue, 17 Jan 2023 14:54:15 +0300 Subject: [PATCH 84/84] fix pylint --- .../test_meta_schedule_trace_apply.py | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_trace_apply.py b/tests/python/unittest/test_meta_schedule_trace_apply.py index 79e362a628b4..43b9eb8bbb19 100644 --- a/tests/python/unittest/test_meta_schedule_trace_apply.py +++ b/tests/python/unittest/test_meta_schedule_trace_apply.py @@ -2560,32 +2560,50 @@ def apply_trace(sch): v59, v60, v61, v62 = sch.sample_perfect_tile( loop=l49, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] ) - l63, l64, l65, l66 = sch.split(loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True) + l63, l64, l65, l66 = sch.split( + loop=l49, factors=[v59, v60, v61, v62], preserve_unit_iters=True + ) v67, v68, v69, v70 = sch.sample_perfect_tile( loop=l50, n=4, max_innermost_factor=64, decision=[4, 32, 1, 1] ) - l71, l72, l73, l74 = sch.split(loop=l50, 
factors=[v67, v68, v69, v70], preserve_unit_iters=True) + l71, l72, l73, l74 = sch.split( + loop=l50, factors=[v67, v68, v69, v70], preserve_unit_iters=True + ) v75, v76, v77, v78 = sch.sample_perfect_tile( loop=l51, n=4, max_innermost_factor=64, decision=[1, 7, 1, 1] ) - l79, l80, l81, l82 = sch.split(loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True) + l79, l80, l81, l82 = sch.split( + loop=l51, factors=[v75, v76, v77, v78], preserve_unit_iters=True + ) v83, v84, v85, v86 = sch.sample_perfect_tile( loop=l52, n=4, max_innermost_factor=64, decision=[1, 1, 1, 7] ) - l87, l88, l89, l90 = sch.split(loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True) + l87, l88, l89, l90 = sch.split( + loop=l52, factors=[v83, v84, v85, v86], preserve_unit_iters=True + ) v91, v92, v93, v94 = sch.sample_perfect_tile( loop=l53, n=4, max_innermost_factor=64, decision=[1, 1, 1, 1] ) - l95, l96, l97, l98 = sch.split(loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True) + l95, l96, l97, l98 = sch.split( + loop=l53, factors=[v91, v92, v93, v94], preserve_unit_iters=True + ) v99, v100 = sch.sample_perfect_tile(loop=l54, n=2, max_innermost_factor=64, decision=[1, 1]) l101, l102 = sch.split(loop=l54, factors=[v99, v100], preserve_unit_iters=True) - v103, v104 = sch.sample_perfect_tile(loop=l55, n=2, max_innermost_factor=64, decision=[1, 1]) + v103, v104 = sch.sample_perfect_tile( + loop=l55, n=2, max_innermost_factor=64, decision=[1, 1] + ) l105, l106 = sch.split(loop=l55, factors=[v103, v104], preserve_unit_iters=True) - v107, v108 = sch.sample_perfect_tile(loop=l56, n=2, max_innermost_factor=64, decision=[4, 8]) + v107, v108 = sch.sample_perfect_tile( + loop=l56, n=2, max_innermost_factor=64, decision=[4, 8] + ) l109, l110 = sch.split(loop=l56, factors=[v107, v108], preserve_unit_iters=True) - v111, v112 = sch.sample_perfect_tile(loop=l57, n=2, max_innermost_factor=64, decision=[4, 1]) + v111, v112 = sch.sample_perfect_tile( + loop=l57, n=2, max_innermost_factor=64, decision=[4, 1] + ) l113, l114 = sch.split(loop=l57, factors=[v111, v112], preserve_unit_iters=True) - v115, v116 = sch.sample_perfect_tile(loop=l58, n=2, max_innermost_factor=64, decision=[1, 1]) + v115, v116 = sch.sample_perfect_tile( + loop=l58, n=2, max_innermost_factor=64, decision=[1, 1] + ) l117, l118 = sch.split(loop=l58, factors=[v115, v116], preserve_unit_iters=True) sch.reorder( l63,