From 59c88d1ecd15c0651a5bd406e25f9e65c07acf46 Mon Sep 17 00:00:00 2001
From: "chengfan.jcf" <chengfan.jcf@alibaba-inc.com>
Date: Wed, 24 Jun 2020 13:58:55 +0800
Subject: [PATCH] Revert commit

---
 docs/conf.py                                  |    1 -
 include/tvm/relay/attrs/transform.h           |   13 -
 include/tvm/relay/transform.h                 |   14 -
 include/tvm/runtime/c_runtime_api.h           |   23 -
 include/tvm/runtime/device_api.h              |    3 +-
 include/tvm/runtime/ndarray.h                 |   12 +-
 scripts/common.py                             | 1034 -----------------
 scripts/shape_configs.py                      |  247 ----
 scripts/tune_network.py                       |  405 -------
 scripts/tune_op_subgraph.py                   |  602 ----------
 scripts/tune_test.py                          |  394 -------
 src/arith/rewrite_simplify.cc                 |   71 +-
 src/relay/analysis/type_solver.cc             |    1 -
 src/relay/op/tensor/transform.cc              |   54 -
 src/relay/transforms/defuse_ops.cc            |   91 --
 .../transforms/kernel_layout_transform.cc     |   66 --
 .../transforms/kernel_layout_transform.h      |  102 --
 src/relay/transforms/pattern_util.h           |    2 -
 src/runtime/cuda/cuda_device_api.cc           |    4 -
 src/runtime/ndarray.cc                        |   80 +-
 src/runtime/opencl/opencl_device_api.cc       |    3 -
 src/runtime/rpc/rpc_module.cc                 |   30 -
 src/runtime/threading_backend.cc              |    9 +-
 src/te/schedule/schedule_dataflow_rewrite.cc  |   66 +-
 src/tir/analysis/verify_gpu_code.cc           |   44 +-
 src/tir/transforms/unroll_loop.cc             |   20 +-
 tests/python/unittest/test_ansor_feature.py   |  150 ---
 .../unittest/test_ansor_relay_integration.py  |  114 --
 .../unittest/test_ansor_task_scheduler.py     |   52 -
 .../test_tir_transform_unroll_loop.py         |   24 -
 topi/include/topi/transform.h                 |   69 --
 topi/python/topi/nn/conv2d.py                 |   39 +-
 tutorials/ansor/README.txt                    |    4 -
 tutorials/ansor/tune_conv2d_cuda.py           |  179 ---
 tutorials/ansor/tune_simple_subgraph.py       |  193 ---
 tutorials/autotvm/README.txt                  |    4 +-
 36 files changed, 31 insertions(+), 4188 deletions(-)
 delete mode 100644 scripts/common.py
 delete mode 100644 scripts/shape_configs.py
 delete mode 100644 scripts/tune_network.py
 delete mode 100644 scripts/tune_op_subgraph.py
 delete mode 100644 scripts/tune_test.py
 delete mode 100644 src/relay/transforms/defuse_ops.cc
 delete mode 100644 src/relay/transforms/kernel_layout_transform.cc
 delete mode 100644 src/relay/transforms/kernel_layout_transform.h
 delete mode 100644 tests/python/unittest/test_ansor_feature.py
 delete mode 100644 tests/python/unittest/test_ansor_relay_integration.py
 delete mode 100644 tests/python/unittest/test_ansor_task_scheduler.py
 delete mode 100644 tutorials/ansor/README.txt
 delete mode 100644 tutorials/ansor/tune_conv2d_cuda.py
 delete mode 100644 tutorials/ansor/tune_simple_subgraph.py

diff --git a/docs/conf.py b/docs/conf.py
index 5826526d55b0..7ece63bd7aa8 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -198,7 +198,6 @@
      '../tutorials/language',
      '../tutorials/optimize',
      '../tutorials/autotvm',
-     '../tutorials/ansor',
      '../tutorials/dev',
      '../tutorials/topi',
      '../tutorials/deployment',
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 95476ed61bdd..750a8a43163c 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -296,19 +296,6 @@ struct LayoutTransformAttrs : public tvm::AttrsNode<LayoutTransformAttrs> {
   }
 };
 
-/*! \brief Attributes for KernelLayoutTransform operator */
-struct KernelLayoutTransformAttrs : public tvm::AttrsNode<KernelLayoutTransformAttrs> {
-  std::string src_layout;
-  std::string dst_layout;
-
-  TVM_DECLARE_ATTRS(KernelLayoutTransformAttrs, "relay.attrs.KernelLayoutTransformAttrs") {
-    TVM_ATTR_FIELD(src_layout)
-        .describe("The source layout of the tensor. (e.g. 1N32C112H112W)");
-    TVM_ATTR_FIELD(dst_layout)
-        .describe("The destination layout of the tensor. (e.g. 1N2C112H112W16c)");
-  }
-};
-
 /*! \brief Attributes for ShapeOf operator */
 struct ShapeOfAttrs : public tvm::AttrsNode<ShapeOfAttrs> {
   DataType dtype;
diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 5f5d9b643633..1b8b31aee5d1 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -277,20 +277,6 @@ TVM_DLL Pass CanonicalizeOps();
  */
 TVM_DLL Pass AlterOpLayout();
 
-/*!
- * \brief Alternate the layouts of kernels.
- *
- * \return The pass.
- */
-TVM_DLL Pass KernelLayoutTransform();
-
-/*!
- * \brief The reverse of FuseOps.
- *
- * \return The pass.
- */
-TVM_DLL Pass DeFuseOps();
-
 /*!
  * \brief Given a dest layout, this pass transforms the expr such that most of the ops input data
  * layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, one
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 5a32ac7d3d9f..213c7059a5f9 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -384,29 +384,6 @@ TVM_DLL int TVMFuncListGlobalNames(int* out_size, const char*** out_array);
 TVM_DLL int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_bits,
                           int dtype_lanes, int device_type, int device_id, TVMArrayHandle* out);
 
-/*!
- * \brief Allocate a nd-array's memory of non-empty values,
- *  including space of shape, of given spec.
- *
- * \param shape The shape of the array, the data content will be copied to out
- * \param ndim The number of dimension of the array.
- * \param dtype_code The type code of the dtype
- * \param dtype_bits The number of bits of dtype
- * \param dtype_lanes The number of lanes in the dtype.
- * \param device_type The device type of context
- * \param device_id The device id of context.
- * \param out The output handle.
- * \return 0 when success, -1 when failure happens
- */
-TVM_DLL int TVMArrayAllocNonEmpty(const tvm_index_t* shape,
-                                  int ndim,
-                                  int dtype_code,
-                                  int dtype_bits,
-                                  int dtype_lanes,
-                                  int device_type,
-                                  int device_id,
-                                  TVMArrayHandle* out);
-
 /*!
  * \brief Free the TVM Array.
  * \param handle The array handle to be freed.
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 9b2eb6be2160..421811a52c3b 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -44,8 +44,7 @@ enum DeviceAttrKind : int {
   kMaxClockRate = 6,
   kMultiProcessorCount = 7,
   kMaxThreadDimensions = 8,
-  kGcnArch = 9,
-  kMaxRegistersPerBlock = 10
+  kGcnArch = 9
 };
 
 /*! \brief Number of bytes each allocation must align to */
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 9cc66a371974..e69d802652fd 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -138,17 +138,7 @@ class NDArray : public ObjectRef {
    * \param ctx The context of the Array.
    * \return The created Array
    */
-  TVM_DLL static NDArray Empty(std::vector<int64_t> shape,
-                               DLDataType dtype, DLContext ctx);
-  /*!
-   * \brief Create an NDArray with non-empty values.
-   * \param shape The shape of the new array.
-   * \param dtype The data type of the new array.
-   * \param ctx The context of the Array.
-   * \return The created Array
-   */
-  TVM_DLL static NDArray NonEmpty(std::vector<int64_t> shape,
-                                  DLDataType dtype, DLContext ctx);
+  TVM_DLL static NDArray Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx);
   /*!
    * \brief Create a NDArray backed by a dlpack tensor.
    *
diff --git a/scripts/common.py b/scripts/common.py
deleted file mode 100644
index e9cf58e128bb..000000000000
--- a/scripts/common.py
+++ /dev/null
@@ -1,1034 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Common utility for scripts"""
-import argparse
-import math
-import os
-import re
-import time
-from collections import defaultdict, namedtuple
-from typing import Dict, List, Tuple
-
-import numpy as np
-import matplotlib.pyplot as plt
-
-import topi
-import tvm
-from tvm import te
-from tvm.ansor import (LogReader, make_workload_key_func,
-                       register_workload_func,
-                       write_measure_records_to_file)
-from tvm.contrib import ndk, util
-
-############################################################
-######################  Test Workloads  ####################
-############################################################
-
-@register_workload_func
-def min_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = topi.min(A, axis=1)
-
-    return [A, B]
-
-@register_workload_func
-def argmin_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = topi.argmin(A, axis=1)
-
-    return [A, B]
-
-@register_workload_func
-def softmax_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = topi.nn.softmax(A, axis=1)
-
-    return [A, B]
-
-@register_workload_func
-def norm_bmn(B, M, N):
-    A = te.placeholder((B, M, N), name='A')
-    i = te.reduce_axis((0, M))
-    j = te.reduce_axis((0, N))
-    C = te.compute((B,), lambda b: te.sum(A[b][i][j] * A[b][i][j], axis=[i, j]), name='C')
-    D = te.compute((B,), lambda b: te.sqrt(C[b]), name='D')
-
-    return [A, D]
-
-@register_workload_func
-def add_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = te.placeholder((M, N), name='B')
-    C = te.compute((M, N), lambda i, j: A[i][j] + B[i][j], name='C')
-
-    return [A, B, C]
-
-@register_workload_func
-def matmul_nkkm(N, M, K, in_type='float32', out_type='float32',
-                tensor_core_support=False):
-    if tensor_core_support:
-        A = te.placeholder((N // 16, K // 16, 16, 16), name='A', dtype=in_type)
-        B = te.placeholder((K // 16, M // 16, 16, 16), name='B', dtype=in_type)
-        k = te.reduce_axis((0, K // 16), name='k')
-        kk = te.reduce_axis((0, 16), name='kk')
-        if not ((in_type == 'float16' and out_type == 'float32') or \
-            (in_type == 'int8' and out_type == 'int32')):
-            raise ValueError
-        C = te.compute((N // 16, M // 16, 16, 16),
-            lambda i, j, ii, jj: te.sum(A[i][k][ii][kk].astype(out_type) * B[k][j][kk][jj].astype(out_type),
-                                    axis=[k, kk]),
-            name='C')
-    else:
-        A = te.placeholder((N, K), name='A', dtype=in_type)
-        B = te.placeholder((K, M), name='B', dtype=in_type)
-        k = te.reduce_axis((0, K), name='k')
-        C = te.compute((N, M),
-                       lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]),
-                       name='C')
-
-    return [A, B, C]
-
-@register_workload_func
-def dense_layer(batch, in_dim, out_dim):
-    A = te.placeholder((batch, in_dim), name='A')
-    B = te.placeholder((out_dim, in_dim), name='B')
-    k = te.reduce_axis((0, in_dim), name='k')
-    C = te.compute((batch, out_dim), lambda i, j: te.sum(A[i][k] * B[j][k], axis=[k]), name='C')
-
-    return [A, B, C]
-
-@register_workload_func
-def max_pool_2d_nchw(N, C, H, W):
-    data = te.placeholder((N, C, H, W), name='data')
-    out = topi.nn.pool(data, (2, 2), (1, 1), (0, 0, 0, 0), pool_type='max', ceil_mode=True,
-                       layout="NCHW", count_include_pad=True)
-
-    return [data, out]
-
-@register_workload_func
-def add_min_relu(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = te.placeholder((M, N), name='B')
-    C = topi.add(A, B)
-    D = topi.min(C, axis=1)
-    out = topi.nn.relu(D)
-    return [A, B, out]
-
-@register_workload_func
-def conv2d_relu_softmax_min(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    relu = topi.nn.relu(conv)
-    softmax = topi.nn.softmax(relu, axis=1)
-    out = topi.min(softmax, axis=1)
-
-    return [data, kernel, out]
-
-@register_workload_func
-def conv2d_nchw_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    bias = te.placeholder((CO, 1, 1), name='bias')
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    #out = topi.nn.relu(conv)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-def conv2d_nhwc_without_layout_rewrite(Input, Filter, stride, padding, dilation, out_dtype='float32'):
-    """A copy of `topi.nn.conv2d_nhwc` but without the 'layout_free` attribute.
-    We use this in single op and subgraph evaluation because we don't want to introduce graph level optimization.
-    """
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_height, in_width, in_channel = Input.shape
-    if len(Filter.shape) == 10:
-        kernel_h = Filter.shape[2] * Filter.shape[6]
-        kernel_w = Filter.shape[3] * Filter.shape[7]
-        channel = Filter.shape[4] * Filter.shape[8]
-        num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5] * Filter.shape[9]
-        #Filter = te.placeholder([kernel_h, kernel_w, channel, num_filter], Filter.dtype, Filter.name)
-    elif len(Filter.shape) == 11:
-        kernel_h = Filter.shape[3] * Filter.shape[7]
-        kernel_w = Filter.shape[4] * Filter.shape[8]
-        channel = Filter.shape[5] * Filter.shape[9]
-        num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[2] * Filter.shape[6] * Filter.shape[10]
-    else:
-        kernel_h, kernel_w, channel, num_filter = Filter.shape
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = topi.nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w))
-    out_channel = num_filter
-    out_height = topi.util.simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = topi.util.simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    pad_before = [0, pad_top, pad_left, 0]
-    pad_after = [0, pad_down, pad_right, 0]
-    PaddedInput = topi.nn.pad(Input, pad_before, pad_after, name="PaddedInput")
-    rc = te.reduce_axis((0, in_channel), name='rc')
-    ry = te.reduce_axis((0, kernel_h), name='ry')
-    rx = te.reduce_axis((0, kernel_w), name='rx')
-    Output = te.compute(
-        (batch, out_height, out_width, out_channel),
-        lambda nn, yy, xx, ff: te.sum(
-            PaddedInput[nn, yy * stride_h + ry * dilation_h,
-                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
-            Filter[ry, rx, rc, ff].astype(out_dtype)
-            , axis=[ry, rx, rc]),
-        name="Conv2dOutput", tag="conv2d_nhwc")
-    return Output
-
-
-@register_workload_func
-def conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((KH, KW, CI, CO), name='kernel')
-    bias = te.placeholder((CO, ), name='bias')
-    conv = topi.nn.conv2d_nhwc(data, kernel, strides, padding, dilation)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-@register_workload_func
-def depthwise_conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((KH, KW, CI, 1), name='kernel')
-    bias = te.placeholder((CO, ), name='bias')
-    conv = topi.nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-@register_workload_func
-def conv2d_nhwc_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((KH, KW, CI, CO), name='kernel')
-    bias = te.placeholder((CO, ), name='bias')
-    conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-
-@register_workload_func
-def conv2d_nchw_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, kernel_size, kernel_size), name='kernel')
-    bias = te.placeholder((CO, 1, 1), name='bias')
-    bn_scale = te.placeholder((CO, 1, 1), name='bn_scale')
-    bn_offset = te.placeholder((CO, 1, 1), name='bn_offset')
-
-    OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-    OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    conv = te.compute((N, CO, OH, OW),
-                       lambda i, j, k, l: conv[i, j, k, l] + bias[j, 0, 0],
-                       name='bias_add')
-    conv = te.compute((N, CO, OH, OW),
-                       lambda i, j, k, l: conv[i, j, k, l] * bn_scale[j, 0, 0],
-                       name='bn_mul')
-    conv = te.compute((N, CO, OH, OW),
-                       lambda i, j, k, l: conv[i, j, k, l] + bn_offset[j, 0, 0],
-                       name='bn_add')
-    out = topi.nn.relu(conv)
-
-    return [data, kernel, bias, bn_offset, bn_scale, out]
-
-@register_workload_func
-def conv2d_nhwc_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((kernel_size, kernel_size, CI, CO), name='kernel')
-    bias = te.placeholder((CO,), name='bias')
-    bn_scale = te.placeholder((CO,), name='bn_scale')
-    bn_offset = te.placeholder((CO,), name='bn_offset')
-
-    OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-    OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-
-    conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation)
-    conv = te.compute((N, OH, OW, CO),
-                       lambda i, j, k, l: conv[i, j, k, l] + bias[l],
-                       name='bias_add')
-    conv = te.compute((N, OH, OW, CO),
-                       lambda i, j, k, l: conv[i, j, k, l] * bn_scale[l],
-                       name='bn_mul')
-    conv = te.compute((N, OH, OW, CO),
-                       lambda i, j, k, l: conv[i, j, k, l] + bn_offset[l],
-                       name='bn_add')
-    out = topi.nn.relu(conv)
-
-    return [data, kernel, bias, bn_offset, bn_scale, out]
-
-resnet_conv2d_configs = {
-    # format : N, H, W, CI, CO, KH, KW, strides, padding, dilation
-    '18': [
-        (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)),
-        (1, 56, 56, 64, 128, 3, 3, (2, 2), (1, 1), (1, 1)),
-        (1, 56, 56, 64, 128, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 256, 3, 3, (2, 2), (1, 1), (1, 1)),
-        (1, 28, 28, 128, 256, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 14, 14, 256, 512, 3, 3, (2, 2), (1, 1), (1, 1)),
-        (1, 14, 14, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)),
-    ],
-    '50': [
-        (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)),
-        (1, 56, 56, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 56, 56, 256, 128, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 56, 56, 256, 64, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 56, 56, 64, 256, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 512, 1024, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 28, 28, 512, 256, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 28, 28, 512, 128, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 512, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 14, 14, 1024, 2048, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 14, 14, 1024, 512, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 14, 14, 1024, 256, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 14, 14, 256, 1024, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 7, 7, 2048, 512, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 7, 7, 512, 2048, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)),
-    ],
-}
-
-# number of appearance for all conv2ds in resnet
-resnet_conv2d_weights = {
-    '18': [1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 3, 3],
-    '50': [1, 1, 1, 2, 4, 3, 1, 1, 1, 3, 4, 4, 1, 1, 5, 6, 6, 2, 3, 3],
-}
-
-
-def parse_workload_name(name: str) -> List[str]:
-    """Parse workload name with wildcard character and abbreviation to standard names"""
-    if name.startswith('matmul-'):  # e.g. matmul-512, matmul-1024, matmul-+
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [256, 512, 1024]
-        else:
-            cfg_list = [N]
-        return ["matmul-%s" % x for x in cfg_list]
-    elif name.startswith('dense-'):  # e.g. dense-1-512-1024, dense-16-512-512
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = ["1-512-512", "16-512-512"]
-        else:
-            cfg_list = [N]
-        return ["dense-%s" % x for x in cfg_list]
-    elif name.startswith('min-'):  # e.g. min-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["min-%s" % x for x in cfg_list]
-    elif name.startswith('argmin-'):  # e.g. argmin-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["argmin-%s" % x for x in cfg_list]
-    elif name.startswith('softmax-'):  # e.g. softmax-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["softmax-%s" % x for x in cfg_list]
-    elif name.startswith('add-'):  # e.g. add-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["add-%s" % x for x in cfg_list]
-    elif name.startswith('norm-'):  # e.g. norm-1024
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["norm-%s" % x for x in cfg_list]
-    elif name.startswith('add-min-relu'):  # e.g. add-min-relu-4096
-        N = name.split('-', maxsplit=3)[3]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["add-min-relu-%s" % x for x in cfg_list]
-    elif name.startswith('nhwc-resnet-'):  # e.g.  nhwc-resnet-50.C1
-        res = re.match(r'nhwc-resnet-(\d+).C([\d\+]+)(.B(\d+))?', name)
-        n_layers = res.group(1)
-        if res.group(2) == '+':
-            idx_list = range(len(resnet_conv2d_configs[n_layers]))
-        else:
-            idx_list = [int(res.group(2))]
-
-        batch_size = 1 if res.group(4) is None else int(res.group(4))
-        return ['nhwc-resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list]
-    elif name.startswith('resnet-'):  # e.g.  resnet-50.C1, resnet-50.C1.B2, resnet-50.C+.B2
-        res = re.match(r'resnet-(\d+).C([\d\+]+)(.B(\d+))?', name)
-        n_layers = res.group(1)
-        if res.group(2) == '+':
-            idx_list = range(len(resnet_conv2d_configs[n_layers]))
-        else:
-            idx_list = [int(res.group(2))]
-
-        batch_size = 1 if res.group(4) is None else int(res.group(4))
-        return ['resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list]
-    elif name in ['conv2d-bn-relu', 'conv2d-relu-softmax-min', 'max-pool-2d', 'conv2d-rewrite', 'depthwise-conv2d-rewrite']:
-        return [name]
-    else:
-        raise ValueError("Invalid workload " + name)
-
-
-def get_workload_keys(name: str) -> List[str]:
-    """Parse workload name and return the workload keys"""
-    normalized_names = parse_workload_name(name)
-
-    ret = []
-    for name in normalized_names:
-        if name.startswith('matmul-'):
-            name_split = name.split('-')
-            in_type = out_type = 'float32'
-            tensor_core_support = False
-            if len(name_split) == 2:    # e.g. matmul-512
-                N = K = M = int(name_split[1])
-            elif len(name_split) == 4:  # e.g. matmul-32-256-512
-                N = int(name_split[1])
-                K = int(name_split[2])
-                M = int(name_split[3])
-            elif len(name_split) == 6:  # e.g. matmul-32-512-512-float16-float32
-                N = int(name_split[1])
-                K = int(name_split[2])
-                M = int(name_split[3])
-                in_type = name_split[4]
-                out_type = name_split[5]
-            elif len(name_split) == 7:  # e.g. matmul-32-512-512-float16-float32-tc
-                N = int(name_split[1])
-                K = int(name_split[2])
-                M = int(name_split[3])
-                in_type = name_split[4]
-                out_type = name_split[5]
-                tensor_core_support = name_split[6] == "tc"
-            else:
-                raise ValueError("Invalid matmul workload")
-            ret.append(make_workload_key_func(matmul_nkkm,
-                                              (N, M, K, in_type, out_type, tensor_core_support)))
-        elif name.startswith('dense-'):  # e.g. dense-1-512-1024, dense-16-512-512
-            name_split = name.split('-')
-            assert len(name_split) == 4
-            batch = int(name_split[1])
-            in_dim = int(name_split[2])
-            out_dim = int(name_split[3])
-            ret.append(make_workload_key_func(dense_layer, (batch, in_dim, out_dim)))
-        elif name.startswith('min-'):  # e.g. min-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                M = 64
-                N = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid min workload")
-            ret.append(make_workload_key_func(min_mn, (M, N)))
-        elif name.startswith('argmin-'):  # e.g. argmin-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                M = 64
-                N = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid argmin workload")
-            ret.append(make_workload_key_func(argmin_mn, (M, N)))
-        elif name.startswith('softmax-'):  # e.g. softmax-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                M = 64
-                N = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid softmax workload")
-            ret.append(make_workload_key_func(softmax_mn, (M, N)))
-        elif name.startswith('add-min-relu'):  # e.g. add-min-relu-4096
-            name_split = name.split('-')
-            if len(name_split) == 4:
-                M = 64
-                N = int(name_split[3])
-            elif len(name_split) == 5:
-                M = int(name_split[3])
-                N = int(name_split[4])
-            else:
-                raise ValueError("Invalid workload")
-            ret.append(make_workload_key_func(add_min_relu, (M, N)))
-        elif name.startswith('add-'):  # e.g. add-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                N = M = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid add workload")
-            ret.append(make_workload_key_func(add_mn, (M, N)))
-        elif name.startswith('norm-'):  # e.g. norm-4096
-            name_split = name.split('-')
-            B = 2
-            if len(name_split) == 2:
-                N = M = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid norm workload")
-            ret.append(make_workload_key_func(norm_bmn, (B, M, N)))
-        elif name.startswith('nhwc-resnet-'):  # e.g.  nhwc-resnet-50.C1.B2
-            res = re.match(r'nhwc-resnet-(\d+).C(\d+).B(\d+)', name)
-            n_layers = res.group(1)
-            idx = int(res.group(2))
-            batch_size = 1 if res.group(3) is None else int(res.group(3))
-            args = list(resnet_conv2d_configs[n_layers][idx])
-            args[0] = batch_size
-            ret.append(make_workload_key_func(conv2d_nhwc_bias, args))
-        elif name.startswith('resnet-'):  # e.g.  resnet-50.C1.B2
-            res = re.match(r'resnet-(\d+).C(\d+).B(\d+)', name)
-            n_layers = res.group(1)
-            idx = int(res.group(2))
-            batch_size = 1 if res.group(3) is None else int(res.group(3))
-            args = list(resnet_conv2d_configs[n_layers][idx])
-            args[0] = batch_size
-            ret.append(make_workload_key_func(conv2d_nchw_bias, args))
-        elif name == 'max-pool-2d':
-            return [make_workload_key_func(max_pool_2d_nchw, (2, 512, 7, 7))]
-        elif name == 'conv2d-bn-relu':
-            return [make_workload_key_func(conv2d_nhwc_bn_relu,
-                                           (1, 7, 7, 512, 512, 3, 1, 1, 1)) ]
-        elif name == 'conv2d-rewrite':
-            return [ make_workload_key_func(conv2d_nhwc_bias_with_rewrite,
-                                            (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))]
-        elif name == 'depthwise-conv2d-rewrite':
-            return [ make_workload_key_func(depthwise_conv2d_nhwc_bias_with_rewrite,
-                                            (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))]
-        elif name == 'conv2d-relu-softmax-min':
-            return [make_workload_key_func(conv2d_relu_softmax_min,
-                                           (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))]
-        else:
-            raise ValueError("Invalid workload " + name)
-
-    return ret
-
-
-def get_workload_weights(name: str) -> List[float]:
-    """Return weights for workload name"""
-    if name.startswith('resnet-'):
-        res = re.match(r'resnet-(\d+).C+', name)
-        n_layers = res.group(1)
-        return np.array(resnet_conv2d_weights[n_layers])
-    else:
-        return np.ones(len(get_workload_keys(name)))
-
-
-############################################################
-######################  Measure Tools   ####################
-############################################################
-
-
-def measure_schedule(s,
-                     bufs,
-                     target,
-                     target_host=None,
-                     remote=None,
-                     ndk_cc=None,
-                     number=10,
-                     repeat=3,
-                     min_repeat_ms=500):
-    """Measure the time cost of a schedule"""
-    func = tvm.build(s, bufs, target=target, target_host=target_host)
-    if remote:
-        ctx = remote.context(str(target), 0)
-        temp = util.tempdir()
-        remote_path = temp.relpath("tmp_deploy_lib.so")
-        os.environ['TVM_NDK_CC'] = ndk_cc
-        func.export_library(remote_path, ndk.create_shared)
-        remote.upload(remote_path)
-        func = remote.load_module("tmp_deploy_lib.so")
-    else:
-        ctx = tvm.context(str(target), 0)
-
-    if os.environ.get('TVM_AUTO_CACHE_FLUSH', '0') == '1':
-        min_repeat_ms = 0
-        number = 1
-
-    time_f = func.time_evaluator(func.entry_name,
-                                 ctx,
-                                 number=number,
-                                 repeat=repeat,
-                                 min_repeat_ms=min_repeat_ms)
-
-    np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
-    args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
-    ctx.sync()
-
-    costs = time_f(*args).results
-
-    return costs
-
-def check_correctness(s, bufs, s_ref, buf_ref, target, target_host=None, remote=None, ndk_cc=None):
-    """Check the correctness of a schedule against a reference schedule"""
-    func = tvm.build(s, bufs, target=target, target_host=target_host)
-    func_ref = tvm.build(s_ref, buf_ref, target='llvm')
-
-    if remote:
-        raise NotImplemented
-    else:
-        ctx = tvm.context(str(target), 0)
-        ctx_ref = tvm.cpu()
-
-    np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
-    args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
-    args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
-    ctx.sync()
-
-    func(*args)
-    func_ref(*args_ref)
-
-    for arr, arr_ref in zip(args, args_ref):
-        np.testing.assert_allclose(arr.asnumpy(), arr_ref.asnumpy())
-
-
-############################################################
-#####################  Other Utilities  ####################
-############################################################
-
-
-def geomean(xs):
-    """Compute geometric mean"""
-    return math.exp(math.fsum(math.log(x) for x in xs) / len(xs))
-
-
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() in ('yes', 'true', 't', 'y', '1'):
-        return True
-    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-global last_tic
-last_tic = None
-
-
-def PRINT_TIME(msg):
-    """Print time interval between differnt calls. This is for debug so we make the name letters capital"""
-    global last_tic
-    now = time.time()
-
-    if last_tic is None:
-        last_tic = now
-
-    print(msg, now - last_tic)
-    last_tic = now
-
-
-############################################################
-######################  I/O Utilities  #####################
-############################################################
-
-# The format for a line in resulst file
-BenchmarkRecord = namedtuple("BenchmarkRecord", [
-    'device', 'backend', 'workload_type', 'workload_name', 'library', 'algorithm', 'value',
-    'time_stamp'
-])
-
-
-class BaselineDatabase:
-    """A class for query records in baseline database"""
-    def __init__(self, filename):
-        self.filename = filename
-
-        self.lines = []
-        for line in open(filename):
-            if line.startswith('#') or line.isspace():
-                continue
-            self.lines.append(line.split('\t'))
-
-    def filter_records(self, devices=None, backends=None, wkl_names=None, libraries=None):
-        ret = []
-        for line in self.lines:
-            line = BenchmarkRecord(*line)
-
-            if devices is not None and line.device not in devices:
-                continue
-            if backends is not None and line.backend not in backends:
-                continue
-            if wkl_names is not None and line.workload_name not in wkl_names:
-                continue
-            if libraries is not None and line.library not in libraries:
-                continue
-
-            ret.append(line)
-        return ret
-
-    def get_data_dict(self, device, target, wkl_names) -> Tuple[Dict, List]:
-        """Return a data dict s.t.  data[wkl][library] = cost"""
-        data = defaultdict(lambda: defaultdict(lambda: 1e10))
-
-        all_libraries = set()
-
-        if "cpu" in target.keys:
-            backends = ['cpu']
-        elif "gpu" in target.keys:
-            backends = ['gpu']
-        else:
-            raise ValueError("Invalid target: " + target)
-
-        # Read costs for baselines
-        records = self.filter_records(devices=[device], backends=backends, wkl_names=wkl_names)
-        for record in records:
-            # use min over (possible) multiple algorithms
-            all_libraries.add(record.library)
-            data[record.workload_name][record.library] = \
-                min(data[record.workload_name][record.library],
-                    np.mean(eval(record.value)['costs']))
-
-        return data, list(all_libraries)
-
-
-class LogFileDatabase:
-    """A class for indexing best records in a log file"""
-    def __init__(self, filename: str, n_lines: int = -1):
-        inputs, results = LogReader(filename).read_lines(n_lines)
-
-        # best records, search by (target_key, workload_key).  e.g. ('gpu', 'conv2d...')
-        self.best_by_targetkey = {}
-
-        # best according to (model, workload_key).  e.g. ('1080ti', 'conv2d...'))
-        self.best_by_model = {}
-
-        # find best records and build the index
-        for inp, res in zip(inputs, results):
-            if res.error_no != 0:
-                continue
-
-            # use target keys in tvm target system as key to build best map
-            for target_key in inp.task.target.keys:
-                key = (target_key, inp.task.workload_key)
-                if key not in self.best_by_targetkey:
-                    self.best_by_targetkey[key] = (inp, res)
-                else:
-                    _, other_res = self.best_by_targetkey[key]
-                    if np.mean([x.value for x in other_res.costs]) > \
-                            np.mean([x.value for x in res.costs]):
-                        self.best_by_targetkey[key] = (inp, res)
-
-            # use model as key to build best map
-            key = (inp.task.target.model, inp.task.workload_key)
-            if key not in self.best_by_model:
-                if inp.task.target.model != 'unknown':
-                    self.best_by_model[key] = (inp, res)
-            else:
-                _, other_res = self.best_by_model[key]
-                if np.mean([x.value for x in other_res.costs]) > \
-                        np.mean([x.value for x in res.costs]):
-                    self.best_by_model[key] = (inp, res)
-
-    def write_best(self, filename: str):
-        best_records = list(self.best_by_targetkey.values())
-        inputs = [x[0] for x in best_records]
-        results = [x[1] for x in best_records]
-        write_measure_records_to_file(filename, inputs, results)
-
-
-############################################################
-######################  Plot Utilities  ####################
-############################################################
-
-def max_curve(raw_curve):
-    """Return b[i] = max(a[:i]) """
-    ret = []
-    cur_max = -np.inf
-    for x in raw_curve:
-        cur_max = max(cur_max, x)
-        ret.append(cur_max)
-    return ret
-
-def min_curve(raw_curve):
-    """Return b[i] = min(a[:i]) """
-    ret = []
-    cur_min = np.inf
-    for x in raw_curve:
-        cur_min = min(cur_min, x)
-        ret.append(cur_min)
-    return ret
-
-def mean_curve(raw_curve, window_size=None):
-    """Return b[i] = mean(a[:i]) """
-    ret = []
-    mean = 0
-    if window_size is None:
-        for i, x in enumerate(raw_curve):
-            mean = (mean * i + x) / (i + 1)
-            ret.append(mean)
-    else:
-        for i, x in enumerate(raw_curve):
-            if i >= window_size:
-                mean = (mean * window_size + x - raw_curve[i - window_size]) / window_size
-            else:
-                mean = (mean * i + x) / (i + 1)
-            ret.append(mean)
-    return ret
-
-
-def enhance_color(color, h=1, l=1, s=1):
-    """Make color looks better for pyplot"""
-    import matplotlib.colors as mc
-    import colorsys
-    try:
-        c = mc.cnames[color]
-    except:
-        c = color
-    c = np.array(colorsys.rgb_to_hls(*mc.to_rgb(c)))
-
-    h, l, s = h * c[0], l * c[1], s * c[2]
-    h, l, s = [max(min(x, 1), 0) for x in [h, l, s]]
-
-    return colorsys.hls_to_rgb(h, l, s)
-
-
-method_color_dict = {
-    'ours': 'C0',
-    'AutoTVM': 'C1',
-
-    'tensorflow': 'C2',
-    'tensorflow-tensorrt': 'C9',
-    'tflite': 'C2',
-
-    'pytorch': enhance_color('C3', l=1.1, s=0.9),
-
-    'FlexTensor': enhance_color('C5'),
-    'halide': enhance_color('teal', l=1.25),
-
-    'Limit space': 'C7',
-    'No fine-tuning': 'C8',
-    'No task scheduler': 'C1',
-}
-
-def method2color(method):
-    if '-batch-' in method:
-        method, batch_size = method.split('-batch-')
-        #return enhance_color(method_color_dict[method], s=1.1, l=1.5)
-        return method_color_dict[method]
-    else:
-        return method_color_dict[method]
-
-method_order_list = [
-    'pytorch', 'tensorflow', 'tensorflow-xla', 'tensorflow-tensorrt',
-    'tflite', 'halide', 'FlexTensor',  'AutoTVM',
-
-    'Limit space', 'No fine-tuning',
-    'ours',
-]
-
-def method2order(method):
-    if '-batch-' in method:
-        method, batch_size = method.split('-batch-')
-        batch_size = int(batch_size)
-        return method_order_list.index(method) + batch_size / 100
-    else:
-        return method_order_list.index(method)
-
-show_name_replace_dict = {
-    'pytorch': "PyTorch",
-    'tensorflow-tensorrt': 'TensorRT-TF',
-    'tensorflow': 'TensorFlow',
-    'tflite': 'TensorFlow Lite',
-    'halide': 'Halide',
-
-    'ours': 'Ansor (ours)',
-    'batch-16': 'batch',
-
-    'resnet_50': 'ResNet-50',
-    'mobilenet_v2': 'Mobilenet V2',
-    'resnet_18_3d': '3D-ResNet',
-    'dcgan': 'DCGAN',
-    'dqn': 'DQN',
-    'bert': 'BERT',
-}
-
-def show_name(name):
-    #    if name.startswith('resnet-'):
-    #        return name.split('.')[1]
-    for key, value in show_name_replace_dict.items():
-        name = name.replace(key, value)
-
-    return name
-
-def draw_grouped_bar_chart(data, baseline='pytorch', output='out.png',
-                           yscale_log=False, yticks=None, y_max=None,
-                           legend_bbox_to_anchor=None, legend_nrow=None,
-                           figure_size=None, figax=None, draw_ylabel=True, draw_legend=True):
-    width = 1
-    gap = 1.5
-    fontsize = 19
-    xticks_font_size = fontsize - 2
-
-    figure_size = figure_size or (11, 4)
-    legend_bbox_to_anchor = legend_bbox_to_anchor or (0.45, 1.35)
-
-    all_methods = set()
-    legend_set = {}
-
-    if figax is None:
-        fig, ax = plt.subplots()
-        axes = []
-        axes.append(ax)
-    else:
-        ax = figax
-
-    x0 = 0
-    xticks = []
-    xlabels = []
-
-    workloads = list(data.keys())
-    for wkl in workloads:
-        ys = []
-        colors = []
-
-        methods = list(data[wkl].keys())
-
-        if baseline in data[wkl]:
-            baseline_cost = data[wkl][baseline]
-        else:
-            # normalize to best library
-            baseline_cost = 1e10
-            for method in methods:
-                if data[wkl][method] < baseline_cost:
-                    baseline_cost = data[wkl][method]
-
-        methods.sort(key=lambda x: method2order(x))
-        for method in methods:
-            relative_speedup = baseline_cost / data[wkl][method]
-            if yticks is None:
-                ys.append(relative_speedup)
-            else:
-                ys.append(max(relative_speedup, yticks[0] * 1.1))
-            colors.append(method2color(method))
-
-        # draw the bars
-        xs = np.arange(x0, x0 + len(ys))
-        bars = ax.bar(xs, ys, width=width, color=colors)
-
-        for method, bar_obj in zip(methods, bars):
-            all_methods.add(method)
-            if method not in legend_set:
-                legend_set[method] = bar_obj
-
-        # tick and label
-        x0 += len(ys) + gap
-
-        xticks.append(x0 - gap - len(ys)*width/2.0 - width/2.0)
-        xlabels.append(show_name(wkl))
-
-        ax.set_xticks(xticks)
-        ax.set_xticklabels(xlabels, fontsize=xticks_font_size)
-        plt.tick_params(axis='x', which='both', bottom='off', top='off')
-
-        if draw_ylabel is True:
-            ax.set_ylabel('Relative Speedup', fontsize=fontsize)
-        elif isinstance(draw_ylabel, str):
-            ax.set_ylabel(draw_ylabel, fontsize=fontsize)
-
-        if yscale_log:
-            ax.set_yscale('log', basey=2)
-        if yticks is not None:
-            ax.set_yticks(yticks)
-        if y_max:
-            ax.set_ylim(top=y_max)
-
-        from matplotlib.ticker import FormatStrFormatter
-        ax.set_yticklabels(ax.get_yticks(), fontsize=fontsize)
-        ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
-        ax.yaxis.grid(linewidth=0.4, linestyle='dotted') # draw grid line
-        ax.set_axisbelow(True)  # grid lines are behind the rest
-        ax.tick_params(bottom=False, top=False, right=False)
-
-    # put legend outside the plot
-    all_methods = list(all_methods)
-    all_methods.sort(key=lambda x : method2order(x))
-
-    if draw_legend:
-        legend_nrow = legend_nrow or 2
-        ncol = (len(all_methods) + legend_nrow - 1)// legend_nrow
-        ax.legend([legend_set[x] for x in all_methods],
-                  [show_name(x) for x in all_methods],
-                  fontsize=fontsize-1,
-                  loc='upper center',
-                  bbox_to_anchor=legend_bbox_to_anchor,
-                  ncol=ncol,
-                  handlelength=1.0,
-                  handletextpad=0.5,
-                  columnspacing=1.1)
-
-    if figax is None:
-        fig.set_size_inches(figure_size)
-        fig.savefig(output, bbox_inches='tight')
-        print("Output the plot to %s" % output)
-
-
-def to_str_round(x, decimal=6):
-    if isinstance(x, str):
-        return x
-    if isinstance(x, (list, tuple)) or isinstance(x, np.ndarray):
-        return "[" + ", ".join([to_str_round(y, decimal=decimal)
-                                for y in x]) + "]"
-    if isinstance(x, dict):
-        return str({k: eval(to_str_round(v)) for k, v in x.items()})
-    if isinstance(x, int):
-        return str(x)
-    if isinstance(x, float):
-        format_str = "%%.%df" % decimal
-        return format_str % x
-    raise ValueError("Invalid value: " + str(x))
-
diff --git a/scripts/shape_configs.py b/scripts/shape_configs.py
deleted file mode 100644
index db6b3b9dc9aa..000000000000
--- a/scripts/shape_configs.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Shape configurations for single operator / subgraph evaluation
-This file is shared by tune_op_subgraph.py and scripts in scripts/baseline/
-"""
-
-matmul_shapes = [
-    (1, 128, 128, 128),
-    (1, 512, 32, 512),
-    (1, 512, 512, 512),
-    (1, 1024, 1024, 1024),
-]
-
-conv1d_shapes = [
-    # derived from conv2d_shapes
-    (1, 256, 64, 128, 3, 2, 1),
-#    (1, 256, 64, 128, 1, 2, 0),
-#    (1, 256, 64, 64, 1, 1, 0),
-#    (1, 128, 128, 256, 3, 2, 1),
-    (1, 128, 128, 256, 1, 2, 0),
-#    (1, 128, 128, 128, 3, 1, 1),
-#    (1, 64, 256, 512, 3, 2, 1),
-#    (1, 64, 256, 512, 1, 2, 0),
-    (1, 64, 256, 256, 5, 1, 2),
-    (1, 32, 512, 512, 3, 1, 1),
-]
-
-conv2d_shapes = [
-    # all conv2d layers in resnet-18
-    (1, 224, 224, 3, 64, 7, 2, 3),
-#    (1, 56, 56, 64, 128, 3, 2, 1),
-#    (1, 56, 56, 64, 128, 1, 2, 0),
-#    (1, 56, 56, 64, 64, 3, 1, 1),
-    (1, 56, 56, 64, 64, 1, 1, 0),
-#    (1, 28, 28, 128, 256, 3, 2, 1),
-#    (1, 28, 28, 128, 256, 1, 2, 0),
-#    (1, 28, 28, 128, 128, 3, 1, 1),
-#    (1, 14, 14, 256, 512, 3, 2, 1),
-#    (1, 14, 14, 256, 512, 1, 2, 0),
-    (1, 14, 14, 256, 256, 3, 1, 1),
-    (1, 7, 7, 512, 512, 3, 1, 1),
-]
-
-conv3d_shapes = [
-    # Derived from cnov2d_shapes. Use depth=16 for all configurations
-    (1, 16, 224, 224, 3, 64, 7, 2, 3),
-#    (1, 16, 56, 56, 64, 128, 3, 2, 1),
-#    (1, 16, 56, 56, 64, 128, 1, 2, 0),
-#    (1, 16, 56, 56, 64, 64, 3, 1, 1),
-    (1, 16, 56, 56, 64, 64, 1, 1, 0),
-#    (1, 16, 28, 28, 128, 256, 3, 2, 1),
-#    (1, 16, 28, 28, 128, 256, 1, 2, 0),
-#    (1, 16, 28, 28, 128, 128, 3, 1, 1),
-#    (1, 16, 14, 14, 256, 512, 3, 2, 1),
-#    (1, 16, 14, 14, 256, 512, 1, 2, 0),
-    (1, 16, 14, 14, 256, 256, 3, 1, 1),
-    (1, 16, 7, 7, 512, 512, 3, 1, 1),
-]
-
-group_conv2d_shapes = [
-    # Derived from cnov2d_shapes. Use group=4 for all configurations
-    (1, 56, 56, 64, 128, 3, 2, 1 , 1, 4),
-#    (1, 56, 56, 64, 128, 1, 2, 0 , 1, 4),
-#    (1, 56, 56, 64, 64, 3, 1, 1  , 1, 4),
-    (1, 56, 56, 64, 64, 1, 1, 0  , 1, 4),
-#    (1, 28, 28, 128, 256, 3, 2, 1, 1, 4),
-#    (1, 28, 28, 128, 256, 1, 2, 0, 1, 4),
-#    (1, 28, 28, 128, 128, 3, 1, 1, 1, 4),
-#    (1, 14, 14, 256, 512, 3, 2, 1, 1, 4),
-#    (1, 14, 14, 256, 512, 1, 2, 0, 1, 4),
-    (1, 14, 14, 256, 256, 3, 1, 1, 1, 4),
-    (1, 7, 7, 512, 512, 3, 1, 1  , 1, 4),
-]
-
-dilation_conv2d_shapes = [
-    # Derived from cnov2d_shapes. Use dilation=2 for all configurations
-    (1, 224, 224, 3, 64, 7, 2, 3 , 2),
-#    (1, 56, 56, 64, 128, 3, 2, 1 , 2),
-#    (1, 56, 56, 64, 128, 1, 2, 0 , 2),
-#    (1, 56, 56, 64, 64, 3, 1, 1  , 2),
-    (1, 56, 56, 64, 64, 1, 1, 0  , 2),
-#    (1, 28, 28, 128, 256, 3, 2, 1, 2),
-#    (1, 28, 28, 128, 256, 1, 2, 0, 2),
-#    (1, 28, 28, 128, 128, 3, 1, 1, 2),
-#    (1, 14, 14, 256, 512, 3, 2, 1, 2),
-#    (1, 14, 14, 256, 512, 1, 2, 0, 2),
-    (1, 14, 14, 256, 256, 3, 1, 1, 2),
-    (1, 7, 7, 512, 512, 3, 1, 1  , 2),
-]
-
-depthwise_conv2d_shapes = [
-    # all depthwise conv2d layers in mobilenet
-    (1, 112, 112, 32,  3, 1, 1),
-    (1, 112, 112, 64,  3, 2, 1),
-#    (1,  56,  56, 128, 3, 1, 1),
-#    (1,  56,  56, 128, 3, 2, 1),
-#    (1,  28,  28, 256, 3, 1, 1),
-#    (1,  28,  28, 256, 3, 2, 1),
-#    (1,  14,  14, 512, 3, 1, 1),
-    (1,  14,  14, 512, 3, 2, 1),
-    (1,   7,   7, 1024, 3, 1, 1),
-]
-
-conv2d_transpose_shapes = [
-    # all conv2d tranpose layers in DCGAN
-    (1, 4, 4, 512, 256, 4, 2, 1),
-    (1, 8, 8, 256, 128, 4, 2, 1),
-    (1, 16, 16, 128, 64, 4, 2, 1),
-    (1, 32, 32, 64, 3, 4, 2, 1),
-]
-
-conv2d_capsule_shapes = [
-    # all conv2d capsule layers in matrix capsules withemrouting (ICLR 2018)
-    (1, 16, 16, 32, 32, 3, 2, 1),
-    (1,  8,  8, 32, 32, 3, 1, 1),
-    (1, 16, 16,  8, 16, 3, 2, 1),
-    (1,  8,  8, 16, 16, 3, 1, 1),
-]
-
-conv2d_winograd_nhwc_shapes = [
-    (1, 56, 56, 64, 64, 3, 1, 1),
-    (1, 28, 28, 128, 128, 3, 1, 1),
-    (1, 14, 14, 256, 256, 3, 1, 1),
-    (1, 7, 7, 512, 512, 3, 1, 1),
-]
-
-conv2d_winograd_nchw_shapes = [
-    (1, 64, 56, 56, 64, 3, 1, 1),
-    (1, 128, 28, 28, 128, 3, 1, 1),
-    (1, 256, 14, 14, 256, 3, 1, 1),
-    (1, 512, 7, 7, 512, 3, 1, 1),
-]
-
-matmul_tensor_core_shapes = [
-    (16, 512, 512, 'float16', 'float32', True),
-    (32, 512, 512, 'float16', 'float32', True),
-    (512, 512, 512, 'float16', 'float32', True),
-]
-
-norm_shapes = [
-    (1, 256, 256),
-    (1, 512, 512),
-    (1, 1024, 1024),
-    (1, 4096, 1024),
-]
-
-single_op_shape_dict = {
-    'C1D': conv1d_shapes,
-    'C2D': conv2d_shapes,
-    'C3D': conv3d_shapes,
-    'GMM': matmul_shapes,
-    'GRP': group_conv2d_shapes,
-    'DIL': dilation_conv2d_shapes,
-    'DEP': depthwise_conv2d_shapes,
-    'T2D': conv2d_transpose_shapes,
-    'CAP': conv2d_capsule_shapes,
-    'NRM': norm_shapes,
-
-#    The following workloads are not in our sinle op evaluation plan.
-#    They should be moved to `common.py` and be used by `tune_wkl.py`.
-#    'C2D_NCHW': conv2d_nchw_shapes,
-#    'C2DWG_NHWC': conv2d_winograd_nhwc_shapes,
-#    'C2DWG_NCHW': conv2d_winograd_nchw_shapes,
-#    'GMM_TC': matmul_tensor_core_shapes,
-}
-
-conv2d_bn_relu_shapes = [
-    (1, 224, 224, 3, 64, 7, 2, 3),
-    (1, 56, 56, 64, 128, 3, 2, 1),
-    (1, 28, 28, 128, 256, 1, 2, 0),
-    (1, 7, 7, 512, 512, 3, 1, 1, 1),
-    (16, 224, 224, 3, 64, 7, 2, 3),
-    (16, 56, 56, 64, 128, 3, 2, 1),
-    (16, 28, 28, 128, 256, 1, 2, 0),
-    (16, 7, 7, 512, 512, 3, 1, 1, 1),
-]
-
-transpose_batch_matmul_shapes = [
-    (1,   128, 12, 64),
-    (1,   128, 16, 64),
-    (1,   64,  12, 128),
-    (1,   128, 12, 128),
-    (16,  128, 12, 64),
-    (16,  128, 16, 64),
-    (16,  64,  12, 128),
-    (16,  128, 12, 128),
-]
-
-subgraph_shape_dict = {
-    "conv2d_bn_relu": conv2d_bn_relu_shapes,
-    "transpose_batch_matmul": transpose_batch_matmul_shapes,
-}
-
-resnet_shapes = [
-    (1, ),
-    (16, ),
-]
-
-mobilenet_v2_shapes = [
-    (1, ),
-    (16, ),
-]
-
-dcgan_shapes = [
-    (1, ),
-    (16, ),
-]
-
-dqn_shapes = [
-    (1, ),
-    (16, ),
-]
-
-bert_shapes = [
-    (1, ),
-    (16, ),
-]
-
-resnet18_3d_shapes = [
-    (1, ),
-    (16, ),
-]
-
-network_shape_dict = {
-    'resnet_50': resnet_shapes,
-    'mobilenet_v2': mobilenet_v2_shapes,
-    'dcgan': dcgan_shapes,
-    'dqn': dqn_shapes,
-    'bert': bert_shapes,
-    'resnet_18_3d': resnet18_3d_shapes,
-}
-
diff --git a/scripts/tune_network.py b/scripts/tune_network.py
deleted file mode 100644
index 188da6cbe6e6..000000000000
--- a/scripts/tune_network.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Tune a whole neural network"""
-import argparse
-import logging
-import random
-import os
-import numpy as np
-
-import tvm
-from tvm import ansor, relay
-import tvm.contrib.graph_runtime as runtime
-from tvm.contrib.debugger import debug_runtime
-from tvm.contrib import util, ndk
-from tvm.relay import testing
-from tvm.ansor.utils import request_remote
-#from baseline.utils import log_line, BenchmarkRecord
-
-from common import str2bool
-from tune_test import create_tune_option
-
-dtype = "float32"
-
-def get_network(name, network_path, batch_size, layout):
-    """Get the relay module and random weights for a network"""
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-    input_name = 'data'
-
-    if name.startswith("resnet3d"):
-        n_layer = int(name.split('-')[1])
-        layout = "NDHWC"
-        image_shape = (16, 112, 112, 3)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.resnet3d.get_workload(num_layers=n_layer, batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout)
-    elif name.startswith("resnet"):
-        n_layer = int(name.split('-')[1])
-        image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype)
-    elif "lstm" in name:
-        mod, params = relay.testing.lstm.get_workload(iterations=10, num_hidden=512, batch_size=batch_size, dtype=dtype)
-    elif "mlp" in name:
-        input_shape = (batch_size, 1, 28, 28)
-        mod, params = relay.testing.mlp.get_workload(batch_size=batch_size, dtype=dtype)
-    elif "vgg" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
-    elif name == 'dcgan':
-        input_shape = (batch_size, 100)
-        mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size)
-    elif name == 'dqn':
-        layout = "NHWC"
-        image_shape = (84, 84, 4)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.dqn.get_workload(batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout)
-    elif name == 'mobilenet':
-        image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype)
-    elif name == 'r3d_18':
-        import torch
-        import torchvision
-
-        model = getattr(torchvision.models.video, name)(pretrained=False)
-        model = model.eval()
-
-        # We grab the TorchScripted model via tracing
-        input_shape = [batch_size, 3, 16, 112, 112]
-        input_data = torch.randn(input_shape)
-        scripted_model = torch.jit.trace(model, input_data).eval()
-
-        input_name = 'input0'  # only one input, set it to this name
-        shape_list = {input_name: input_shape}
-        mod, params = relay.frontend.from_pytorch(scripted_model,
-                                                  shape_list)
-    elif name == 'squeezenet_v1.1':
-        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
-    elif name == 'inception_v3':
-        input_shape = (batch_size, 3, 299, 299)
-        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'mxnet':
-        # an example for mxnet model
-        from mxnet.gluon.model_zoo.vision import get_model
-        block = get_model('resnet18_v1', pretrained=True)
-        mod, params = relay.frontend.from_mxnet(block, shape={"input_name": input_shape}, dtype=dtype)
-        net = mod["main"]
-        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
-        mod = relay.Module.from_expr(net)
-    elif name == 'tflite-mobilenet-v2' or name == 'tflite-resnet-v2-50':
-        try:
-            import tflite.Model
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-        input_name = "input"
-        input_shape = (1, 224, 224, 3)
-        output_shape = (1, 1001)
-        input_dtype = "float32"
-        tflite_model_buf = open(network_path, "rb").read()
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-        mod, params = relay.frontend.from_tflite(tflite_model,
-                                                 shape_dict={input_name: input_shape},
-                                                 dtype_dict={input_name: input_dtype})
-    elif name == 'pytorch-mobilenet-v2':
-        import torch
-
-        model = torch.hub.load('pytorch/vision:v0.5.0', 'mobilenet_v2', pretrained=False)
-        model.eval()
-
-        input_shape = [batch_size, 3, 224, 224]
-        input_data = torch.randn(input_shape)
-        scripted_model = torch.jit.trace(model, input_data).eval()
-
-        input_name = 'input0'
-        shape_list = {input_name: input_shape}
-        mod, params = relay.frontend.from_pytorch(scripted_model,
-                                                  shape_list)
-    elif name == 'bert':
-        import tensorflow as tf
-
-        bert_pb = './baseline/tensorflow/tf_models/bert/bert-B%d.pb' % batch_size
-        try:
-            with tf.compat.v1.gfile.GFile(bert_pb, 'rb') as f:
-                graph_def = tf.compat.v1.GraphDef()
-                graph_def.ParseFromString(f.read())
-        except:
-            raise ValueError("Need to run ./baseline/tensorflow/bert/generate_bert_pb.py to get model first")
-
-        input_shape = (batch_size, 128)
-        input_name = ['input']
-        shape_dict = {
-            'input': input_shape
-        }
-        out_names = [
-            'bert/pooler/dense/Tanh'
-        ]
-
-        mod, params = relay.frontend.from_tensorflow(graph_def,
-                                                    shape=shape_dict,
-                                                    outputs=out_names)
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return mod, params, input_name, input_shape, output_shape
-
-
-def create_module(data_shape, graph, lib, target, input_name, params, debug_profile,
-        local_measure, ndk_cc, rpc_device_key, rpc_host, rpc_port, rpc_num_threads, seed=43):
-    if local_measure:
-        if target.target_name == "cuda":
-            ctx = tvm.gpu()
-        else:
-            ctx = tvm.cpu()
-    else:
-        print("=============== Request Remote ===============")
-        if 'TVM_NDK_CC' not in os.environ:
-            os.environ['TVM_NDK_CC'] = ndk_cc
-        remote = request_remote(rpc_device_key, rpc_host, rpc_port)
-
-        print("=============== Export ===============")
-        ctx = remote.cpu()
-        temp = util.tempdir()
-        path_lib = temp.relpath("deploy_lib.so")
-        lib.export_library(path_lib, ndk.create_shared)
-
-        print("=============== Upload ===============")
-        remote.upload(path_lib)
-
-        print("=============== Load ===============")
-        lib = remote.load_module("deploy_lib.so")
-
-        if rpc_num_threads:
-            config_threadpool = remote.get_function('runtime.config_threadpool')
-            config_threadpool(0, rpc_num_threads)
-
-    np.random.seed(seed)
-    data_tvm = tvm.nd.array(100 * (np.random.uniform(size=data_shape)).astype(dtype), ctx=ctx)
-    if debug_profile:
-        module = debug_runtime.create(graph, lib, ctx)
-    else:
-        module = runtime.create(graph, lib, ctx)
-
-    if type(input_name) == list:
-        for name in input_name:
-            module.set_input(name, data_tvm)
-    else:
-        module.set_input(input_name, data_tvm)
-    for k, v in params.items():
-        module.set_input(k, v)
-
-    return module, ctx
-
-
-def tune_and_evaluate(network_arguments, target, target_host,
-                      search_policy, task_scheduler_arguments, tune_option_arguments,
-                      tune, debug_profile, check_correctness, log_n_lines):
-    # Extract tasks from relay program
-    mod, params, input_name, data_shape, out_shape = get_network(**network_arguments)
-
-    # Tune all
-    if tune:
-        print("=============== Extract Workloads ===============")
-        workloads, wkl_weights = ansor.extract_from_program(mod, target=target, params=params)
-        print("Extract %d workloads in total" % (len(workloads)))
-
-        # Tune workloads with auto scheduler
-        print("=============== Tune ===============")
-        tasks = []
-        for i, wkl_key in enumerate(workloads):
-            dag = ansor.workload_key_to_dag(wkl_key)
-            print("[========= Task %d =========]\n" % i, dag)
-            tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host))
-
-        tuner = ansor.SimpleTaskScheduler(tasks,
-            lambda costs: sum(c * w for c, w in zip(costs, wkl_weights)),
-            **task_scheduler_arguments)
-        tune_option, measure_ctx = create_tune_option(target, **tune_option_arguments)
-
-        if tune_option_arguments['local_measure'] and target.target_name != 'cuda':
-            os.environ['TVM_BIND_MASTER_CORE_0'] = "1"
-        tuner.tune(tune_option, search_policy)
-
-        if measure_ctx:
-            del measure_ctx
-
-    kernel_layout_rewrite = True
-
-    # Compile graph with best states found by auto-scheduler
-    print("=============== Compile ===============")
-    with ansor.apply_history_best(tune_option_arguments['log_file'], log_n_lines):
-        os.environ['TVM_AUTO_CACHE_FLUSH'] = "0"
-
-        if kernel_layout_rewrite:
-            ansor.prepare_layout_rewrite(mod, target=target, params=params)
-        else:
-            # disable layout rewrite
-            ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-            ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-
-        with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
-            graph, lib, opt_params = relay.build_module.build(
-                mod, target=target, params=params)
-
-        ansor.finish_layout_rewrite()
-        print("=============== Compile Finish ===============")
-
-        module, ctx = create_module(data_shape, graph, lib, target, input_name,
-                                    opt_params, debug_profile, **common_measure_parameters)
-
-        # Evaluate
-        print("========== Evaluate ==========")
-        ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=3)
-        prof_res = np.array(ftimer().results)
-
-        # display profile information
-        if debug_profile or check_correctness:
-            module.run()
-            if check_correctness:
-                actual_output = module.get_output(0).asnumpy()
-                print(actual_output)
-
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res) * 1000, np.std(prof_res) * 1000))
-        #log_line(BenchmarkRecord(target.target_name, 'gpu' if target.target_name == 'cuda' else 'cpu', 'network',
-        #                         "%s.B%d" % (network_name, batch_size), 'AutoSchedule', layout,
-        #                         {"costs": prof_res}, time.time()), record_file)
-
-    if check_correctness:
-        print("========== Check Correctness ==========")
-        # clean relay cache
-        relay.backend.compile_engine.get().clear()
-
-        # disable layout rewrite
-        ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-        ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-        target = tvm.target.create('llvm')
-        with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
-            graph, lib, opt_params = relay.build_module.build(
-                mod, target=target, params=params)
-
-        module, _ = create_module(data_shape, graph, lib, target, input_name,
-                                  opt_params, debug_profile, **common_measure_parameters)
-        module.run()
-
-        expected_output = module.get_output(0).asnumpy()
-        np.testing.assert_allclose(actual_output, expected_output, rtol=1e-3, atol=1e-3)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    # Search task related arguments
-    parser.add_argument("--network", type=str, required=True)
-    parser.add_argument("--network-path", type=str, default=None, help="The path of tflite model")
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument("--layout", type=str, default='NHWC')
-    parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2')
-    parser.add_argument("--target-host", type=str, default=None)
-    parser.add_argument("--check-correctness", type=str2bool, nargs='?', const=True, default=False)
-    parser.add_argument("--debug-profile", type=str2bool, nargs='?', const=True, default=False)
-    parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True)
-
-    # Search strategy related arguments
-    parser.add_argument("--n-trials", type=int, default=1000)
-    parser.add_argument("--policy", type=str, choices=['sketch'], default='sketch')
-    parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb')
-    parser.add_argument("--task-scheduler", type=str, default='gradient',
-                        choices=['no', 'gradient', 'round-robin'],
-                        help='The strategy of task scheduler')
-    parser.add_argument("--seed", type=int, default=0, help='random seed')
-
-    # Log file related arguments
-    parser.add_argument("--log-file", type=str, help="Write measurement records to this log file")
-    parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search")
-    parser.add_argument("--log-n-lines", type=int, help="Only load the first n lines for history log")
-    parser.add_argument("--load-model", type=str, help="Load pre trained cost model file")
-
-    # Measurement related and other arguments
-    parser.add_argument("--num-measure-per-iter", type=int, default=48,
-                        help="The number of programs to be measured at each iteration")
-    parser.add_argument("--build-timeout", type=int, default=10)
-    parser.add_argument("--run-timeout", type=int, default=10)
-    parser.add_argument("--early-stopping", type=int, default=-1)
-    parser.add_argument("--verbose", type=int, default=1)
-    parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--rpc-device-key", type=str, default=None)
-    parser.add_argument("--rpc-host", type=str, default='0.0.0.0')
-    parser.add_argument("--rpc-port", type=int, default=9190)
-    parser.add_argument("--rpc-num-threads", type=int, default=None)
-    parser.add_argument("--n-parallel", type=int, default=1)
-    parser.add_argument("--ndk-cc", type=str, default=None)
-    args = parser.parse_args()
-
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logging.basicConfig()
-    logging.getLogger('ansor').setLevel(logging.DEBUG)
-    os.environ["TOPHUB_LOCATION"] = "NONE"  # disable autotvm
-
-    target = tvm.target.create(args.target)
-    log_file = args.log_file or "%s-B%d-%s.json" % (args.network, args.batch_size,
-                                                    target.target_name)
-    load_log_file = args.load_log or log_file
-    search_policy = "%s.%s" % (args.policy, args.model_type)
-    if args.layout:
-        layout = args.layout
-    elif target.target_name == "cuda":
-        layout = "NCHW"
-    else:
-        layout = "NHWC"
-
-    network_arguments = {
-        'name': args.network,
-        'network_path': args.network_path,
-        'batch_size': args.batch_size,
-        'layout': layout
-    }
-
-    task_scheduler_parameters = {
-        'strategy': args.task_scheduler,
-        'load_log_file': load_log_file,
-        'load_model_file': args.load_model,
-        'verbose': args.verbose,
-    }
-
-    common_measure_parameters = {
-        'local_measure': args.local_measure,
-        'rpc_device_key': args.rpc_device_key,
-        'rpc_host': args.rpc_host,
-        'rpc_port': args.rpc_port,
-        'rpc_num_threads': args.rpc_num_threads,
-        'ndk_cc': args.ndk_cc,
-    }
-
-    tune_option_arguments = {
-        'log_file': log_file,
-        'n_trials': args.n_trials,
-        'num_measure_per_iter': args.num_measure_per_iter,
-        'verbose': args.verbose,
-        'n_parallel': args.n_parallel,
-        'build_timeout': args.build_timeout,
-        'run_timeout': args.run_timeout,
-        'early_stopping': args.early_stopping,
-        **common_measure_parameters
-    }
-
-    tune_and_evaluate(network_arguments, target, args.target_host,
-                      search_policy, task_scheduler_parameters, tune_option_arguments,
-                      args.tune, args.debug_profile, args.check_correctness,
-                      args.log_n_lines)
diff --git a/scripts/tune_op_subgraph.py b/scripts/tune_op_subgraph.py
deleted file mode 100644
index d3e70501873e..000000000000
--- a/scripts/tune_op_subgraph.py
+++ /dev/null
@@ -1,602 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Tune all workloads for single op & subgraph evaluation"""
-import argparse
-import logging
-import random
-
-import numpy as np
-
-import tvm
-from tvm import te, ansor
-import topi
-from topi.nn.winograd_util import winograd_transform_matrices
-from topi.util import get_const_tuple
-
-from common import measure_schedule, str2bool, norm_bmn, conv2d_nhwc_bn_relu, conv2d_nchw_bn_relu
-from shape_configs import single_op_shape_dict, subgraph_shape_dict
-from tune_test import tune_workloads_jointly, replay_workload, create_tune_option
-
-# ========================== Single Ops ==========================
-
-@ansor.register_workload_func
-def batch_matmul_nkkm(B, N, M, K):
-    X = te.placeholder((B, N, K), name='A')
-    Y = te.placeholder((B, K, M), name='B')
-    k = te.reduce_axis((0, K), name='k')
-    Z = te.compute((B, N, M), lambda b, i, j: te.sum(X[b][i][k] * Y[b][k][j], axis=[k]), name='C')
-    return [X, Y, Z]
-
-@ansor.register_workload_func
-def conv1d_nlc(N, L, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, L, CI), name='inputs')
-    weight = te.placeholder((kernel_size, CI//groups, CO), name='weight')
-
-    batch_size, in_len, in_channel = inputs.shape
-    k_len, channel_per_group, out_channel = weight.shape
-    out_channel_per_group = out_channel // groups
-    out_len = (in_len + 2 * padding - dilation * (k_len - 1) - 1) // stride + 1
-    rc = te.reduce_axis((0, channel_per_group), name='rc')
-    rl = te.reduce_axis((0, k_len), name='rl')
-
-    padded = topi.nn.pad(inputs, [0, padding, 0])
-    output = te.compute(
-        (batch_size, out_len, out_channel),
-        lambda n, l, co: te.sum(
-            (padded[n, l * stride + rl * dilation, co // out_channel_per_group * channel_per_group + rc] *
-             weight[rl, rc, co]), axis=[rl, rc]),
-        name='conv1d_nlc'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, H, W, CI), name='inputs')
-    weight = te.placeholder((kernel_size, kernel_size, CI//groups, CO), name='weight')
-    batch_size, in_h, in_w, in_channel = inputs.shape
-    k_h, k_w, channel_per_group, out_channel = weight.shape
-    out_channel_per_group = out_channel // groups
-
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rh = te.reduce_axis((0, k_h), name="rh")
-    rw = te.reduce_axis((0, k_w), name="rw")
-    rc = te.reduce_axis((0, channel_per_group), name="rc")
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, 0])
-    output = te.compute(
-        (batch_size, out_h, out_w, out_channel),
-        lambda n, h, w, co: te.sum(
-            (padded[n, h * stride + rh * dilation, w * stride + rw * dilation,
-                    co // out_channel_per_group * channel_per_group + rc]
-             * weight[rh, rw, rc, co]), axis=[rh, rw, rc]
-        ),
-        name='conv2d_nhwc'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_nchw(N, CI, H, W, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, CI, H, W), name='inputs')
-    weight = te.placeholder((CO, CI//groups, kernel_size, kernel_size), name='weight')
-    batch_size, in_channel, in_h, in_w = inputs.shape
-    out_channel, channel_per_group, k_h, k_w, = weight.shape
-    out_channel_per_group = out_channel // groups
-
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rc = te.reduce_axis((0, channel_per_group), name="rc")
-    rh = te.reduce_axis((0, k_h), name="rh")
-    rw = te.reduce_axis((0, k_w), name="rw")
-
-    padded = topi.nn.pad(inputs, [0, 0, padding, padding])
-    output = te.compute(
-        (batch_size, out_channel, out_h, out_w),
-        lambda n, co, h, w: te.sum(
-            (padded[n, co // out_channel_per_group * channel_per_group + rc,
-                    h * stride + rh * dilation, w * stride + rw * dilation]
-             * weight[co, rc, rh, rw]), axis=[rc, rh, rw]
-        ),
-        name='conv2d_nchw'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv3d_ndhwc(N, D, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, D, H, W, CI))
-    weight = te.placeholder((kernel_size, kernel_size, kernel_size, CI//groups, CO))
-    batch_size, in_d, in_h, in_w, in_channel = inputs.shape
-    k_d, k_h, k_w, channel_per_group, out_channel = weight.shape
-    out_channel_per_group = out_channel // groups
-
-    out_d = (in_d + 2 * padding - dilation * (k_d - 1) - 1) // stride + 1
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rd = te.reduce_axis((0, k_d), name='rd')
-    rh = te.reduce_axis((0, k_h), name='rh')
-    rw = te.reduce_axis((0, k_w), name='rw')
-    rc = te.reduce_axis((0, channel_per_group), name='rc')
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, padding, 0])
-    output = te.compute(
-        (batch_size, out_d, out_h, out_w, out_channel),
-        lambda n, d, h, w, co: te.sum(
-            (padded[n, d * stride + rd * dilation,
-                    h * stride + rh * dilation, w * stride + rw * dilation,
-                    co // out_channel_per_group * channel_per_group + rc]
-             * weight[rd, rh, rw, rc, co]),
-            axis=[rd, rh, rw, rc]
-        ),
-        name='conv3d_ndhwc'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def depthwise_conv2d_nhwc(N, H, W, C, kernel_size, stride=1, padding=0, dilation=1, factor=1):
-    inputs = te.placeholder((N, H, W, C))
-    weight = te.placeholder((factor, kernel_size, kernel_size, C))
-
-    batch_size, in_h, in_w, in_channel = inputs.shape
-    factor, k_h, k_w, in_channel = weight.shape
-    out_channel = in_channel * factor
-
-    assert factor.value == 1, "Not optimized for factor != 1"
-
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rh = te.reduce_axis((0, k_h), name='rh')
-    rw = te.reduce_axis((0, k_w), name='rw')
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, 0])
-    output = te.compute(
-        (batch_size, out_h, out_w, out_channel),
-        lambda n, h, w, c: te.sum(
-            (padded[n,  h * stride + rh * dilation, w * stride + rw * dilation, c // factor]
-             * weight[c % factor, rh, rw, c // factor]),
-            axis=[rh, rw]
-        ),
-        name="depth_conv2d_nhwc"
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_transpose_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0):
-    inputs = te.placeholder((N, H, W, CI), name='inputs')
-    weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight')
-
-    batch, in_h, in_w, in_c = inputs.shape
-    filter_h, filter_w, in_c, out_c = weight.shape
-    stride_h, stride_w = (stride, stride)
-
-    # compute padding
-    fpad_top, fpad_left, fpad_bottom, fpad_right = topi.nn.get_pad_tuple(padding, (filter_h, filter_w))
-    bpad_top = filter_h - 1 - fpad_top
-    bpad_bottom = filter_h - 1 - fpad_bottom
-    bpad_left = filter_w - 1 - fpad_left
-    bpad_right = filter_w - 1 - fpad_right
-
-    # padding stage
-    padded = topi.nn.pad(inputs,
-                         [0, (bpad_top + stride_h - 1) // stride_h,
-                          (bpad_left + stride_w - 1) // stride_w, 0],
-                         [0, (bpad_bottom + stride_h - 1) // stride_h,
-                          (bpad_right + stride_w - 1) // stride_w, 0])
-
-    # remove extra padding introduced by dilatation
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    border_h = idxmod(stride_h - idxmod(bpad_top, stride_h), stride_h)
-    border_w = idxmod(stride_w - idxmod(bpad_left, stride_w), stride_w)
-
-    # dilation stage
-    strides = [1, stride_h, stride_w, 1]
-    n = len(padded.shape)
-
-    # We should embed this dilation directly into te.compute rather than creating a new te.compute.
-    # Only in this way can we use unroll to eliminate the multiplication of zeros.
-    def _dilate(*indices):
-        not_zero = []
-        index_tuple = []
-        for i in range(n):
-            if not strides[i] == 1:
-                index_tuple.append(idxdiv(indices[i], strides[i]))
-                not_zero.append(idxmod(indices[i], strides[i]).equal(0))
-            else:
-                index_tuple.append(indices[i])
-        if not_zero:
-            not_zero = te.all(*not_zero)
-            return te.if_then_else(not_zero, padded(*index_tuple), tvm.tir.const(0.0, padded.dtype))
-        return padded(*index_tuple)
-
-    # convolution stage
-    out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
-    out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
-    rc = te.reduce_axis((0, in_c), name='rc')
-    rh = te.reduce_axis((0, filter_h), name='rh')
-    rw = te.reduce_axis((0, filter_w), name='rw')
-
-    output = te.compute(
-        (batch, out_h, out_w, out_c),
-        lambda n, h, w, co: te.sum(
-            _dilate(n, h + rh + border_h, w + rw + border_w, rc) *
-            weight[filter_h - 1 - rh, filter_w - 1 - rw, rc, co],
-            axis=[rh, rw, rc]),
-        name="conv2d_transpose_nhwc",
-        attrs={"ansor_always_unroll_inner": ["h", "w", "rh", "rw", "h_c", "w_c"]})
-    # todo(lmzheng): add constraints on the tile size of h and w
-
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_capsule_nhwijc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, capsule_size=4):
-    inputs = te.placeholder((N, H, W, capsule_size, capsule_size, CI), name='inputs')
-    weight = te.placeholder((kernel_size, kernel_size, capsule_size, capsule_size, CI, CO), name='weight')
-    batch_size, in_h, in_w, _, _, in_channel = inputs.shape
-    k_h, k_w, _, _, _, out_channel = weight.shape
-
-    out_h = (in_h + 2 * padding - kernel_size) // stride + 1
-    out_w = (in_w + 2 * padding - kernel_size) // stride + 1
-
-    rh = te.reduce_axis((0, k_h), name="rh")
-    rw = te.reduce_axis((0, k_w), name="rw")
-    cap_k = te.reduce_axis((0, capsule_size), name='cap_k')
-    rc = te.reduce_axis((0, in_channel), name="rc")
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, 0, 0, 0])
-    output = te.compute(
-        (batch_size, out_h, out_w, capsule_size, capsule_size, out_channel),
-        lambda n, h, w, cap_i, cap_j, co: te.sum(
-            (padded[n, h * stride + rh, w * stride + rw, cap_i, cap_k, rc]
-             * weight[rh, rw, cap_k, cap_j, rc, co]), axis=[rh, rw, cap_k, rc]
-        ),
-        name='conv2d_capsule_nhwijc'
-    )
-    return [inputs, weight, output]
-
-
-@ansor.register_workload_func
-def conv2d_winograd_nhwc(N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1):
-    # TODO: implement tile_size
-    tile_size = 4 #_infer_tile_size(data, kernel)
-    inputs = te.placeholder((N, H, W, CI), name='inputs')
-    #weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight')
-    N, H, W, CI = get_const_tuple(inputs.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    # if dilation_h != 1 or dilation_w != 1:
-    #     weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w))
-    KH = KW = kernel_size
-    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
-    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
-    assert HSTR == 1 and WSTR == 1 and KH == KW
-
-    data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, 'float32')
-
-    H = (H + 2 * HPAD - KH) // HSTR + 1
-    W = (W + 2 * WPAD - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-    r_kh = te.reduce_axis((0, KH), name='r_kh')
-    r_kw = te.reduce_axis((0, KW), name='r_kw')
-    # kernel_pack = te.compute((alpha, alpha, CO, CI), lambda eps, nu, co, ci:
-    #                           weight[0][0][0][0],
-    #                           name='kernel_pack')
-    kshape = (alpha, alpha, CO, CI)
-    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")
-
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    # pack input tile
-    input_tile = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci:
-                             data_pad[idxdiv(p, (nH * nW))][idxmod(idxdiv(p, nW), nH) * m + eps]
-                                     [idxmod(p, nW) * m + nu][ci], name='input_tile',)
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    data_pack = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci:
-                            te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu],
-                                    axis=[r_a, r_b]), name='data_pack',
-                                    attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_last_split_is_one": ["ci", "p"],
-                                           "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_no_cache_write": "True",
-                                           })
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name='ci')
-    bgemm = te.compute((alpha, alpha, P, CO), lambda eps, nu, p, co:
-                        te.sum(data_pack[eps][nu][p][ci] *
-                                kernel_pack[eps][nu][co][ci],
-                                axis=[ci]), name='bgemm')
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    inverse = te.compute((m, m, P, CO), lambda vh, vw, p, co:
-                          te.sum(bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw],
-                                  axis=[r_a, r_b]), name='inverse',
-                          attrs={"ansor_no_split_at_inner": ["vh", "vw", "r_a", "r_b"],
-                                 "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"],
-                                 "ansor_last_split_is_one": ["co", "p"],
-                                 "ansor_no_cache_write": "True",
-                                 })
-
-    # output
-    output = te.compute((N, H, W, CO), lambda n, h, w, co:
-                         inverse[idxmod(h, m),
-                                 idxmod(w, m),
-                                 n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
-                                 co],
-                         name='conv2d_winograd',
-                         tag='conv2d_winograd_nhwc',
-                         attrs={"ansor_no_split_at_outer": ["n", "h", "w", "co"],})
-    return [inputs, kernel_pack, output]
-
-@ansor.register_workload_func
-def conv2d_winograd_nchw(N, CI, H, W, CO, kernel_size=3, stride=1, padding=0, dilation=1, precompute=False):
-    # TODO: implement tile_size
-    tile_size = 4 #_infer_tile_size(data, kernel)
-    inputs = te.placeholder((N, CI, H, W), name='inputs')
-    #weight = te.placeholder((CO, CI, kernel_size, kernel_size), name='weight')
-    N, CI, H, W = get_const_tuple(inputs.shape)
-    # if isinstance(dilation, int):
-    #     dilation_h = dilation_w = dilation
-    # else:
-    #     dilation_h, dilation_w = dilation
-    # if dilation_h != 1 or dilation_w != 1:
-    #     weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w))
-    KH = KW = kernel_size
-    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
-    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
-    assert HSTR == 1 and WSTR == 1 and KH == KW
-
-    data_pad = topi.nn.pad(inputs, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, 'float32')
-
-    H = (H + 2 * HPAD - KH) // HSTR + 1
-    W = (W + 2 * WPAD - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-    r_kh = te.reduce_axis((0, KH), name='r_kh')
-    r_kw = te.reduce_axis((0, KW), name='r_kw')
-    # kernel_pack = te.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
-    #                           weight[0][0][0][0],
-    #                           name='kernel_pack')
-    kshape = (alpha, alpha, CI, CO)
-    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")
-
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    # pack input tile
-    input_tile = te.compute((CI, P, alpha, alpha), lambda ci, p, eps, nu:
-                             data_pad[idxdiv(p, (nH * nW))][ci][idxmod(idxdiv(p, nW), nH) * m + eps]
-                                     [idxmod(p, nW) * m + nu], name='input_tile')
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    data_pack = te.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p:
-                            te.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
-                                    axis=[r_a, r_b]), name='data_pack',
-                                    attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_no_split_at_outer": ["ci", "p"],
-                                           "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_no_cache_write": "True",
-                                           })
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name='ci')
-    bgemm = te.compute((alpha, alpha, CO, P), lambda eps, nu, co, p:
-                        te.sum(data_pack[eps][nu][ci][p] *
-                                kernel_pack[eps][nu][ci][co],
-                                axis=[ci]), name='bgemm')
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    inverse = te.compute((CO, P, m, m), lambda co, p, vh, vw:
-                          te.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
-                                  axis=[r_a, r_b]), name='inverse',
-                          attrs={"ansor_no_split_at_outer": ["co", "p", "vh", "vw", "r_a", "r_b"],
-                                 "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"],
-                                 "ansor_no_cache_write": "True"})
-
-    # output
-    output = te.compute((N, CO, H, W), lambda n, co, h, w:
-                         inverse[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
-                                 idxmod(h, m),
-                                 idxmod(w, m)],
-                         name='conv2d_winograd',
-                         attrs={"ansor_no_split_at_outer": ["n", "co", "h", "w"],})
-    return [inputs, kernel_pack, output]
-
-# ========================== Subgraphs ==========================
-
-@ansor.register_workload_func
-def transpose_batch_matmul(batch, seq_len, n_head, n_dim):
-    query = te.placeholder((batch, seq_len, n_head, n_dim), name='query')
-    value = te.placeholder((batch, seq_len, n_head, n_dim), name='value')
-    query_T = te.compute((batch, n_head, seq_len, n_dim),
-                      lambda b, h, l, d: query[b, l, h, d], name="query_T")
-    value_T = te.compute((batch, n_head, n_dim, seq_len),
-                      lambda b, h, d, l: value[b, l, h, d], name="value_T")
-    k = te.reduce_axis((0, n_dim), name='k')
-    out = te.compute((batch, n_head, seq_len, seq_len),
-                 lambda b, h, i, j: te.sum(query_T[b][h][i][k] * value_T[b][h][k][j], axis=[k]),
-                 name='C')
-    return [query, value, out]
-
-# ========================== Tune function & Task dicts ==========================
-
-def tune_wkl(task_func_dict, shape_dict, wkl_type, args):
-    target = tvm.target.create(args.target)
-
-    for wkl_meta_name, func in task_func_dict.items():
-        if not args.wkl in ["all", wkl_type, wkl_meta_name]:
-            continue
-
-        log_file = args.log_file or wkl_meta_name + ".json"
-        wkl_keys = []
-        for shape in shape_dict[wkl_meta_name]:
-            if shape[0] == 1:
-                shape = list(shape)
-                shape[0] = args.batch_size
-
-            wkl_key = ansor.make_workload_key_func(func, shape)
-            wkl_keys.append(wkl_key)
-            if args.fast_check:
-                break
-
-            if not args.tune:
-                cost, gflops = replay_workload(
-                        wkl_key, target, args.target_host, log_file,
-                        args.local_measure, args.rpc_device_key, args.rpc_host,
-                        args.rpc_port, args.rpc_num_threads, args.ndk_cc, False)
-                # log_line(BenchmarkRecord(target.name, 'gpu' if target.name == 'cuda' else 'cpu', 'subgraph',
-                #                          workload_name, "AutoSchedule", "default",
-                #                          {"costs": [cost]}, time.time()), args.out_file)
-
-        if args.tune:
-            print("========== Tune for %s (%d shapes) ========== " % (wkl_meta_name, len(wkl_keys)))
-
-            load_log_file = args.load_log or log_file
-            n_trials = args.n_trials_per_shape * len(wkl_keys)
-
-            tune_option, measure_ctx = create_tune_option(target, log_file,
-                    n_trials, args.num_measure_per_iter, args.verbose,
-                    args.n_parallel, args.build_timeout, args.local_measure,
-                    args.rpc_device_key, args.rpc_host, args.rpc_port,
-                    args.rpc_num_threads, args.ndk_cc)
-
-            # tune workloads jointly using JointTuner
-            tune_workloads_jointly(wkl_keys, np.ones(len(wkl_keys)), args.task_scheduler,
-                                   target, args.target_host, args.policy, args.model_type,
-                                   args.load_model, load_log_file, tune_option)
-
-            if measure_ctx:
-                del measure_ctx
-
-
-single_op_task_func_dict = {
-    'GMM': batch_matmul_nkkm,
-    'C1D': conv1d_nlc,
-    'C2D': conv2d_nhwc,
-    'C3D': conv3d_ndhwc,
-    'GRP': conv2d_nhwc,
-    'DIL': conv2d_nhwc,
-    'DEP': depthwise_conv2d_nhwc,
-    'T2D': conv2d_transpose_nhwc,
-    'CAP': conv2d_capsule_nhwijc,
-    'NRM': norm_bmn,
-    #'SMX': softmax_mn,
-
-#    The following workloads are not in our sinle op evaluation plan.
-#    They should be moved to `common.py` and be used by `tune_wkl.py`.
-#    'C2D_NCHW': conv2d_nchw,
-#    'C2DWG_NHWC': conv2d_winograd_nhwc,
-#    'C2DWG_NCHW': conv2d_winograd_nchw,
-#    'GMM_TC': matmul_nkkm,
-}
-
-subgraph_task_func_dict = {
-    'conv2d_bn_relu': conv2d_nhwc_bn_relu,
-    #'conv2d_bn_relu': conv2d_nchw_bn_relu,    # some old log uses conv2d_nchw_bn_relu
-    'transpose_batch_matmul': transpose_batch_matmul,
-}
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Search task related arguments
-    parser.add_argument("--wkl", type=str, required=True,
-                        help="all      - Tune all workloads; \
-                              op       - Tune all single ops; \
-                              subgraph - Tune all subgraphs; \
-                              specific wkl name - Tune a specific workload")
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2')
-    parser.add_argument("--target-host", type=str, default=None)
-    parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--fast-check", action='store_true',
-                        help='Only run one shape for each workload. This is used for fast checking')
-
-    # Search strategy related arguments
-    parser.add_argument("--n-trials-per-shape", type=int, default=1000)
-    parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch')
-    parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb')
-    parser.add_argument("--task-scheduler", type=str, default='round-robin',
-                        choices=['no', 'gradient', 'round-robin'], help='The strategy of task scheduler')
-    parser.add_argument("--seed", type=int, default=0, help='random seed')
-
-    # Log file related arguments
-    parser.add_argument("--log-file", type=str, help="Write measurement records to this log file")
-    parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search")
-    parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file")
-
-    # Measurement related and other arguments
-    parser.add_argument("--num-measure-per-iter", type=int, default=48,
-                        help="The number of programs to be measured at each iteration")
-    parser.add_argument("--build-timeout", type=int, default=10)
-    parser.add_argument("--run-timeout", type=int, default=60)
-    parser.add_argument("--verbose", type=int, default=1)
-    parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--rpc-device-key", type=str, default=None)
-    parser.add_argument("--rpc-host", type=str, default='0.0.0.0')
-    parser.add_argument("--rpc-port", type=int, default=9190)
-    parser.add_argument("--rpc-num-threads", type=int, default=None)
-    parser.add_argument("--n-parallel", type=int, default=1)
-    parser.add_argument("--ndk-cc", type=str, default=None)
-    args = parser.parse_args()
-
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logging.basicConfig()
-    logging.getLogger('ansor').setLevel(logging.DEBUG)
-
-    # compute the number of tasks
-    num_tasks = 0
-    for wkl_meta_name in single_op_task_func_dict:
-        if not args.wkl in ["all", "op", wkl_meta_name]:
-            continue
-        if args.fast_check:
-            num_tasks += 1
-        else:
-            num_tasks += len(single_op_shape_dict[wkl_meta_name])
-    for wkl_meta_name in subgraph_task_func_dict:
-        if not args.wkl in ["all", "subgraph", wkl_meta_name]:
-            continue
-        if args.fast_check:
-            num_tasks += 1
-        else:
-            num_tasks += len(subgraph_shape_dict[wkl_meta_name])
-    print("Number of tasks: %d\tTotal trials: %d" % (num_tasks, num_tasks * args.n_trials_per_shape))
-
-    # tune for tasks
-    tune_wkl(single_op_task_func_dict, single_op_shape_dict, "op", args)
-    tune_wkl(subgraph_task_func_dict, subgraph_shape_dict, "subgraph", args)
diff --git a/scripts/tune_test.py b/scripts/tune_test.py
deleted file mode 100644
index 6b39cf5e7865..000000000000
--- a/scripts/tune_test.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Use auto scheduler to tune workloads"""
-import argparse
-import logging
-import os
-import random
-
-import numpy as np
-
-import tvm
-from tvm import ansor, te
-from tvm.ansor.utils import request_remote
-
-from common import get_workload_keys, get_workload_weights, measure_schedule, str2bool
-
-def tensor_core_meet_condition(meta_policy, state, stage_id):
-    pass
-
-def intrin_wmma_load_matrix(scope):
-    n = 16
-    A = te.placeholder((n, n), name='A', dtype='float16')
-    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256)
-    C = te.compute((n, n), lambda i, j: A[i, j], name='C')
-    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-
-        BA = ins[0]
-        BC = outs[0]
-        ib.emit(tvm.tir.call_intrin('handle', 'tvm_load_matrix_sync',
-                                    BC.data, n, n, n, BC.elem_offset // 256,
-                                    BA.access_ptr('r'), n, 'row_major'))
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-@tvm._ffi.register_func
-def intrin_wmma_load_matrix_a():
-    return intrin_wmma_load_matrix("wmma.matrix_a")
-
-@tvm._ffi.register_func
-def intrin_wmma_load_matrix_b():
-    return intrin_wmma_load_matrix("wmma.matrix_b")
-
-@tvm._ffi.register_func
-def intrin_wmma_gemm():
-    n = 16
-    A = te.placeholder((n, n), name='A', dtype='float16')
-    B = te.placeholder((n, n), name='B', dtype='float16')
-    k = te.reduce_axis((0, n), name="k")
-    C = te.compute((n, n),
-                    lambda ii, jj:
-                    te.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k),
-                    name='C')
-    BA = tvm.tir.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256)
-    BB = tvm.tir.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256)
-    BC = tvm.tir.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256)
-
-    def intrin_func(ins, outs):
-        BA, BB = ins
-        BC, = outs
-
-        def init():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(tvm.tir.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0))
-            return ib.get()
-
-        def update():
-            ib = tvm.tir.ir_builder.create()
-            ib.emit(tvm.tir.call_intrin('handle', 'tvm_mma_sync',
-                                    BC.data, BC.elem_offset // 256,
-                                    BA.data, BA.elem_offset // 256,
-                                    BB.data, BB.elem_offset // 256,
-                                    BC.data, BC.elem_offset // 256))
-            return ib.get()
-
-        return update(), init(), update()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
-
-@tvm._ffi.register_func
-def intrin_wmma_store_matrix():
-    n = 16
-    A = te.placeholder((n, n), name='A', dtype='float32')
-    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256)
-    C = te.compute((n, n), lambda i, j: A[i, j], name='C')
-    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256)
-
-    def intrin_func(ins, outs):
-        ib = tvm.tir.ir_builder.create()
-        BA = ins[0]
-        BC = outs[0]
-        ib.emit(tvm.tir.call_intrin('handle', 'tvm_store_matrix_sync',
-                                BA.data, n, n, n, BA.elem_offset // 256,
-                                BC.access_ptr('w'), n, 'row_major'))
-        return ib.get()
-
-    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
-
-def tensor_core_apply(meta_policy, state, stage_id):
-    ret = []
-    state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag)
-
-    A, B, C = meta_policy.cur_task.compute_dag.ops
-
-    C_local = state.cache_write(C, "wmma.accumulator")
-
-    its0 = state.split(C_local, state[C_local].iters[0], [None, None])
-    split_step0 = state.transform_steps_size() - 1
-    its1 = state.split(C_local, state[C_local].iters[3], [None, None])
-    split_step1 = state.transform_steps_size() - 1
-    its2 = state.split(C_local, state[C_local].iters[8], [None])
-
-    state.reorder(C_local, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2],
-                            its2[0], its2[1],
-                            state[C_local].iters[6],
-                            state[C_local].iters[7],
-                            state[C_local].iters[10]])
-    state.fuse(C_local, [state[C_local].iters[0], state[C_local].iters[1]])
-    state.fuse(C_local, [state[C_local].iters[1], state[C_local].iters[2]])
-    state.fuse(C_local, [state[C_local].iters[2], state[C_local].iters[3]])
-
-    its0 = state.follow_split(C, state[C].iters[0], split_step0, 2)
-    its1 = state.follow_split(C, state[C].iters[3], split_step1, 2)
-    state.reorder(C, [its0[0], its1[0], its0[1], its1[1], its0[2], its1[2],
-                      state[C].iters[6], state[C].iters[7]])
-    state.fuse(C, [state[C].iters[0], state[C].iters[1]])
-    state.fuse(C, [state[C].iters[1], state[C].iters[2]])
-    local_write_pos = state.fuse(C, [state[C].iters[2], state[C].iters[3]])
-    state.compute_at(C_local, C, local_write_pos)
-    shared_read_pos = state[C_local].iters[3]
-    local_read_pos = state[C_local].iters[4]
-    state.bind_thread(C, state[C].iters[0], "blockIdx.x")
-    state.bind_thread(C, state[C].iters[1], "vthread")
-    state.bind_thread(C, state[C].iters[2], "threadIdx.x")
-
-    B_shared = state.cache_read(B, "shared", [C_local])
-    B_local = state.cache_read(B_shared, "wmma.matrix_b", [C_local])
-    state.compute_at(B_shared, C_local, shared_read_pos)
-    state.compute_at(B_local, C_local, local_read_pos)
-
-    it = state.fuse(B_shared, state[B_shared].iters[:])
-    its = state.split(B_shared, it, [4]) # vectorize add a callback check function
-    state.vectorize(B_shared, its[1])
-    its = state.follow_fused_split(B_shared, its[0], [split_step0, split_step1], 1, True)
-    state.bind_thread(B_shared, its[1], "threadIdx.x")
-
-    A_shared = state.cache_read(A, "shared", [C_local])
-    A_local = state.cache_read(A_shared, "wmma.matrix_a", [C_local])
-    state.compute_at(A_shared, C_local, shared_read_pos)
-    state.compute_at(A_local, C_local, local_read_pos)
-
-    it = state.fuse(A_shared, state[A_shared].iters[:])
-    its = state.split(A_shared, it, [4]) # vectorize add a callback check function
-    state.vectorize(A_shared, its[1])
-    its = state.follow_fused_split(A_shared, its[0], [split_step0, split_step1], 1, True)
-    state.bind_thread(A_shared, its[1], "threadIdx.x")
-
-    state.tensorize(A_local, state[A_local].iters[-2], "intrin_wmma_load_matrix_a")
-    state.tensorize(B_local, state[B_local].iters[-2], "intrin_wmma_load_matrix_b")
-    state.tensorize(C_local, state[C_local].iters[-3], "intrin_wmma_gemm")
-    state.tensorize(C, state[C].iters[-2], "intrin_wmma_store_matrix")
-
-    print(state)
-
-    ret.append([state.state_object, -1])
-    return ret
-
-def create_tune_option(target, log_file, n_trials, num_measure_per_iter, verbose,
-                       n_parallel, build_timeout, local_measure, rpc_device_key, rpc_host,
-                       rpc_port, rpc_num_threads, ndk_cc, early_stopping=-1, run_timeout=10,
-                       tensor_core_matmul=False):
-    builder = runner = measure_ctx = None
-    if local_measure:
-        builder = ansor.LocalBuilder(timeout=build_timeout)
-        if target.target_name == "cuda":
-            measure_ctx = ansor.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400)
-            runner = measure_ctx.runner
-        else:
-            os.environ['TVM_AUTO_CACHE_FLUSH'] = "1"
-            runner = ansor.LocalRunner(repeat=10, number=1, min_repeat_ms=0, timeout=run_timeout)
-    else:
-        os.environ['TVM_NDK_CC'] = ndk_cc
-        builder = ansor.LocalBuilder(timeout=build_timeout, build_func='ndk')
-        runner = ansor.RPCRunner(key=rpc_device_key, host=rpc_host, port=rpc_port,
-                                 timeout=run_timeout, n_parallel=n_parallel,
-                                 repeat=1, min_repeat_ms=200)
-        remote = request_remote(rpc_device_key, rpc_host, rpc_port)
-        if rpc_num_threads:
-            config_threadpool = remote.get_function('runtime.config_threadpool')
-            config_threadpool(0, rpc_num_threads)
-
-    pre_search_callbacks = [ansor.PreloadMeasuredStates(log_file)]
-    if tensor_core_matmul:
-        pre_search_callbacks.append(ansor.PreloadCustomSketchRule(tensor_core_meet_condition, tensor_core_apply))
-    tune_option = ansor.TuneOption(n_trials=n_trials, early_stopping=early_stopping,
-                                   num_measure_per_iter=num_measure_per_iter,
-                                   verbose=verbose,
-                                   builder=builder,
-                                   runner=runner,
-                                   measure_callbacks=[ansor.LogToFile(log_file)],
-                                   pre_search_callbacks=pre_search_callbacks)
-
-    return tune_option, measure_ctx
-
-
-def replay_workload(wkl_key, target, target_host, log_file,
-                    local_measure=True, rpc_device_key=None, rpc_host="0.0.0.0",
-                    rpc_port=9190, rpc_num_threads=None, ndk_cc=None,
-                    show_lower_result=True):
-    cost = gflops = None
-
-    inp, res = ansor.best_measure_pair_in_file(log_file, wkl_key, target)
-    if inp is None:
-        print("Cannot find log for: %s" % wkl_key)
-    else:
-        dag = ansor.workload_key_to_dag(inp.task.workload_key)
-        print("Found schedule for: %s" % wkl_key)
-
-        s, bufs = dag.apply_steps_from_state(inp.state)
-        if show_lower_result:
-            print(tvm.lower(s, bufs, simple_mode=True))
-
-        if local_measure:
-            remote = None
-        else:
-            remote = request_remote(rpc_device_key, rpc_host, rpc_port)
-            if rpc_num_threads:
-                config_threadpool = remote.get_function('runtime.config_threadpool')
-                config_threadpool(0, rpc_num_threads)
-
-        cost = np.mean((measure_schedule(s, bufs, target, target_host,
-                                         remote=remote, ndk_cc=ndk_cc)))
-        gflops = ansor.ComputeDAG(bufs).flop_ct / cost / 1e9
-        print("Best schedule: %.2f GFLOPS\tcost: %.3f ms" % (gflops, cost * 1e3))
-
-    return cost, gflops
-
-
-def tune_workload(wkl_key, target, target_host, policy, model_type,
-                  load_model_file, load_log_file, tune_option):
-    """Tune a workload"""
-
-    if False:
-        # Debug info. Print static analysis results from the access analyzer
-        dag = ansor.workload_key_to_dag(wkl_key)
-        print(dag.access_analyzer)
-        exit()
-
-    if model_type == 'xgb':
-        model = ansor.XGBModel()
-        if load_model_file:
-            print("Load pretrained model...")
-            model.load(load_model_file)
-        elif load_log_file:
-            model.load_log_file(load_log_file)
-    elif model_type == "random":
-        model = ansor.RandomModel()
-    else:
-        raise ValueError("Invalid model: " + model_type)
-
-    if policy == 'sketch':
-        policy = ansor.SketchSearchPolicy(program_cost_model=model)
-    elif policy == 'beam-search':
-        policy = ansor.SketchSearchPolicy(program_cost_model=model,
-                                          params={'use_beam_search': 1})
-    else:
-        raise ValueError("Invalid search policy: " + policy)
-
-    s, bufs = ansor.auto_schedule(wkl_key,
-                                  target=target, target_host=target_host,
-                                  search_policy=policy,
-                                  tune_option=tune_option)
-
-def tune_workloads_jointly(wkl_keys, weights, task_scheduler, target, target_host,
-                           search_policy, model_type, load_model_file, load_log_file,
-                           tune_option):
-    """Tune for multiple workloads together with TaksScheduler"""
-    tasks = []
-    for wkl_key in wkl_keys:
-        dag = ansor.workload_key_to_dag(wkl_key)
-        tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host))
-
-    def objective_func(costs):
-        return sum(c * w for c, w in zip(costs, weights))
-
-    tuner = ansor.SimpleTaskScheduler(tasks, objective_func, strategy=task_scheduler,
-                                      load_log_file=load_log_file, load_model_file=load_model_file)
-    search_policy = "%s.%s" % (search_policy, model_type)
-    tuner.tune(tune_option, search_policy)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Search task related arguments
-    parser.add_argument("--wkl", type=str, required=True)
-    parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2')
-    parser.add_argument("--target-host", type=str, default=None)
-    parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True)
-
-    # Search strategy related arguments
-    parser.add_argument("--n-trials", type=int, default=1000)
-    parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch')
-    parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb')
-    parser.add_argument("--task-scheduler", type=str, default='no',
-                        choices=['no', 'gradient', 'round-robin'],
-                        help='The strategy of task scheduler')
-    parser.add_argument("--seed", type=int, default=0, help='random seed')
-
-    # Log file related arguments
-    parser.add_argument("--log-file", type=str, help="Write measurement records to this log file")
-    parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search")
-    parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file")
-
-    # Measurement related and other arguments
-    parser.add_argument("--num-measure-per-iter", type=int, default=48,
-                        help="The number of programs to be measured at each iteration")
-    parser.add_argument("--build-timeout", type=int, default=10)
-    parser.add_argument("--run-timeout", type=int, default=60)
-    parser.add_argument("--verbose", type=int, default=1)
-    parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--rpc-device-key", type=str, default=None)
-    parser.add_argument("--rpc-host", type=str, default='0.0.0.0')
-    parser.add_argument("--rpc-port", type=int, default=9190)
-    parser.add_argument("--rpc-num-threads", type=int, default=None)
-    parser.add_argument("--n-parallel", type=int, default=1)
-    parser.add_argument("--ndk-cc", type=str, default=None)
-    args = parser.parse_args()
-
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logging.basicConfig()
-    logging.getLogger('ansor').setLevel(logging.DEBUG)
-
-    wkl_keys = get_workload_keys(args.wkl)
-    target = tvm.target.create(args.target)
-    log_file = args.log_file or args.wkl + ".json"
-
-    # Tune workloads
-    if args.tune:
-        load_log_file = args.load_log or log_file
-        weights = get_workload_weights(args.wkl)
-
-        # Special check for tensor core
-        wkl_key = args.wkl
-        wkl_key = wkl_key.split("-")
-        tensor_core_matmul = False
-        if wkl_key[0] == "matmul" and wkl_key[6] == "tc":
-            tensor_core_matmul = True
-
-        tune_option, measure_ctx = create_tune_option(target, log_file,
-            args.n_trials, args.num_measure_per_iter, args.verbose,
-            args.n_parallel, args.build_timeout, args.local_measure,
-            args.rpc_device_key, args.rpc_host, args.rpc_port, args.rpc_num_threads,
-            args.ndk_cc, tensor_core_matmul=tensor_core_matmul)
-
-        if args.task_scheduler == 'no':
-            # tune workloads one by one
-            for wkl_key in wkl_keys:
-                tune_workload(wkl_key, target, args.target_host, args.policy,
-                              args.model_type, args.load_model, load_log_file,
-                              tune_option)
-        else:
-            # tune workloads jointly with TaskScheduler
-            tune_workloads_jointly(wkl_keys, weights, args.task_scheduler,
-                                   target, args.target_host, args.policy,
-                                   args.model_type, args.load_model, load_log_file,
-                                   tune_option)
-        if measure_ctx:
-            del measure_ctx
-
-    # Replay the best found schedule
-    if len(wkl_keys) == 1 or not args.tune:
-        for wkl_key in wkl_keys:
-            replay_workload(wkl_key, target, args.target_host, log_file,
-                            args.local_measure, args.rpc_device_key, args.rpc_host,
-                            args.rpc_port, args.rpc_num_threads, args.ndk_cc)
diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc
index d3af64a4f576..4887ef0ee47d 100644
--- a/src/arith/rewrite_simplify.cc
+++ b/src/arith/rewrite_simplify.cc
@@ -132,13 +132,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const AddNode* op) {
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), ramp(b1 + x, s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) + ramp(b1, s1, lanes), ramp(x + b1, s1, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) + broadcast(y, lanes), broadcast(x + y, lanes));
-    if ((x + broadcast(y, lanes)).Match(ret)) {
-      if (auto ps = y.Eval().as<FloatImmNode>()) {
-        if (ps->value == 0.0) {
-          return x.Eval();
-        }
-      }
-    }
   }
 
   if (IsIndexType(op->dtype)) {
@@ -429,13 +422,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const MulNode* op) {
     TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), broadcast(x * y, lanes));
     TVM_TRY_REWRITE(ramp(b1, s1, lanes) * broadcast(x, lanes), ramp(b1 * x, s1 * x, lanes));
     TVM_TRY_REWRITE(broadcast(x, lanes) * ramp(b1, s1, lanes), ramp(b1 * x, s1 * x, lanes));
-    if ((broadcast(x, lanes) * y).Match(ret)) {
-      if (auto ps = x.Eval().as<FloatImmNode>()) {
-        if (ps->value == 0.0) {
-          return make_const(op->dtype, 0.0);
-        }
-      }
-    }
   }
 
   if (IsIndexType(op->dtype)) {
@@ -714,9 +700,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
   PrimExpr const_res = TryConstFold<FloorDiv>(op->a, op->b);
   if (const_res.defined()) return const_res;
   // Pattern var to match any expression
-  PVar<PrimExpr> w, x, y, z, b1;
+  PVar<PrimExpr> x, y, z, b1;
   // Pattern var match IntImm
-  PVar<IntImm> c1, c2, c3, c4;
+  PVar<IntImm> c1, c2, c3;
   // Pattern var for lanes in broadcast and ramp
   PVar<int> lanes;
 
@@ -781,11 +767,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
     TVM_TRY_REWRITE_IF(floordiv(max(y, x * c1), c2), max(floordiv(y, c2), x * floordiv(c1, c2)),
                        c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0);
 
-    TVM_TRY_REWRITE_IF(floordiv(x * c1 + y, c2), floordiv(x * c1, c2),
-                       c1.Eval()->value > 0 && c2.Eval()->value > 0 &&
-                       c2.Eval()->value % c1.Eval()->value == 0 &&
-                       CanProveGreaterEqual(-y.Eval(), -c1.Eval()->value + 1));
-
     // Rules involving 3-operands.
     TVM_TRY_REWRITE_IF(floordiv(x * c1 + y + z, c2), x * floordiv(c1, c2) + floordiv(y + z, c2),
                        c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0);
@@ -802,13 +783,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
     TVM_TRY_REWRITE_IF(floordiv(x + c1, c2), floordiv(x, c2) + floordiv(c1, c2),
                        c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0);
 
-    TVM_TRY_REWRITE_IF(floordiv(x * c1 + y * c2 + z, c3), floordiv(x * c1 + y * c2, c3),
-                       c1.Eval()->value > 0 && c2.Eval()->value > 0 && c3.Eval()->value > 0 &&
-                       c3.Eval()->value % c1.Eval()->value == 0 &&
-                       c3.Eval()->value % c2.Eval()->value == 0 &&
-                       CanProveGreaterEqual(-z.Eval(),
-                       std::max(-c1.Eval()->value, -c2.Eval()->value) + 1));
-
     TVM_TRY_REWRITE_IF(floordiv(x + y, x), floordiv(y, x) + 1, CanProveGreaterEqual(x.Eval(), 0));
 
     TVM_TRY_REWRITE_IF(floordiv(y + x, x), floordiv(y, x) + 1, CanProveGreaterEqual(x.Eval(), 0));
@@ -833,18 +807,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) {
                        CanProveGreaterEqual(z.Eval(), 0));
     TVM_TRY_REWRITE_IF(floordiv(y + z * x, z), floordiv(y, z) + x,
                        CanProveGreaterEqual(z.Eval(), 0));
-
-    // Rules involving 4-operands
-    TVM_TRY_REWRITE_IF(floordiv(w * c1 + x * c2 + y * c3 + z, c4),
-                       floordiv(w * c1 + x * c2 + y * c3, c4),
-                       c1.Eval()->value > 0 && c2.Eval()->value > 0 &&
-                       c3.Eval()->value > 0 && c4.Eval()->value > 0 &&
-                       c4.Eval()->value % c1.Eval()->value == 0 &&
-                       c4.Eval()->value % c2.Eval()->value == 0 &&
-                       c4.Eval()->value % c3.Eval()->value == 0 &&
-                       CanProveGreaterEqual(-z.Eval(),
-                       std::max(-c1.Eval()->value,
-                       std::max(-c2.Eval()->value, -c3.Eval()->value)) + 1));
   }
   return ret;
 }
@@ -856,9 +818,9 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
   if (const_res.defined()) return const_res;
 
   // Pattern var to match any expression
-  PVar<PrimExpr> w, x, y, z, b1;
+  PVar<PrimExpr> x, y, z, b1;
   // Pattern var match IntImm
-  PVar<IntImm> c1, c2, c3, c4;
+  PVar<IntImm> c1, c2;
   // Pattern var for lanes in broadcast and ramp
   PVar<int> lanes;
 
@@ -902,31 +864,6 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) {
     TVM_TRY_REWRITE_IF(floormod(x + y * c1, c2), floormod(x, c2),
                        c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0);
 
-    TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(x, floordiv(c2, c1)) * c1 + y,
-                       c1.Eval()->value > 0 && c2.Eval()->value > 0 &&
-                       c2.Eval()->value % c1.Eval()->value == 0 &&
-                       CanProveGreaterEqual(-y.Eval(), -c1.Eval()->value + 1));
-
-    // TODO(jcf94): For the next three rules, better use the max common factor
-    // of c1, c2, c3 to do the simplify
-    TVM_TRY_REWRITE_IF(floormod(x * c1 + y * c2 + z, c3),
-                       floormod(x * floordiv(c1, c2) + y, floordiv(c3, c2)) * c2 + z,
-                       c1.Eval()->value > 0 && c2.Eval()->value > 0 &&
-                       c3.Eval()->value > 0 &&
-                       c3.Eval()->value % c2.Eval()->value == 0 &&
-                       c1.Eval()->value % c2.Eval()->value == 0 &&
-                       CanProveGreaterEqual(-z.Eval(), -c2.Eval()->value + 1));
-
-    TVM_TRY_REWRITE_IF(floormod(w * c1 + x * c2 + y * c3 + z, c4),
-                       floormod(w * floordiv(c1, c3) + x * floordiv(c2, c3) + y,
-                                floordiv(c4, c3)) * c3 + z,
-                       c1.Eval()->value > 0 && c2.Eval()->value > 0 &&
-                       c3.Eval()->value > 0 && c4.Eval()->value > 0 &&
-                       c4.Eval()->value % c3.Eval()->value == 0 &&
-                       c1.Eval()->value % c3.Eval()->value == 0 &&
-                       c2.Eval()->value % c3.Eval()->value == 0 &&
-                       CanProveGreaterEqual(-z.Eval(), -c3.Eval()->value + 1));
-
     // try modular analysis
     if (floormod(x, c1).Match(ret)) {
       ModularSet mod = analyzer_->modular_set(x.Eval());
diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc
index 5b063eca4337..a192002825e6 100644
--- a/src/relay/analysis/type_solver.cc
+++ b/src/relay/analysis/type_solver.cc
@@ -219,7 +219,6 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
       return Type(nullptr);
     }
 
-    tt1 = tt2;
     tvm::Array<IndexExpr> shape;
     if (tt1->shape.size() != tt2->shape.size()) {
       this->solver_->ReportError(ErrorBuilder() << "tensor type `" << PrettyPrint(tt1) << "` has "
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 30269b85795f..ee5e291e3d53 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2455,60 +2455,6 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w]
     .set_support_level(5)
     .set_attr<FTVMCompute>("FTVMCompute", LayoutTransformCompute);
 
-// relay.kernel_layout_transform
-TVM_REGISTER_NODE_TYPE(KernelLayoutTransformAttrs);
-
-Array<te::Tensor> KernelLayoutTransformCompute(const Attrs& attrs,
-                                               const Array<te::Tensor>& inputs,
-                                               const Type& out_type) {
-  const auto* param = attrs.as<KernelLayoutTransformAttrs>();
-  CHECK(param != nullptr);
-  return Array<te::Tensor>{
-      topi::kernel_layout_transform(inputs[0], param->src_layout, param->dst_layout)
-  };
-}
-
-bool KernelLayoutTransformRel(const Array<Type>& types,
-                              int num_inputs,
-                              const Attrs& attrs,
-                              const TypeReporter& reporter) {
-  const auto* data = types[0].as<TensorTypeNode>();
-  CHECK(data != nullptr);
-  const KernelLayoutTransformAttrs* params = attrs.as<KernelLayoutTransformAttrs>();
-
-  Array<IndexExpr> dst_shape;
-  std::vector<std::string> dst_axes;
-
-  topi::parse_kernel_layout(params->dst_layout, &dst_shape, &dst_axes);
-
-  reporter->Assign(types[1], TensorType(dst_shape, data->dtype));
-  return true;
-}
-
-Expr MakeKernelLayoutTransform(Expr data,
-                               String src_layout,
-                               String dst_layout) {
-  auto attrs = make_object<KernelLayoutTransformAttrs>();
-  attrs->src_layout = std::move(src_layout);
-  attrs->dst_layout = std::move(dst_layout);
-  static const Op& op = Op::Get("kernel_layout_transform");
-  return Call(op, {data}, Attrs(attrs), {});
-}
-
-TVM_REGISTER_GLOBAL("relay.op._make.kernel_layout_transform")
-.set_body_typed(MakeKernelLayoutTransform);
-
-RELAY_REGISTER_OP("kernel_layout_transform")
-    .describe(R"code(Transform the input kernel layout.
-)code" TVM_ADD_FILELINE)
-    .set_attrs_type<KernelLayoutTransformAttrs>()
-    .set_num_inputs(1)
-    .add_argument("data", "Tensor", "The input tensor.")
-    .add_type_rel("kernel_layout_transform", KernelLayoutTransformRel)
-    .set_support_level(5)
-    .set_attr<FTVMCompute>("FTVMCompute", KernelLayoutTransformCompute);
-
-
 /* relay._contrib_reverse_reshape */
 Expr MakeReverseReshape(Expr data, Array<Integer> newshape) {
   auto attrs = make_object<ReshapeAttrs>();
diff --git a/src/relay/transforms/defuse_ops.cc b/src/relay/transforms/defuse_ops.cc
deleted file mode 100644
index 1a108fb08888..000000000000
--- a/src/relay/transforms/defuse_ops.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/attrs/transform.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/transform.h>
-#include <tvm/te/operation.h>
-
-#include <functional>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "pattern_util.h"
-
-namespace tvm {
-namespace relay {
-
-class DefuseOpsMutator : public ExprMutator {
- public:
-  class FuncBodyMutator : public ExprMutator {
-   public:
-    Array<Expr> args_;
-
-    explicit FuncBodyMutator(const Array<Expr>& args) : ExprMutator() { args_ = args; }
-
-    Expr VisitExpr_(const VarNode* n) {
-      const std::string& name = n->name_hint();
-      CHECK_EQ(name[0], 'p');
-      std::string id_str = name.substr(1);
-      int id = atoi(id_str.c_str());
-      CHECK(id >= 0 && size_t(id) < args_.size());
-      return args_[id];
-    }
-  };
-
-  Expr VisitExpr_(const CallNode* n) {
-    auto new_n = ExprMutator::VisitExpr_(n);
-
-    const auto* call = new_n.as<CallNode>();
-    if (call) {
-      const auto* func = call->op.as<FunctionNode>();
-      if (func) {
-        const auto& func_call = func->body.as<CallNode>();
-        if (func_call) {
-          return FuncBodyMutator(call->args).Mutate(func->body);
-        }
-      }
-    }
-    return new_n;
-  }
-};
-
-Expr DeFuseOps(const Expr& expr) { return DefuseOpsMutator().Mutate(expr); }
-
-namespace transform {
-
-Pass DeFuseOps() {
-  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-      [=](Function f, IRModule m, PassContext pc) {
-        return Downcast<Function>(relay::DeFuseOps(f));
-      };
-  return CreateFunctionPass(pass_func, 3, "DeFuseOps", {"InferType"});
-}
-
-TVM_REGISTER_GLOBAL("relay._transform.DeFuseOps").set_body_typed(DeFuseOps);
-
-}  // namespace transform
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/transforms/kernel_layout_transform.cc b/src/relay/transforms/kernel_layout_transform.cc
deleted file mode 100644
index 421968b8a6b9..000000000000
--- a/src/relay/transforms/kernel_layout_transform.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "kernel_layout_transform.h"
-
-#include <tvm/relay/attrs/transform.h>
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/transform.h>
-#include <tvm/te/operation.h>
-
-#include <deque>
-#include <functional>
-#include <string>
-
-namespace tvm {
-namespace relay {
-
-// Todo: do not use global variables
-std::deque<std::string> KernelLayoutVisitor::global_ori_layouts_queue;
-std::deque<std::string> KernelLayoutVisitor::global_new_layouts_queue;
-
-Expr KernelLayoutTransform(const Expr& expr) {
-  KernelLayoutVisitor visitor;
-
-  // Do a pre-order DFS to gather the optimal kernel layouts for all conv2d nodes.
-  // These layouts were written to global static variables in python function
-  // `prepare_layout_rewrite`
-  visitor.VisitExpr(expr);
-
-  // Do a post-order DSF to mutate layout for all conv2d nodes
-  return KernelLayoutTransformer(&visitor).Mutate(expr);
-}
-
-namespace transform {
-
-Pass KernelLayoutTransform() {
-  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-      [=](Function f, IRModule m, PassContext pc) {
-        return Downcast<Function>(relay::KernelLayoutTransform(f));
-      };
-  return CreateFunctionPass(pass_func, 3, "KernelLayoutTransform", {"InferType"});
-}
-
-TVM_REGISTER_GLOBAL("relay._transform.KernelLayoutTransform").set_body_typed(KernelLayoutTransform);
-
-}  // namespace transform
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/transforms/kernel_layout_transform.h b/src/relay/transforms/kernel_layout_transform.h
deleted file mode 100644
index c6c38fb71cf4..000000000000
--- a/src/relay/transforms/kernel_layout_transform.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#ifndef TVM_RELAY_TRANSFORMS_KERNEL_LAYOUT_TRANSFORM_H_
-#define TVM_RELAY_TRANSFORMS_KERNEL_LAYOUT_TRANSFORM_H_
-
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/transform.h>
-
-#include <deque>
-#include <string>
-#include <tuple>
-#include <unordered_map>
-#include <vector>
-
-#include "../../ansor/compute_dag.h"
-#include "pattern_util.h"
-
-namespace tvm {
-namespace relay {
-
-/*! \brief A visitor to gather the optimal kernel layout for all conv2d nodes. */
-class KernelLayoutVisitor : public ExprVisitor {
- public:
-  void VisitExpr_(const CallNode* n) {
-    if (n && n->op.as<OpNode>() &&
-        (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as<OpNode>()->name) !=
-         op_white_lists.end()) &&
-        n->args[1]->type_as<TensorTypeNode>()->shape[3].as<IntImmNode>()->value > 1 &&
-        !global_ori_layouts_queue.empty() && !global_new_layouts_queue.empty()) {
-      ori_layouts_map[n] = global_ori_layouts_queue.front();
-      new_layouts_map[n] = global_new_layouts_queue.front();
-      // std::cout << "ori_layout " << global_ori_layouts_queue.front()
-      //     << " Filter_shape " << n->args[1]->type_as<TensorTypeNode>()->shape << std::endl;
-      global_ori_layouts_queue.pop_front();
-      global_new_layouts_queue.pop_front();
-    }
-    ExprVisitor::VisitExpr_(n);
-  }
-
-  std::unordered_map<const CallNode*, std::string> ori_layouts_map;
-  std::unordered_map<const CallNode*, std::string> new_layouts_map;
-  std::vector<std::string> op_white_lists{"nn.contrib_conv2d_winograd_without_weight_transform",
-                                          "nn.conv2d", "nn.conv3d"};
-
-  static std::deque<std::string> global_ori_layouts_queue;
-  static std::deque<std::string> global_new_layouts_queue;
-};
-
-/*! \brief A mutator to rewrite kernel layout for all conv2d nodes */
-class KernelLayoutTransformer : public ExprMutator {
- public:
-  explicit KernelLayoutTransformer(KernelLayoutVisitor* visitor)
-      : ExprMutator(), visitor_(visitor) {}
-
-  Expr VisitExpr_(const CallNode* n) {
-    auto new_n = ExprMutator::VisitExpr_(n);
-
-    const auto* call = new_n.as<CallNode>();
-    std::vector<std::string> op_white_lists{"nn.contrib_conv2d_winograd_without_weight_transform",
-                                            "nn.conv2d", "nn.conv3d"};
-    if (call && call->op.as<OpNode>() &&
-        (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as<OpNode>()->name) !=
-             op_white_lists.end() &&
-         n->args[1]->type_as<TensorTypeNode>()->shape[3].as<IntImmNode>()->value > 1)) {
-      auto ori_layout_iter = visitor_->ori_layouts_map.find(n);
-      auto new_layout_iter = visitor_->new_layouts_map.find(n);
-      if (ori_layout_iter != visitor_->ori_layouts_map.end() &&
-          new_layout_iter != visitor_->new_layouts_map.end()) {
-        const std::string& ori_layout = ori_layout_iter->second;
-        const std::string& new_layout = new_layout_iter->second;
-        Expr updated_kernel = MakeKernelLayoutTransform(call->args[1], ori_layout, new_layout);
-        Array<Expr> updated_args = {call->args[0], updated_kernel};
-        new_n = Call(call->op, updated_args, call->attrs);
-      }
-    }
-    return new_n;
-  }
-
- private:
-  KernelLayoutVisitor* visitor_;
-};
-
-}  // namespace relay
-}  // namespace tvm
-
-#endif  // TVM_RELAY_TRANSFORMS_KERNEL_LAYOUT_TRANSFORM_H_
diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h
index a9d3b5168e47..7518eb9ac81a 100644
--- a/src/relay/transforms/pattern_util.h
+++ b/src/relay/transforms/pattern_util.h
@@ -685,8 +685,6 @@ Expr MakeExpandDims(Expr data, int axis, int num_newaxis);
 
 Expr MakeLayoutTransform(Expr data, String src_layout, String dst_layout);
 
-Expr MakeKernelLayoutTransform(Expr data, String src_layout, String dst_layout);
-
 Expr StopFusion(Expr data);
 
 Expr CastHint(Expr data, DataType dtype);
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 4e71383cc1bb..a6d4a5499469 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -94,10 +94,6 @@ class CUDADeviceAPI final : public DeviceAPI {
       }
       case kGcnArch:
         return;
-      case kMaxRegistersPerBlock: {
-        CUDA_CALL(cudaDeviceGetAttribute(&value, cudaDevAttrMaxRegistersPerBlock, ctx.device_id));
-        break;
-      }
     }
     *rv = value;
   }
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 714535ecc8a6..800a9167dadc 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -26,9 +26,6 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/ndarray.h>
 
-#include <builtin_fp16.h>
-#include <random>
-
 #include "runtime_base.h"
 
 extern "C" {
@@ -183,8 +180,7 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape, DLDataType dtype) {
 
 DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); }
 
-NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype,
-                       DLContext ctx) {
+NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype, DLContext ctx) {
   NDArray ret = Internal::Create(shape, dtype, ctx);
   // setup memory content
   size_t size = GetDataSize(ret.get_mutable()->dl_tensor);
@@ -194,59 +190,6 @@ NDArray NDArray::Empty(std::vector<int64_t> shape, DLDataType dtype,
   return ret;
 }
 
-
-NDArray NDArray::NonEmpty(std::vector<int64_t> shape, DLDataType dtype,
-                          DLContext ctx) {
-  NDArray ret = Internal::Create(shape, dtype, ctx);
-  NDArray dummy_cpu_arr = Internal::Create(shape, dtype, {kDLCPU, 0});
-
-  // setup memory content
-  size_t size = GetDataSize(ret.get_mutable()->dl_tensor);
-  size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor);
-  dummy_cpu_arr.get_mutable()->dl_tensor.data =
-      DeviceAPI::Get(dummy_cpu_arr->ctx)->AllocDataSpace(
-          {kDLCPU, 0}, size, alignment, dummy_cpu_arr->dtype);
-  size_t elem_cnt = 1;
-  for (tvm_index_t i = 0; i < dummy_cpu_arr->ndim; ++i) {
-    elem_cnt *= static_cast<size_t>(dummy_cpu_arr->shape[i]);
-  }
-
-  // TODO(..): maybe we could have better solution for assigning values
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_real_distribution<> dis(1.0, 10.0);
-  // Use float representation could make us work well on float / int type too.
-  for (size_t i = 0; i < elem_cnt; ++i) {
-    if (dummy_cpu_arr->dtype.bits == 1) {
-      (reinterpret_cast<bool*>(
-          dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen);
-    } else if (dummy_cpu_arr->dtype.bits == 8) {
-      (reinterpret_cast<uint8_t*>(
-          dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen);
-    } else if (dummy_cpu_arr->dtype.bits == 16) {
-      (reinterpret_cast<uint16_t*>(
-          dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] =
-              __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(
-                  static_cast<float>(dis(gen)));
-    } else if (dummy_cpu_arr->dtype.bits == 32) {
-      (reinterpret_cast<float*>(
-          dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen);
-    } else if (dummy_cpu_arr->dtype.bits == 64) {
-      (reinterpret_cast<double*>(
-          dummy_cpu_arr.get_mutable()->dl_tensor.data))[i] = dis(gen);
-    } else {
-      LOG(FATAL) << "Doesn't support dtype code " << dtype.code
-                 << " dtype bits " << dtype.bits;
-    }
-  }
-  ret.get_mutable()->dl_tensor.data =
-      DeviceAPI::Get(ret->ctx)->AllocDataSpace(
-          ret->ctx, size, alignment, ret->dtype);
-  CopyFromTo(&(dummy_cpu_arr.get_mutable()->dl_tensor),
-             &(ret.get_mutable()->dl_tensor));
-  return ret;
-}
-
 NDArray NDArray::FromDLPack(DLManagedTensor* tensor) {
   NDArray::Container* data = new NDArray::Container();
   // construct header
@@ -314,9 +257,8 @@ int TVMArrayGetTypeIndex(TVMArrayHandle handle, unsigned* out_tindex) {
   API_END();
 }
 
-int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code,
-                  int dtype_bits, int dtype_lanes, int device_type,
-                  int device_id, TVMArrayHandle* out) {
+int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_bits,
+                  int dtype_lanes, int device_type, int device_id, TVMArrayHandle* out) {
   API_BEGIN();
   DLDataType dtype;
   dtype.code = static_cast<uint8_t>(dtype_code);
@@ -330,22 +272,6 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code,
   API_END();
 }
 
-int TVMArrayAllocNonEmpty(const tvm_index_t* shape, int ndim, int dtype_code,
-                          int dtype_bits, int dtype_lanes, int device_type,
-                          int device_id, TVMArrayHandle* out) {
-  API_BEGIN();
-  DLDataType dtype;
-  dtype.code = static_cast<uint8_t>(dtype_code);
-  dtype.bits = static_cast<uint8_t>(dtype_bits);
-  dtype.lanes = static_cast<uint16_t>(dtype_lanes);
-  DLContext ctx;
-  ctx.device_type = static_cast<DLDeviceType>(device_type);
-  ctx.device_id = device_id;
-  *out = NDArray::Internal::MoveToFFIHandle(
-      NDArray::NonEmpty(std::vector<int64_t>(shape, shape + ndim), dtype, ctx));
-  API_END();
-}
-
 int TVMArrayFree(TVMArrayHandle handle) {
   API_BEGIN();
   NDArray::Internal::FFIDecRef(handle);
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index 71d3232ca4d5..6d9835e6231c 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -109,9 +109,6 @@ void OpenCLWorkspace::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue*
     }
     case kGcnArch:
       return;
-    default: {
-      LOG(WARNING) << "Attr not implemented.";
-    }
   }
 }
 
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index d58130d700f4..89f3e7c6c7f8 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -24,14 +24,9 @@
 #include <tvm/runtime/container.h>
 #include <tvm/runtime/registry.h>
 
-#include <cstdlib>
 #include <cstring>
 #include <memory>
 
-#if defined(_M_X64) || defined(__x86_64__)
-#include <x86intrin.h>
-#endif
-
 #include "rpc_endpoint.h"
 #include "rpc_session.h"
 
@@ -305,22 +300,6 @@ std::shared_ptr<RPCSession> RPCModuleGetSession(Module mod) {
   return rmod->sess();
 }
 
-inline void CacheFlush(const char* p, unsigned int allocation_size) {
-// TODO(FrozenGene): Support ARM.
-#if (defined(_M_X64) || defined(__x86_64__))
-  size_t cache_line = 64;
-
-  if (p == nullptr || allocation_size <= 0) {
-    return;
-  }
-
-  for (size_t i = 0; i < allocation_size; i += cache_line) {
-    _mm_clflush(static_cast<const void*>(&p[i]));
-  }
-
-#endif
-}
-
 PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repeat,
                              int min_repeat_ms) {
   CHECK(pf != nullptr);
@@ -334,21 +313,12 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe
   auto ftimer = [pf, ctx, number, repeat, min_repeat_ms](TVMArgs args, TVMRetValue* rv) mutable {
     TVMRetValue temp;
     std::ostringstream os;
-    const char* cache_flush = std::getenv("TVM_AUTO_CACHE_FLUSH");
     // skip first time call, to activate lazy compilation components.
     pf.CallPacked(args, &temp);
 
     DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr);
 
     for (int i = 0; i < repeat; ++i) {
-      if (cache_flush && std::atoi(cache_flush) != 0) {
-        CHECK_EQ(number, 1);
-        // we want to keep input data
-        for (int j = 1; j < args.size(); j++) {
-          CacheFlush(reinterpret_cast<char*>(args[j].operator DLTensor*()->data),
-                     GetDataSize(*(args[j].operator DLTensor*())));
-        }
-      }
       std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin,
           tend;
       double duration_ms = 0.0;
diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 3b1889aed8ef..e5520efe30a6 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -166,13 +166,8 @@ class ThreadGroup::Impl {
 #if defined(_M_X64) || defined(__x86_64__)
       big_count /= 2;  // ignore hyper-threading
 #endif
-      const char* bind_master_core_0 = getenv("TVM_BIND_MASTER_CORE_0");
-      if (bind_master_core_0 && atoi(bind_master_core_0) != 0) {
-        CPU_SET(sorted_order_[0], &cpuset);
-      } else {
-        for (int i = 0; i < big_count; ++i) {
-          CPU_SET(sorted_order_[i], &cpuset);
-        }
+      for (int i = 0; i < big_count; ++i) {
+        CPU_SET(sorted_order_[i], &cpuset);
       }
     }
 #if defined(__ANDROID__)
diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc
index 04a3f0b25bee..af72d3b1a1df 100644
--- a/src/te/schedule/schedule_dataflow_rewrite.cc
+++ b/src/te/schedule/schedule_dataflow_rewrite.cc
@@ -461,7 +461,7 @@ void RebaseNonZeroMinLoop(const Schedule& sch) {
     for (IterVar iv : root_iter_vars) {
       size_t idx = FindNodeRef(leaf_vars, iv);
       auto it = s->iter_var_attrs.find(iv);
-      // don't need to rebase path that are binded.
+      // don;t need to rebase path that are binded.
       if (it != s->iter_var_attrs.end() && (*it).second->bind_thread.defined()) {
         continue;
       }
@@ -614,74 +614,10 @@ void InjectInline(ScheduleNode* sch) {
   }
 }
 
-void LegalizeInvalidAttach(ScheduleNode* sch) {
-  std::unordered_map<IterVar, IterVar> replace_map;
-
-  for (Stage stage : sch->stages) {
-    for (Stage s = stage; s.defined();) {
-      Stage spec = s.GetAttachSpec();
-      if (spec->attach_type != kScope) {
-        break;
-      }
-      bool start_attach = false;
-      IterVar attach_ivar = spec->attach_ivar;
-      s = spec->attach_stage;
-      CHECK(attach_ivar.defined());
-      CHECK(s.defined());
-
-      for (size_t i = s->leaf_iter_vars.size(); i != 0; --i) {
-        IterVar iv = s->leaf_iter_vars[i - 1];
-        if (!start_attach && iv.same_as(attach_ivar)) {
-          start_attach = true;
-        }
-      }
-      if (!start_attach) {
-        // If the attach_var is fused into another iter_var, update the
-        // attach_var to be the fused one
-        // Do this recursively.
-        IterVar new_attach_ivar = attach_ivar;;
-        bool updated = true;
-        while (updated) {
-          updated = false;
-          for (const auto& rel : s->relations) {
-            if (const FuseNode* r = rel.as<FuseNode>()) {
-              if (new_attach_ivar.same_as(r->inner)) {
-                new_attach_ivar = r->fused;
-                updated = true;
-              }
-            } else if (const SplitNode* r = rel.as<SplitNode>()) {
-              if (new_attach_ivar.same_as(r->parent)) {
-                new_attach_ivar = r->inner;
-                updated = true;
-              }
-            }
-          }
-          replace_map[attach_ivar] = new_attach_ivar;
-        }
-      }
-    }
-  }
-
-  // remap the parent relation
-  for (Stage s : sch->stages) {
-    if (s->attach_type != kScope) continue;
-    if (replace_map.count(s->attach_ivar)) {
-      s->attach_ivar = replace_map.at(s->attach_ivar);
-    }
-  }
-  for (Stage s : sch->groups) {
-    if (s->attach_type != kScope) continue;
-    if (replace_map.count(s->attach_ivar)) {
-      s->attach_ivar = replace_map.at(s->attach_ivar);
-    }
-  }
-}
-
 Schedule Schedule::normalize() {
   Schedule sn = copy();
   InjectInline(sn.operator->());
   RebaseNonZeroMinLoop(sn);
-  LegalizeInvalidAttach(sn.operator->());
   return sn;
 }
 
diff --git a/src/tir/analysis/verify_gpu_code.cc b/src/tir/analysis/verify_gpu_code.cc
index f6a8ad034aa5..1fbae0fd2dcd 100644
--- a/src/tir/analysis/verify_gpu_code.cc
+++ b/src/tir/analysis/verify_gpu_code.cc
@@ -33,22 +33,20 @@
 namespace tvm {
 namespace tir {
 
-class GPUCodeVerifier : public StmtExprVisitor {
+class GPUCodeVerifier : public StmtVisitor {
  public:
   bool Verify(Stmt stmt, int64_t max_local_memory_per_block, int64_t max_shared_memory_per_block,
               int64_t max_threads_per_block, int64_t max_thread_x, int64_t max_thread_y,
-              int64_t max_thread_z, int64_t max_vector_bytes) {
+              int64_t max_thread_z) {
     max_local_memory_per_block_ = static_cast<size_t>(max_local_memory_per_block);
     max_shared_memory_per_block_ = static_cast<size_t>(max_shared_memory_per_block);
     max_threads_per_block_ = static_cast<size_t>(max_threads_per_block);
     max_thread_x_ = static_cast<size_t>(max_thread_x);
     max_thread_y_ = static_cast<size_t>(max_thread_y);
     max_thread_z_ = static_cast<size_t>(max_thread_z);
-    max_vector_bytes_ = static_cast<size_t>(max_vector_bytes);
 
     Reset_();
 
-    // TODO(jcf94): Add support of detecting CUDA Misaligned Address error
     this->VisitStmt(stmt);
 
     return valid_;
@@ -64,10 +62,6 @@ class GPUCodeVerifier : public StmtExprVisitor {
       size_t size = static_cast<size_t>(op->constant_allocation_size());
       shared_memory_per_block_ += size * op->dtype.bytes() * op->dtype.lanes();
     }
-
-    if (op->dtype.lanes() > 1) {
-      valid_ &= op->dtype.lanes() * op->dtype.bytes() <= static_cast<int>(max_vector_bytes_);
-    }
   }
 
   void VisitStmt_(const AttrStmtNode* op) final {
@@ -135,18 +129,6 @@ class GPUCodeVerifier : public StmtExprVisitor {
     }
   }
 
-  void VisitExpr_(const LoadNode* op) {
-    // Currently not able to check:
-    // if the index expression failed to be simplified to a Ramp
-    if (op->index->IsInstance<RampNode>()) {
-      if (op->dtype.lanes() > 1) {
-        valid_ &= op->dtype.lanes() * op->dtype.bytes() <=
-            static_cast<int>(max_vector_bytes_);
-      }
-    }
-    ExprVisitor::VisitExpr_(op);
-  }
-
  private:
   int nest_level_{0};
 
@@ -164,7 +146,6 @@ class GPUCodeVerifier : public StmtExprVisitor {
   size_t max_shared_memory_per_block_;
   size_t max_threads_per_block_;
   size_t max_thread_x_, max_thread_y_, max_thread_z_;
-  size_t max_vector_bytes_;
 
   bool valid_{true};
 
@@ -188,32 +169,27 @@ bool VerifyGPUCode(const PrimFunc& func, Map<String, PrimExpr> constraints) {
   int64_t max_thread_x = INT64_MAX;
   int64_t max_thread_y = INT64_MAX;
   int64_t max_thread_z = INT64_MAX;
-  int64_t max_vector_bytes = INT64_MAX;
 
   for (auto iter : constraints) {
     const IntImmNode* val = iter.second.as<IntImmNode>();
-    if (iter.first == "max_local_memory_per_block") {
+    if (iter.first == "max_local_memory_per_block")
       max_local_memory_per_block = val->value;
-    } else if (iter.first == "max_shared_memory_per_block") {
+    else if (iter.first == "max_shared_memory_per_block")
       max_shared_memory_per_block = val->value;
-    } else if (iter.first == "max_threads_per_block") {
+    else if (iter.first == "max_threads_per_block")
       max_threads_per_block = val->value;
-    } else if (iter.first == "max_thread_x") {
+    else if (iter.first == "max_thread_x")
       max_thread_x = val->value;
-    } else if (iter.first == "max_thread_y") {
+    else if (iter.first == "max_thread_y")
       max_thread_y = val->value;
-    } else if (iter.first == "max_thread_z") {
+    else if (iter.first == "max_thread_z")
       max_thread_z = val->value;
-    } else if (iter.first == "max_vector_bytes") {
-      max_vector_bytes = val->value;
-    } else {
+    else
       LOG(FATAL) << "Invalid check item: " << iter.first;
-    }
   }
 
   return verifier.Verify(func->body, max_local_memory_per_block, max_shared_memory_per_block,
-                         max_threads_per_block, max_thread_x, max_thread_y, max_thread_z,
-                         max_vector_bytes);
+                         max_threads_per_block, max_thread_x, max_thread_y, max_thread_z);
 }
 
 TVM_REGISTER_GLOBAL("tir.analysis.verify_gpu_code").set_body_typed(VerifyGPUCode);
diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc
index 4f1078165f34..a15190665949 100644
--- a/src/tir/transforms/unroll_loop.cc
+++ b/src/tir/transforms/unroll_loop.cc
@@ -43,7 +43,6 @@ struct UnrollLoopConfigNode : public tvm::AttrsNode<UnrollLoopConfigNode> {
   int auto_max_depth;
   int auto_max_extent;
   int explicit_unroll;
-  int explicit_unroll_max_extent;
 
   TVM_DECLARE_ATTRS(UnrollLoopConfigNode, "tir.transform.UnrollLoopConfig") {
     TVM_ATTR_FIELD(auto_max_step)
@@ -58,9 +57,6 @@ struct UnrollLoopConfigNode : public tvm::AttrsNode<UnrollLoopConfigNode> {
     TVM_ATTR_FIELD(explicit_unroll)
         .describe("Whether to explicitly unroll the loop instead of setting a pragma")
         .set_default(true);
-    TVM_ATTR_FIELD(explicit_unroll_max_extent)
-        .describe("The maximum extent of a loop that can be unrolled explicitly (-1 for infinite)")
-        .set_default(32);
   }
 };
 
@@ -75,12 +71,11 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.UnrollLoop", UnrollLoopConfig);
 class LoopUnroller : public StmtExprMutator {
  public:
   explicit LoopUnroller(int auto_max_step, int auto_max_depth, int auto_max_extent,
-                        bool explicit_unroll, int explicit_unroll_max_extent)
+                        bool explicit_unroll)
       : auto_max_step_(auto_max_step),
         auto_max_depth_(auto_max_depth),
         auto_max_extent_(auto_max_extent),
-        explicit_unroll_(explicit_unroll),
-        explicit_unroll_max_extent_(explicit_unroll_max_extent) {}
+        explicit_unroll_(explicit_unroll) {}
 
   Stmt VisitStmt_(const AttrStmtNode* op) final {
     if (op->attr_key == "pragma_auto_unroll_max_step") {
@@ -170,12 +165,6 @@ class LoopUnroller : public StmtExprMutator {
     // For loop must have a constant integer extent
     CHECK_NE(value, -1) << "loop doesn't have a constant integer extent";
     if (value == 0) return Evaluate(0);
-    if (explicit_unroll_max_extent_ > 0 && value > explicit_unroll_max_extent_ &&
-        explicit_unroll_) {
-      // Do not unroll too long loops
-      ForType for_type = op->for_type == ForType::Unrolled ? ForType::Serial : op->for_type;
-      return For(op->loop_var, op->min, op->extent, for_type, op->device_api, op->body);
-    }
     Stmt body = op->body;
     Map<Var, PrimExpr> vmap;
     Array<Stmt> unrolled;
@@ -208,10 +197,7 @@ class LoopUnroller : public StmtExprMutator {
   // max extent of loop to auto unroll
   // this not not count the total steps, only count the number of loops
   int auto_max_extent_;
-  // Whether to explicitly unroll the loop instead of setting a pragma
   bool explicit_unroll_;
-  // The maximum extent of a loop that can be unrolled explicitly (-1 means infinite)
-  int explicit_unroll_max_extent_;
   // Number of normal loops in scope
   int normal_loop_depth_{0};
   // number of unrolled cases in current scope.
@@ -224,7 +210,7 @@ class LoopUnroller : public StmtExprMutator {
 
 Stmt UnrollLoop(Stmt stmt, UnrollLoopConfig cfg) {
   Stmt ret = LoopUnroller(cfg->auto_max_step, cfg->auto_max_depth, cfg->auto_max_extent,
-                          cfg->explicit_unroll, cfg->explicit_unroll_max_extent)(stmt);
+                          cfg->explicit_unroll)(stmt);
   if (!ret.same_as(stmt)) {
     return ConvertSSA(ret);
   } else {
diff --git a/tests/python/unittest/test_ansor_feature.py b/tests/python/unittest/test_ansor_feature.py
deleted file mode 100644
index 705556c65edf..000000000000
--- a/tests/python/unittest/test_ansor_feature.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test feature extraction"""
-
-import math
-import tempfile
-
-import tvm
-from tvm import te, ansor
-
-from test_ansor_common import matmul_ansor_test
-
-
-def fequal(a, b):
-    return math.fabs(a - b) < 1e-6
-
-
-def test_cpu_matmul():
-    dag = ansor.ComputeDAG(matmul_ansor_test(512, 512, 512))
-    s = dag.get_init_state()
-    C = s.stage_ops[2]
-
-    i, j, k = s[C].iters
-    io, ii = s.split(C, i, [16])
-    jo, ji = s.split(C, j, [8])
-    s.reorder(C, [io, jo, k, ji, ii])
-    s.vectorize(C, ji)
-    s.parallel(C, io)
-    s.parallel(C, jo)
-    s.unroll(C, k)
-
-    target = tvm.target.create('llvm')
-    task = ansor.SearchTask(dag, "test", target)
-    names = ansor.feature.get_per_stmt_feature_names()
-    fea = ansor.feature.get_per_stmt_features_from_states([s], task)[0]
-
-    stage_0 = fea[0]
-    assert len(stage_0) == len(names), "%d vs %d" % (len(stage_0), len(names))
-    fea_dict = {}
-    for name, value in zip(names, stage_0):
-        fea_dict[name] = value
-
-    for name in ["B0", "B1", "B2"]:
-        if fequal(fea_dict[name + ".acc_type.kReadWrite"], 1.0):
-            c_name = name
-        if fequal(fea_dict[name + ".acc_type.kRead"], 1.0):
-            if fequal(fea_dict[name + ".stride"], 0.0):
-                b_name = name
-            else:
-                a_name = name
-
-    assert fequal(fea_dict[c_name + ".bytes"], math.log2(512 ** 3 * 4 + 1))
-    assert fequal(fea_dict[b_name + ".unique_bytes"], math.log2(512 ** 2 * 4 + 1))
-    assert fequal(fea_dict[c_name + ".reuse_dis_iter"], math.log2(8 * 16 + 1))
-    assert fequal(fea_dict[c_name + ".reuse_dis_bytes"], math.log2((8 * 16 + 8 + 16) * 4 + 1))
-    assert fequal(fea_dict[c_name + ".reuse_ct"], math.log2(512 + 1))
-
-    assert fequal(fea_dict["unroll_num"], math.log2(1 + 1))
-    # assert fequal(fea_dict["unroll_type.kPosInnerReduce"], 1.0)
-    assert fequal(fea_dict["vec_num"], math.log2(1 + 1))
-    assert fequal(fea_dict["parallel_num"], math.log2(2 + 1))
-    assert fequal(fea_dict["parallel_prod"], math.log2((512 * 512 / 16 / 8) + 1))
-
-
-def test_cpu_fusion():
-    def fusion_test(N, M):
-        A = te.placeholder((N, M), name='A')
-        B = te.compute((N, M), lambda i, j: A[i][j], name='B')
-        C = te.compute((N, M), lambda i, j: B[i][j], name='C')
-        return [A, B, C]
-
-    dag = ansor.ComputeDAG(fusion_test(64, 32))
-    s = dag.get_init_state()
-    s.compute_at(1, 2, s.stages[2].iters[1])
-
-    target = tvm.target.create('llvm')
-    task = ansor.SearchTask(dag, "test", target)
-    names = ansor.feature.get_per_stmt_feature_names()
-    fea = ansor.feature.get_per_stmt_features_from_states([s], task)[0]
-
-    found = False
-    for stage_fea in fea:
-        for i, (name, value) in enumerate(zip(names, stage_fea)):
-            if 'reuse_type.kSerialMultipleReadWrite' in name and value > 0.5:
-                assert fequal(stage_fea[i + 2], 1.0)
-                assert fequal(stage_fea[i + 3], math.log2(16 + 1))
-                found = True
-    assert found
-
-
-def test_gpu_feature():
-    ctx = tvm.context("cuda", 0)
-    if not ctx.exist:
-        return
-
-    json_records = "\n".join((
-        """{"i": [["[\\"matmul_ansor_test\\", 512, 512, 512]", "cuda"], [[], [["CHW", 2, "local"], ["SP", 2, 0, 512, [1, 16, 32, 1], 1], ["SP", 2, 5, 512, [4, 1, 1, 16], 1], ["SP", 2, 10, 512, [1, 2], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 3, 0, 1, 3], ["FSP", 3, 4, 2, 3], ["RE", 3, [0, 4, 1, 5, 2, 6, 3, 7]], ["FU", 2, [0, 1]], ["FU", 3, [0, 1]], ["FU", 2, [1, 2]], ["FU", 3, [1, 2]], ["FU", 2, [2, 3]], ["FU", 3, [2, 3]], ["CA", 2, 3, 2], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 3], ["FU", 2, [0, 1]], ["FFSP", 2, 0, [1, 2], 1, 1], ["AN", 2, 1, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 3], ["FU", 1, [0, 1]], ["FFSP", 1, 0, [1, 2], 1, 1], ["AN", 1, 1, 6], ["AN", 5, 0, 5], ["AN", 5, 1, 4], ["AN", 5, 2, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.00536798], 0, 2.49277, 1585564852], "v": "v0.1"}""",
-    ))
-
-    # load states
-    with tempfile.NamedTemporaryFile(mode='w') as f:
-        f.write(json_records)
-        f.flush()
-        inputs, results = ansor.LogReader(f.name).read_lines()
-
-        inp = inputs[0]
-        dag = ansor.workload_key_to_dag(inp.task.workload_key)
-        task = ansor.SearchTask(dag, inp.task.workload_key, inp.task.target, None, ansor.HardwareParams(100000, 16, 64, 4, 64))
-
-        state = ansor.serialization.get_states_from_measure_inputs(inputs, task)[0]
-        state = dag.infer_bound_from_state(state)
-        fea = ansor.feature.get_per_stmt_features_from_states([state], task)[0]
-        names = ansor.feature.get_per_stmt_feature_names()
-
-        # build feature dict
-        fea_dicts = []
-        for i in range(len(fea)):
-            tmp_dict = {}
-            for j in range(len(names)):
-                tmp_dict[names[j]] = fea[i][j]
-            fea_dicts.append(tmp_dict)
-
-        # check values
-        assert fequal(fea_dicts[0]['blockIdx_x_len'], math.log2(8 + 1))
-        assert fequal(fea_dicts[0]['vthread_len'], math.log2(4 + 1))
-        assert fequal(fea_dicts[1]['threadIdx_x_len'], math.log2(16 + 1))
-        assert fequal(fea_dicts[0]['threadIdx_y_len'], math.log2(1 + 1))
-        assert fequal(fea_dicts[2]['blockIdx_z_len'], math.log2(1 + 1))
-        assert fequal(fea_dicts[0]['is_gpu'], 1.0)
-
-
-if __name__ == "__main__":
-    test_cpu_matmul()
-    test_cpu_fusion()
-    test_gpu_feature()
diff --git a/tests/python/unittest/test_ansor_relay_integration.py b/tests/python/unittest/test_ansor_relay_integration.py
deleted file mode 100644
index 1ad507e2f371..000000000000
--- a/tests/python/unittest/test_ansor_relay_integration.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Test Relay Integration """
-
-import tempfile
-import numpy as np
-
-import tvm
-from tvm import ansor, relay
-import tvm.contrib.graph_runtime as runtime
-from tvm.relay.testing import dqn
-
-def test_tune_dense_graph():
-    def dense_graph(N, dtype="float32"):
-        ori_data = relay.var("data", shape=(N, N), dtype=dtype)
-        weight = relay.var("weight", shape=(N, N), dtype=dtype)
-        data = relay.multiply(ori_data, relay.const(2, dtype=dtype))
-        dense = relay.nn.dense(data, weight, out_dtype=dtype)
-        dense = relay.add(dense, weight)
-        dense = relay.nn.dense(dense, weight, out_dtype=dtype)
-        return ori_data, weight, dense
-
-    N = 128
-    data, weight, dense = dense_graph(N)
-    mod = relay.Function([data, weight], dense)
-    mod = tvm.IRModule.from_expr(mod)
-
-    ctx = tvm.context("llvm")
-    target = tvm.target.create("llvm")
-    d = tvm.nd.array(np.random.uniform(size=(N, N)).astype(data.type_annotation.dtype), ctx)
-    w = tvm.nd.array(np.random.uniform(size=(N, N)).astype(weight.type_annotation.dtype), ctx)
-    wkl_keys, wkl_weights = ansor.extract_from_program(mod, {}, target=target)
-
-    assert len(wkl_keys) == 2
-    assert len(wkl_weights) == 2
-
-    tasks = []
-    for wkl_key in wkl_keys:
-        dag = ansor.workload_key_to_dag(wkl_key)
-        tasks.append(ansor.SearchTask(dag, wkl_key, target))
-
-    tuner = ansor.SimpleTaskScheduler(tasks)
-    measure_ctx = ansor.LocalRPCMeasureContext()
-    with tempfile.NamedTemporaryFile() as fp:
-        tuner.tune(ansor.TuneOption(n_trials=2, runner=measure_ctx.runner,
-                                    measure_callbacks=[ansor.LogToFile(fp.name)]))
-        with ansor.apply_history_best(fp.name):
-            with tvm.transform.PassContext(opt_level=3,  disabled_pass={"AlterOpLayout"}):
-                graph, lib, opt_params = relay.build_module.build(
-                    mod, target=target)
-
-                m = runtime.create(graph, lib, ctx)
-                m.set_input('data', d)
-                m.set_input('weight', w)
-                m.run()
-                res = m.get_output(0)
-
-    del measure_ctx
-
-    d = d.asnumpy()
-    d = d * 2
-    w = w.asnumpy()
-    d = np.dot(d, np.transpose(w))
-    d = d + w
-    d = np.dot(d, np.transpose(w))
-
-    tvm.testing.assert_allclose(res.asnumpy(), d, rtol=1e-5)
-
-
-def test_tune_dqn():
-    mod, params = dqn.get_workload(1, image_shape=(84, 84, 4), layout='NHWC')
-    target = tvm.target.create('llvm')
-
-    wkl_keys, wkl_weights = ansor.extract_from_program(mod, params, target)
-
-    tasks = []
-    for wkl_key in wkl_keys:
-        dag = ansor.workload_key_to_dag(wkl_key)
-        tasks.append(ansor.SearchTask(dag, wkl_key, target))
-
-    assert len(tasks) == 5
-
-    tuner = ansor.SimpleTaskScheduler(tasks)
-    measure_ctx = ansor.LocalRPCMeasureContext()
-    with tempfile.NamedTemporaryFile() as fp:
-        tuner.tune(ansor.TuneOption(n_trials=len(tasks), runner=measure_ctx.runner,
-                                    measure_callbacks=[ansor.LogToFile('tmp.json')]),
-                   search_policy='sketch.random')
-        with ansor.apply_history_best('tmp.json'):
-            ansor.prepare_layout_rewrite(mod, params, target)
-            with tvm.transform.PassContext(opt_level=3,  disabled_pass={"AlterOpLayout"}):
-                graph, lib, opt_params = relay.build_module.build(mod, target=target)
-            ansor.finish_layout_rewrite()
-
-    del measure_ctx
-
-if __name__ == "__main__":
-    test_tune_dense_graph()
-    test_tune_dqn()
-
diff --git a/tests/python/unittest/test_ansor_task_scheduler.py b/tests/python/unittest/test_ansor_task_scheduler.py
deleted file mode 100644
index 53cf2059c1f3..000000000000
--- a/tests/python/unittest/test_ansor_task_scheduler.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test the task scheduler """
-
-import threading
-
-import tvm
-from tvm import ansor
-
-from test_ansor_common import matmul_ansor_test
-
-def test_task_scheduler_basic():
-    N = 128
-    A, B, C = matmul_ansor_test(N, N, N)
-    dag = ansor.ComputeDAG([A, B, C])
-    tgt = tvm.target.create("llvm")
-    task1 = ansor.SearchTask(dag, "test", tgt)
-    task2 = ansor.SearchTask(dag, "test", tgt)
-
-    def basic_test_func(task1, task2):
-        def objective(costs):
-            return sum(costs)
-
-        task_scheduler = ansor.SimpleTaskScheduler([task1, task2], objective)
-        tune_option = ansor.TuneOption(n_trials=3, runner='local')
-        task_scheduler.tune(tune_option)
-
-    # Ansor search process with local runner has some modification on thread
-    # binding, wrap this to a subprocess to eliminate the impacts to other tests
-    t = threading.Thread(target=basic_test_func,
-                         kwargs={'task1': task1, 'task2': task2})
-    t.start()
-    t.join()
-
-
-if __name__ == "__main__":
-    test_task_scheduler_basic()
diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py
index 12c686634548..68639940bb05 100644
--- a/tests/python/unittest/test_tir_transform_unroll_loop.py
+++ b/tests/python/unittest/test_tir_transform_unroll_loop.py
@@ -110,31 +110,7 @@ def test_unroll_single_count_loops():
         ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
         assert ret == stmt
 
-def test_unroll_explicitly_max_extent():
-    n = 64
-    A = te.placeholder((n,), name='A')
-    B = te.compute((n,), lambda *i: A(*i), name='B')
-    s = te.create_schedule(B.op)
-    s = s.normalize()
-    dom_map = tvm.te.schedule.InferBound(s)
-    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
-    mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt))
-
-    with tvm.transform.PassContext(config={
-        "tir.UnrollLoop": {"explicit_unroll_max_extent": n-1}
-    }):
-        ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
-        assert tvm.ir.structural_equal(ret, stmt)
-
-    with tvm.transform.PassContext(config={
-        "tir.UnrollLoop": {"explicit_unroll_max_extent": n}
-    }):
-        ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body
-        assert not tvm.ir.structural_equal(ret, stmt)
-
-
 if __name__ == "__main__":
     test_unroll_loop()
     test_unroll_fake_loop()
     test_unroll_single_count_loops()
-    test_unroll_explicitly_max_extent()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 7dd782f5b622..e0e455667889 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -1295,75 +1295,6 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout,
       name, tag);
 }
 
-/*!
- * \brief utility function for kernel_layout_transform
- */
-inline void parse_kernel_layout(const String& layout,
-                                Array<PrimExpr>* shape,
-                                std::vector<std::string>* axes) {
-  int32_t factor = 0;
-  std::string axis = "";
-  for (char c : std::string(layout)) {
-    if (c >= 'A' && c <= 'z') {
-      axis += c;
-      if (factor != 0) {
-        shape->push_back(factor);
-        factor = 0;
-      }
-    } else if (c >= '0' && c <= '9') {
-      factor = factor * 10 + c - '0';
-      if (!axis.empty()) {
-        axes->push_back(axis);
-        axis = "";
-      }
-    } else {
-      LOG(FATAL) << "Invalid layout " << layout;
-    }
-  }
-  if (!axis.empty()) {
-    axes->push_back(axis);
-  }
-}
-
-/*!
- * \brief Transform the kernel layout according to \p src_layout and \p dst_layout
- * \param src the source input.
- * \param src_layout the source layout.
- * \param dst_layout the destination layout.
- * \param name output tensor name.
- * \param tag output tensor tag.
- * \return A tensor with shape in \p dst_layout
- */
-inline Tensor kernel_layout_transform(const Tensor& src,
-                                      const String& src_layout,
-                                      const String& dst_layout,
-                                      const String name = "T_kernel_layout_trans",
-                                      const String tag = kInjective) {
-  Array<PrimExpr> src_shape;
-  std::vector<std::string> src_axes;
-  Array<PrimExpr> dst_shape;
-  std::vector<std::string> dst_axes;
-
-  parse_kernel_layout(src_layout, &src_shape, &src_axes);
-  parse_kernel_layout(dst_layout, &dst_shape, &dst_axes);
-  return compute(
-      dst_shape, [&](const Array<Var>& dst_indices) {
-        Array<PrimExpr> dst_indices_expr(dst_indices.begin(), dst_indices.end());
-        Array<PrimExpr> src_indices;
-        for (const std::string& src_axis : src_axes) {
-          PrimExpr src_index = 0;
-          CHECK_EQ(dst_indices_expr.size(), dst_axes.size());
-          for (size_t i = 0; i < dst_axes.size(); ++i) {
-            if (dst_axes[i] == src_axis) {
-              src_index = src_index * dst_shape[i] + dst_indices_expr[i];
-            }
-          }
-          src_indices.push_back(src_index);
-        }
-        return src(src_indices);
-      }, name, tag);
-}
-
 /*!
  * \brief Get the shape of input tensor.
  * \param src the input tensor.
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 6800129c12aa..4c7941b49692 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -20,7 +20,7 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
-from tvm import te, ansor
+from tvm import te
 
 from .pad import pad
 from .util import get_pad_tuple
@@ -342,37 +342,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
         dilation_h, dilation_w = dilation
 
     batch, in_height, in_width, in_channel = Input.shape
-    if ansor.GLOBAL_SCOPE.topi_in_compute_rewrite_mode:
-        # infer shape for the rewritten layout
-        if len(Filter.shape) >= 10:
-            # For cpu tile structure SSRSRS
-            base = len(Filter.shape) - 10
-            kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base]
-            kernel_w = Filter.shape[3 + base] * Filter.shape[7 + base]
-            channel = Filter.shape[4 + base] * Filter.shape[8 + base]
-            num_filter = Filter.shape[5 + base] * Filter.shape[9 + base]
-            for i in range(base + 2):
-                num_filter *= Filter.shape[i]
-        elif len(Filter.shape) == 6:
-            # For cpu tile structure SRS
-            num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5]
-            kernel_h = Filter.shape[2]
-            kernel_w = Filter.shape[3]
-            channel = Filter.shape[4]
-        elif len(Filter.shape) == 5:
-            # For cpu tile structure SRS
-            num_filter = Filter.shape[0] * Filter.shape[4]
-            kernel_h = Filter.shape[1]
-            kernel_w = Filter.shape[2]
-            channel = Filter.shape[3]
-        elif len(Filter.shape) == 4:
-            num_filter, kernel_h, kernel_w, channel = Filter.shape
-        else:
-            raise ValueError("Don't know how to infer layout for filter shape: %s. " \
-                             "You can add a new branch for it to fix this." % str(Filter))
-    else:
-        kernel_h, kernel_w, channel, num_filter = Filter.shape
-
+    kernel_h, kernel_w, channel, num_filter = Filter.shape
     # compute the output shape
     dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
     dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
@@ -392,9 +362,8 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
         lambda nn, yy, xx, ff: te.sum(
             PaddedInput[nn, yy * stride_h + ry * dilation_h,
                         xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
-            Filter[ry, rx, rc, ff].astype(out_dtype)
-            , axis=[ry, rx, rc]),
-        name="Conv2dOutput", tag="conv2d_nhwc", attrs={"layout_free_placeholders": [Filter]})
+            Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
+        name="Conv2dOutput", tag="conv2d_nhwc")
     return Output
 
 
diff --git a/tutorials/ansor/README.txt b/tutorials/ansor/README.txt
deleted file mode 100644
index 85b6ba401dae..000000000000
--- a/tutorials/ansor/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-.. _tutorial-ansor-auto-schedule:
-
-Ansor: Template Free Auto Scheduling
-------------------------------------
diff --git a/tutorials/ansor/tune_conv2d_cuda.py b/tutorials/ansor/tune_conv2d_cuda.py
deleted file mode 100644
index 03f1b24a768e..000000000000
--- a/tutorials/ansor/tune_conv2d_cuda.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Auto-scheduling High Performance Convolution on NVIDIA GPUs
-===========================================================
-**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
-            `Chengfan Jia <https://github.com/jcf94>`_, \
-            `Minmin Sun <https://github.com/minminsun>`_, \
-            `Zhao Wu <https://github.com/FrozenGene>`_
-
-This is an tutorial for searching high performance schedule for NVIDIA GPU using
-Ansor auto-scheduler. By running Ansor on this template, we can outperform the
-vendor provided library CuDNN in many cases.
-"""
-
-######################################################################
-# Install dependencies
-# --------------------
-# To use autotvm package in tvm, we need to install some extra dependencies.
-# (change "3" to "2" if you use python2):
-#
-# .. code-block:: bash
-#
-#   pip3 install --user psutil xgboost tornado
-#
-# To make TVM run faster in tuning, it is recommended to use cython
-# as FFI of tvm. In the root directory of tvm, execute
-#
-# .. code-block:: bash
-#
-#   pip3 install --user cython
-#   sudo make cython3
-#
-# Now return to python code. Import packages.
-
-import random
-import sys
-
-import numpy as np
-import tvm
-import topi
-from topi.testing import conv2d_nchw_python
-from tvm import te
-
-# the module is called `ansor`
-from tvm import ansor
-
-######################################################################
-# Step 1:  Define the search task
-# -------------------------------
-# There are plenty of useful schedule primitives in tvm. You can also find
-# some tutorials that describe them in more details, such as
-# (1). :ref:`opt-conv-gpu`
-# (2). `Optimizing DepthwiseConv on NVIDIA GPU <https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example>`_
-#
-# It's usually a hard job if one wants to get a high performance schedule for a
-# specific workload. Even writing an AutoTVM tunable template needs user to have
-# expertises on how each schedule primitive works as well as how they finally
-# reflect on the hardward architecture.
-#
-# However, with Ansor this will be quite simple. Firstly, define the target workload.
-# Both :code:`tvm.te` API or topi op API are fine to be used.
-#
-# We can use the retuned :code:`Tensors` to create a ComputeDAG just like what we do
-# in :ref:`ansor-simple-subgraph`, while the way to use workload registry is more
-# recommended.
-
-# Use an extra function decorator to regist this workload
-@ansor.register_workload_func
-def conv2d_nchw(N, H, W, CO, CI, KH, KW, stride, padding):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
-
-    return [data, kernel, conv]
-
-######################################################################
-# Step 2:  Search through the schedule space
-# ------------------------------------------
-# We pick the last layer on resnet as test case.
-# Since our space is very large, :code:`XGBModel` is most suitable
-# for our case. Here we only do 20 trials for demonstration.
-# In practice, making 1000 trials usually can find some good kernels
-# for this workload.
-
-tgt = tvm.target.cuda()
-
-# The last layer in resnet
-N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
-# Generate workload key with the ansor API
-wkl_key = ansor.make_workload_key_func(conv2d_nchw, (N, H, W, CO, CI, KH, KW, strides, padding))
-# Generate ComputeDAG using the workload key
-dag = ansor.workload_key_to_dag(wkl_key)
-task = ansor.SearchTask(dag, wkl_key, target=tgt)
-
-log_file = "conv2d_nchw.json"
-seed = 0
-random.seed(seed)
-cost_model = ansor.XGBModel(seed=seed)
-search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed)
-
-#########################################################################
-# The :code:`ansor.LocalRPCMeasureContext` is used to create a RPC runner environment.
-# 
-# Use local gpu, measure 10 times for every schedule to reduce variance. The timeout
-# for each running is set to 4 seconds.
-#
-# During the searching process, we may generate several invalid schedules and they
-# will be filtered out. It's fine to see "Encountered errors during feature extraction."
-# in the tuning logs.
-# :code:`ansor.LogToFile` callback will log the tuning results into a
-# log file, which can be used to get the best config later.
-# :code:`ansor.PreloadMeasuredStates` callback will load measured states
-# from history log before schedule search, we can add this callback to make
-# sure a same schedule will never be measured for multiple times.
-
-measure_ctx = ansor.LocalRPCMeasureContext(repeat=3, min_repeat_ms=100, timeout=4)
-tune_option = ansor.TuneOption(n_trials=20,
-                               runner=measure_ctx.runner,
-                               measure_callbacks=[ansor.LogToFile(log_file)],
-                               pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)])
-s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy, tune_option=tune_option)
-
-print("==== Get Lowered Stmt ====")
-print(tvm.lower(s, arg_bufs, simple_mode=True))
-
-# Release the RPC runner environment
-del measure_ctx
-
-#########################################################################
-# From the example lower result showed above, we can see that Ansor has tried
-# techniques such as `Shared Memory Cooperative Fetching`, `Kernel Fusion`,
-# `Axis unroll`, `Axis Vectorize` and so on. There is no need for users to care
-# about the details, and Ansor will catch them well.
-#
-# Finally we can directly use the returned result to get the generated schedule,
-# while in the following tutorial we'll show how to inspect the best config from
-# log file, check correctness, and measure running time.
-
-# Get history best from log file
-inp, res = ansor.best_measure_pair_in_file(log_file)
-# Get the task ComputeDAG from log result
-dag = ansor.workload_key_to_dag(inp.task.workload_key)
-# Apply log result to TVM schedule
-s, arg_bufs = dag.apply_steps_from_state(inp.state)
-func = tvm.build(s, arg_bufs, target=tgt)
-
-# check correctness
-a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
-w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
-c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
-
-ctx = tvm.gpu()
-a_tvm = tvm.nd.array(a_np, ctx=ctx)
-w_tvm = tvm.nd.array(w_np, ctx=ctx)
-c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
-func(a_tvm, w_tvm, c_tvm)
-
-tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
-
-# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
-# and the overhead of kernel launch. You can also use nvprof to validate the result.
-evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
-print('Time cost of this operator: %f s' % evaluator(a_tvm, w_tvm, c_tvm).mean)
-
diff --git a/tutorials/ansor/tune_simple_subgraph.py b/tutorials/ansor/tune_simple_subgraph.py
deleted file mode 100644
index 00bef82cf855..000000000000
--- a/tutorials/ansor/tune_simple_subgraph.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-.. _ansor-simple-subgraph:
-
-Writing compute expression and Using Ansor auto-scheduler
-=========================================================
-**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
-            `Chengfan Jia <https://github.com/jcf94>`_, \
-            `Minmin Sun <https://github.com/minminsun>`_, \
-            `Zhao Wu <https://github.com/FrozenGene>`_
-
-This is an introduction tutorial to the auto-scheduler module in TVM.
-
-There are two steps in auto-scheduling.
-The first step is defining the target task.
-The second step is running a search algorithm to auto explore the schedule.
-In this tutorial, you can learn how to perform these two steps in TVM.
-The whole workflow is illustrated by a matrix multiplication with bias add example.
-"""
-
-######################################################################
-# Install dependencies
-# --------------------
-# To use Ansor package in TVM, we need to install some extra dependencies.
-# This step (installing xgboost) can be skipped as it doesn't need XGBoost
-# (change "3" to "2" if you use python2):
-#
-# .. code-block:: bash
-#
-#   pip3 install --user psutil xgboost
-#
-# To make TVM run faster in tuning, it is recommended to use cython
-# as FFI of TVM. In the root directory of TVM, execute
-# (change "3" to "2" if you use python2):
-#
-# .. code-block:: bash
-#
-#   pip3 install --user cython
-#   sudo make cython3
-#
-# Now return to python code. Import packages.
-
-import random
-import sys
-
-import numpy as np
-import tvm
-from tvm import te
-
-# the module is called `ansor`
-from tvm import ansor
-
-######################################################################
-# Step 1:  Define the target compute subgraph
-# -------------------------------------------
-# In this section, we will write a deterministic TVM compute expression code
-# to a compute subgraph.
-#
-# .. note:: Comparing to :ref:`tutorials-autotvm-sec`
-#
-#  In Ansor, we do not need users to provide a schedule template, the only input
-#  is the compute expression writing by :code:`tvm.te` API or topi op API.
-#
-# Here is how we implement a matrix multiplication subgraph in TVM.
-
-# Matmul with bias add
-def matmul_add(N, L, M, dtype):
-    A = te.placeholder((N, L), name='A', dtype=dtype)
-    B = te.placeholder((L, M), name='B', dtype=dtype)
-    C = te.placeholder((N, M), name='C', dtype=dtype)
-
-    k = te.reduce_axis((0, L), name='k')
-    mul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
-                     name='Mul')
-    D = te.compute((N, M), lambda i, j: C[i, j] + mul[i, j], name='D')
-
-    return [A, B, C, D]
-
-######################################################################
-# Step 2:  Search through the schedule space
-# ------------------------------------------
-# In step 1, we build the compute subgraph.
-# The next step is to pick a cost model as well as a search policy and explore the
-# possible schedule.
-#
-# Auto-scheduler in TVM
-# ^^^^^^^^^^^^^^^^^^^^^
-# The job for the Ansor auto-scheduler can be described by following pseudo code
-#
-#   .. code-block:: c
-#
-#    ct = 0
-#    while ct < max_number_of_trials:
-#        auto generate a batch of schedules
-#        measure this batch of schedules on real hardware and get results
-#        ct += batch_size
-#
-# When proposing the next batch of schedules, Ansor can take different cost models to
-# guide the schedule generating process.
-#
-# * :code:`RandomModel`: Generate and take new schedule randomly
-# * :code:`XGBModel`: Use XGBoost model to estimate the performance of potential schedules, try to pick schedules with better performance in each step
-#
-# XGBModel can explore more efficiently and find better schedules.
-
-################################################################
-# Begin tuning
-# ^^^^^^^^^^^^
-# Here we continue our matrix multiplication example.
-#
-# The :code:`ansor.ComputeDAG` takes the Tensor list as input, and generates
-# a dag structure. During which process, :code:`ansor.ComputeDAG` will
-# do some analyzes with the target subgraph and the results will be used in
-# search policy later.
-#
-# Then we create the :code:`tvm.target` and a tuning task.
-
-N, L, M = 128, 128, 128
-A, B, C, D = matmul_add(N, L, M, 'float32')
-dag = ansor.ComputeDAG([A, B, C, D])
-
-print(dag)
-print(dag.access_analyzer)
-
-tgt = tvm.target.create("llvm")
-task = ansor.SearchTask(dag, "test", tgt)
-
-################################################################
-# Next, we choose random model and create a default search policy:
-# :code:`ansor.SketchSearchPolicy`.
-#
-# We only make 5 trials in this tutorial for demonstration. In practice,
-# you can do more trials according to your time budget.
-# :code:`ansor.LogToFile` callback will log the tuning results into a
-# log file, which can be used to get the best config later.
-# :code:`ansor.PreloadMeasuredStates` callback will load measured states
-# from history log before schedule search, we can add this callback to make
-# sure a same schedule will never be measured for multiple times.
-
-log_file = "matmul_add.json"
-
-seed = 0
-random.seed(seed)
-cost_model = ansor.RandomModel()
-search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed)
-
-tune_option = ansor.TuneOption(n_trials=5,
-                               measure_callbacks=[ansor.LogToFile(log_file)],
-                               pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)])
-
-################################################################
-# Then just call :code:`ansor.auto_schedule` and Ansor will try to find a high
-# performance schedule for the target subgraph automatically.
-#
-# The returned result will be a :code:`te.schedule` and a list of :code:`te.Tensor`,
-# which can be used as the input of :code:`tvm.lower` or :code:`tvm.build`.
-
-s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy,
-                                  tune_option=tune_option)
-
-print("==== Get Lowered Stmt ====")
-print(tvm.lower(s, arg_bufs, simple_mode=True))
-
-#########################################################################
-# Check the correctness to make sure we generate a right schedule.
-
-func = tvm.build(s, arg_bufs)
-
-# check correctness
-a_np = np.random.uniform(size=(N, L)).astype(np.float32)
-b_np = np.random.uniform(size=(L, M)).astype(np.float32)
-c_np = np.random.uniform(size=(N, M)).astype(np.float32)
-d_np = a_np.dot(b_np) + c_np
-
-d_tvm = tvm.nd.empty(d_np.shape)
-func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm)
-
-tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-2)
diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt
index 4ad36c000e3c..38e3b3343f4e 100644
--- a/tutorials/autotvm/README.txt
+++ b/tutorials/autotvm/README.txt
@@ -1,4 +1,4 @@
 .. _tutorials-autotvm-sec:
 
-AutoTVM: Template Based Auto Tuning
------------------------------------
+Auto tuning
+-----------