From a5e54b1decce06e666f4fe0fa348e97853993dbd Mon Sep 17 00:00:00 2001
From: masahi
Date: Sat, 15 Feb 2020 10:12:06 +0900
Subject: [PATCH 01/73] [QNN] Add support for per channel weight scale in dense op (#4880)

* add test case for per channel dense
* add units arg in tflite frontend
* update qnn legalize test
* fix output dim index
---
 python/tvm/relay/frontend/tflite.py          |  2 +
 python/tvm/relay/qnn/op/qnn.py               |  4 +-
 src/relay/qnn/op/dense.cc                    |  2 +-
 tests/python/relay/test_op_qnn_dense.py      | 75 ++++++--------------
 tests/python/relay/test_pass_qnn_legalize.py |  1 +
 5 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index d889631a4cd8..e92e4cef205d 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -982,6 +982,7 @@ def convert_fully_connected(self, op):
         weight_value = self.get_tensor_value(weight_tensor)
         weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str)
+        weight_shape = _infer_shape(weight_expr)
 
         if input_tensor.qnn_params:
             out = _qnn.op.dense(in_expr, weight_expr,
@@ -989,6 +990,7 @@ def convert_fully_connected(self, op):
                                 input_zero_point=input_tensor.qnn_params['zero_point'],
                                 kernel_zero_point=weight_tensor.qnn_params['zero_point'],
                                 input_scale=input_tensor.qnn_params['scale'],
                                 kernel_scale=weight_tensor.qnn_params['scale'],
+                                units=weight_shape[0],
                                 out_dtype='int32')
         else:
             out = _op.nn.dense(in_expr, weight_expr)
diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py
index eaca625df83b..a7529f6c8505 100644
--- a/python/tvm/relay/qnn/op/qnn.py
+++ b/python/tvm/relay/qnn/op/qnn.py
@@ -345,7 +345,7 @@ def dense(data,
           kernel_zero_point,
           input_scale,
           kernel_scale,
-          units=None,
+          units,
           out_dtype="int32"):
     """Qnn Dense operator.
     Applies a quantized linear transformation
@@ -371,7 +371,7 @@ def dense(data,
         stored for access to this during relay. This information is not
         needed in the pass pipeline after qnn.conv2d is lowered to the
         sequence of steps as in nn.conv2d. See also input_scale in Requantize.
-    units : int, optional
+    units : int
         Number of hidden units of the dense transformation.
     out_dtype : str, optional
         Specifies the output data type for mixed precision dense can be int32 or int16.
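A minimal usage sketch for the API change above (illustrative only, not part of the diff): with this commit `units` becomes a required argument, and `kernel_scale` may be a per-channel 1-D vector whose length matches `units`. The shapes and constants mirror the new test case further down; the variable names are hypothetical.

    import numpy as np
    import tvm
    from tvm import relay

    data = relay.var("data", shape=(2, 10), dtype="int8")      # (batch, in_units)
    kernel = relay.var("kernel", shape=(3, 10), dtype="int8")  # (units, in_units)
    # One scale per output channel; the vector length must equal units (= 3 here).
    kernel_scale = relay.const(np.array([0.5, 0.3, 0.4], dtype="float32"))
    out = relay.qnn.op.dense(data, kernel,
                             input_zero_point=relay.const(-1, "int32"),
                             kernel_zero_point=relay.const(-1, "int32"),
                             input_scale=relay.const(0.5, "float32"),
                             kernel_scale=kernel_scale,
                             units=3,
                             out_dtype="int32")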
diff --git a/src/relay/qnn/op/dense.cc b/src/relay/qnn/op/dense.cc index b7a12e1a64b3..de3c4dbc7dc1 100644 --- a/src/relay/qnn/op/dense.cc +++ b/src/relay/qnn/op/dense.cc @@ -55,7 +55,7 @@ bool QnnDenseRel(const Array& types, int num_inputs, const Attrs& attrs, CHECK(IsScalarType(types[2], DataType::Int(32))); // input_zero_point CHECK(IsScalarType(types[3], DataType::Int(32))); // kernel_zero_point CHECK(IsScalarType(types[4], DataType::Float(32))); // input_scale - CHECK(IsScalarType(types[5], DataType::Float(32))); // kernel_scale + AssignType(types[5], DataType::Float(32), param->units, reporter); CHECK(param->out_dtype.bits() > 0) << "Output dtype bits should be greater than 0."; diff --git a/tests/python/relay/test_op_qnn_dense.py b/tests/python/relay/test_op_qnn_dense.py index 0e7c284653f4..43600cbf60c5 100644 --- a/tests/python/relay/test_op_qnn_dense.py +++ b/tests/python/relay/test_op_qnn_dense.py @@ -75,52 +75,8 @@ def make_configuration(quantized_data, return config -def make_uint_configuration(use_bias=False, requantize_output=False): - input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3) - input_zero_point, kernel_zero_point = 127, 127 - input_scale = 0.5 - kernel_scale = 0.5 - output_scale = 1.0 - in_dtype = 'uint8' - out_dtype = 'int32' if not requantize_output else 'uint8' - units = 3 - quantized_data_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 109, 107, - 129, 131, 133, 135, 137, 139, 141, 111, 145, 107]) \ - .astype(in_dtype) \ - .reshape(input_shape) - quantized_kernel_np = np.array([129, 131, 133, 135, 137, 139, 141, 143, 145, 147, - 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, - 129, 131, 133, 135, 137, 139, 141, 143, 145, 147]) \ - .astype(in_dtype) \ - .reshape(kernel_shape) - bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None - requant_params = make_requantize_params(input_scale * kernel_scale, output_scale, 127, 'uint8') if requantize_output else None - - if requantize_output: - assert use_bias - output = np.array([151, 152, 153, 185, 186, 187]) - elif use_bias: - output = np.array([96, 100, 104, 232, 236, 240 ]) - else: - output = np.array([92, 92, 92, 228, 228, 228 ]) - output = output.astype(out_dtype).reshape(output_shape) - return make_configuration(quantized_data=quantized_data_np, - quantized_kernel=quantized_kernel_np, - dtype=in_dtype, - input_shape=input_shape, - kernel_shape=kernel_shape, - input_zero_point=input_zero_point, - kernel_zero_point=kernel_zero_point, - input_scale=input_scale, - kernel_scale= kernel_scale, - units=units, - output=output, - bias=bias, - requantize=requant_params) - - -def make_int_configuration(use_bias=False, requantize_output=False): - input_shape, kernel_shape, output_shape = (2, 10), (3,10), (2, 3) +def make_int_configuration(use_bias=False, requantize_output=False, per_channel=False): + input_shape, kernel_shape, output_shape = (2, 10), (3, 10), (2, 3) input_zero_point, kernel_zero_point = -1, -1 in_dtype = 'int8' out_dtype = 'int32' if not requantize_output else 'int8' @@ -138,15 +94,22 @@ def make_int_configuration(use_bias=False, requantize_output=False): kernel_scale = 0.5 output_scale = 1.0 bias = np.array([4, 8, 12]).astype(out_dtype).reshape((units, )) if use_bias else None - requant_params = make_requantize_params(input_scale * kernel_scale, output_scale, -1, 'int8') if requantize_output else None - if requantize_output: + if per_channel: + assert use_bias and requantize_output + kernel_scale = np.array([0.5, 0.3, 0.4], dtype=np.float32) + output = 
np.array([23, 14, 20, 57, 34, 47]) + elif requantize_output: assert use_bias output = np.array([23, 24, 25, 57, 58, 59]) elif use_bias: - output = np.array([96, 100, 104, 232, 236, 240 ]) + output = np.array([96, 100, 104, 232, 236, 240]) else: - output = np.array([92, 92, 92, 228, 228, 228 ]) + output = np.array([92, 92, 92, 228, 228, 228]) + + requant_params = make_requantize_params(input_scale * kernel_scale, + output_scale, -1, 'int8') if requantize_output else None + output = output.astype(out_dtype).reshape(output_shape) return make_configuration(quantized_data=quantized_data_np, quantized_kernel=quantized_kernel_np, @@ -206,8 +169,8 @@ def qnn_dense_driver(test_configuration): with relay.build_config(opt_level=2): graph, lib, params = relay.build(mod, "llvm", params=None) mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) - mod.set_input(quantized_data_name,test_configuration[quantized_data_name]) - mod.set_input(quantized_kernel_name,test_configuration[quantized_kernel_name]) + mod.set_input(quantized_data_name, test_configuration[quantized_data_name]) + mod.set_input(quantized_kernel_name, test_configuration[quantized_kernel_name]) if test_configuration[bias_name] is not None: mod.set_input(bias_name, test_configuration[bias_name]) mod.set_input(**params) @@ -241,7 +204,15 @@ def test_qnn_dense_with_requantized_output(): qnn_dense_driver(int8_requantized_output_with_bias_params) +def test_per_channel_weight_scale(): + with TempOpAttr("qnn.dense", "FTVMQnnLegalize", legalize_qnn_dense): + config = make_int_configuration(use_bias=True, requantize_output=True, + per_channel=True) + qnn_dense_driver(config) + + if __name__ == "__main__": test_qnn_dense_without_bias() test_qnn_dense_with_bias() test_qnn_dense_with_requantized_output() + test_per_channel_weight_scale() diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index e5893c9904b5..dee19f766605 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -191,6 +191,7 @@ def _get_mod(data_dtype, kernel_dtype): kernel_zero_point=relay.const(1, 'int32'), input_scale=relay.const(1, 'float32'), kernel_scale=relay.const(1, 'float32'), + units=kernel_shape[0], out_dtype='int32') mod = relay.Function(relay.analysis.free_vars(func), func) From feda150e34a79af3ea688ab27f9742720390b1e2 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Fri, 14 Feb 2020 20:15:47 -0800 Subject: [PATCH 02/73] [AutoTVM] Support range in index based tuners (#4870) * Support range in index based tuners * Address comments * Remove __*state__ * trigger CI --- python/tvm/autotvm/tuner/__init__.py | 2 +- python/tvm/autotvm/tuner/gridsearch_tuner.py | 85 -------------- python/tvm/autotvm/tuner/index_based_tuner.py | 110 ++++++++++++++++++ tests/python/unittest/test_autotvm_common.py | 16 ++- .../unittest/test_autotvm_index_tuner.py | 68 +++++++++++ tests/python/unittest/test_autotvm_measure.py | 23 +--- 6 files changed, 200 insertions(+), 104 deletions(-) delete mode 100644 python/tvm/autotvm/tuner/gridsearch_tuner.py create mode 100644 python/tvm/autotvm/tuner/index_based_tuner.py create mode 100644 tests/python/unittest/test_autotvm_index_tuner.py diff --git a/python/tvm/autotvm/tuner/__init__.py b/python/tvm/autotvm/tuner/__init__.py index c5ad6bff112a..7ffe9a2294c5 100644 --- a/python/tvm/autotvm/tuner/__init__.py +++ b/python/tvm/autotvm/tuner/__init__.py @@ -25,6 +25,6 @@ from .tuner import Tuner -from .gridsearch_tuner import GridSearchTuner, RandomTuner 
+from .index_based_tuner import GridSearchTuner, RandomTuner from .ga_tuner import GATuner from .xgboost_tuner import XGBTuner diff --git a/python/tvm/autotvm/tuner/gridsearch_tuner.py b/python/tvm/autotvm/tuner/gridsearch_tuner.py deleted file mode 100644 index 4e9a4a2821d7..000000000000 --- a/python/tvm/autotvm/tuner/gridsearch_tuner.py +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=abstract-method -"""Grid search tuner and random tuner""" - -import numpy as np - -from .tuner import Tuner - - -class GridSearchTuner(Tuner): - """Enumerate the search space in a grid search order""" - def __init__(self, task): - super(GridSearchTuner, self).__init__(task) - self.counter = 0 - - def next_batch(self, batch_size): - ret = [] - for _ in range(batch_size): - if self.counter >= len(self.task.config_space): - continue - index = self.counter - ret.append(self.task.config_space.get(index)) - self.counter = self.counter + 1 - return ret - - def has_next(self): - return self.counter < len(self.task.config_space) - - def load_history(self, data_set): - pass - - def __getstate__(self): - return {"counter": self.counter} - - def __setstate__(self, state): - self.counter = state['counter'] - - -class RandomTuner(Tuner): - """Enumerate the search space in a random order""" - def __init__(self, task): - super(RandomTuner, self).__init__(task) - self.visited = set() - - def next_batch(self, batch_size): - ret = [] - counter = 0 - while counter < batch_size: - if len(self.visited) >= len(self.task.config_space): - break - index = np.random.randint(len(self.task.config_space)) - while index in self.visited: - index = np.random.randint(len(self.task.config_space)) - - ret.append(self.task.config_space.get(index)) - self.visited.add(index) - counter += 1 - return ret - - def has_next(self): - return len(self.visited) < len(self.task.config_space) - - def load_history(self, data_set): - pass - - def __getstate__(self): - return {"visited": self.counter} - - def __setstate__(self, state): - self.counter = state['visited'] diff --git a/python/tvm/autotvm/tuner/index_based_tuner.py b/python/tvm/autotvm/tuner/index_based_tuner.py new file mode 100644 index 000000000000..99fc9f288881 --- /dev/null +++ b/python/tvm/autotvm/tuner/index_based_tuner.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=abstract-method
+"""Grid search tuner and random tuner"""
+
+import numpy as np
+
+from .tuner import Tuner
+
+
+class IndexBaseTuner(Tuner):
+    """Base class for index-based tuners.
+    This type of tuner determines the next batch of configs based on config indices.
+
+    Parameters
+    ----------
+    task: autotvm.task.Task
+        The tuning task
+
+    range_idx: Optional[Tuple[int, int]]
+        A tuple of index range that this tuner can select from
+    """
+    def __init__(self, task, range_idx=None):
+        super(IndexBaseTuner, self).__init__(task)
+        assert range_idx is None or isinstance(range_idx, tuple), \
+            "range_idx must be None or (int, int)"
+
+        self.range_length = len(self.task.config_space)
+        self.index_offset = 0
+        if range_idx is not None:
+            assert range_idx[1] > range_idx[0], "Index range must be positive"
+            assert range_idx[0] >= 0, "Start index must be non-negative"
+            self.range_length = range_idx[1] - range_idx[0] + 1
+            self.index_offset = range_idx[0]
+        self.counter = 0
+
+    def has_next(self):
+        return self.counter < self.range_length
+
+    def load_history(self, data_set):
+        pass
+
+
+class GridSearchTuner(IndexBaseTuner):
+    """Enumerate the search space in a grid search order"""
+
+    def next_batch(self, batch_size):
+        ret = []
+        for _ in range(batch_size):
+            if self.counter >= self.range_length:
+                break
+            index = self.counter + self.index_offset
+            ret.append(self.task.config_space.get(index))
+            self.counter = self.counter + 1
+        return ret
+
+
+class RandomTuner(IndexBaseTuner):
+    """Enumerate the search space in a random order
+
+    Parameters
+    ----------
+    task: autotvm.task.Task
+        Tuning Task
+
+    range_idx: Optional[Tuple[int, int]]
+        A tuple of index range to randomly select from
+    """
+    def __init__(self, task, range_idx=None):
+        super(RandomTuner, self).__init__(task, range_idx)
+
+        # Use a dict to mimic a range(n) list without storing rand_state[i] = i
+        # entries so that we can generate non-repetitive random indices.
+        self.rand_state = {}
+        self.rand_max = self.range_length
+        self.visited = []
+
+    def next_batch(self, batch_size):
+        ret = []
+        for _ in range(batch_size):
+            if self.rand_max == 0:
+                break
+
+            # Randomly pick an indirect index.
+            index_ = np.random.randint(self.rand_max)
+            self.rand_max -= 1
+
+            # Use the indirect index to get a direct index.
+            index = self.rand_state.get(index_, index_) + self.index_offset
+            ret.append(self.task.config_space.get(index))
+            self.visited.append(index)
+
+            # Update the direct index map.
+ self.rand_state[index_] = self.rand_state.get(self.rand_max, self.rand_max) + self.rand_state.pop(self.rand_max, None) + self.counter += 1 + return ret diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py index 7043e473ec4d..fac9f062a2e8 100644 --- a/tests/python/unittest/test_autotvm_common.py +++ b/tests/python/unittest/test_autotvm_common.py @@ -17,9 +17,24 @@ """Common utilities for testing autotvm""" import time +import numpy as np + import tvm from tvm import autotvm from tvm.autotvm import MeasureInput, MeasureResult +from tvm.autotvm.measure.measure import Runner + + +class DummyRunner(Runner): + def __init__(self): + super(DummyRunner, self).__init__(1, 1) + + def run(self, measure_inputs, build_results): + return [MeasureResult((np.random.random(),), 0, 0.2, time.time()) + for _ in range(len(measure_inputs))] + + def get_build_kwargs(self): + return {} @autotvm.template def matmul(N, L, M, dtype): @@ -82,4 +97,3 @@ def get_sample_records(n): inps.append(MeasureInput(target, tsk, tsk.config_space.get(i))) ress.append(MeasureResult((i+1,), 0, i, time.time())) return list(zip(inps, ress)) - diff --git a/tests/python/unittest/test_autotvm_index_tuner.py b/tests/python/unittest/test_autotvm_index_tuner.py new file mode 100644 index 000000000000..c7fa2ea364b5 --- /dev/null +++ b/tests/python/unittest/test_autotvm_index_tuner.py @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Test index based tuners""" + +from test_autotvm_common import DummyRunner, get_sample_task +from tvm import autotvm +from tvm.autotvm.tuner import GridSearchTuner, RandomTuner + + +def test_gridsearch_tuner(): + """Test GridSearchTuner""" + + task, _ = get_sample_task() + measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) + + # When no range index, range_length should be the length of config space + tuner = autotvm.tuner.GridSearchTuner(task) + assert tuner.range_length == len(task.config_space) + assert tuner.index_offset == 0 + + # With range index, range_length should be the length of the specified range + tuner = autotvm.tuner.GridSearchTuner(task, range_idx=(8, 15)) + assert tuner.range_length == 8 + assert tuner.index_offset == 8 + + # Tuner should only focus on the specified range + tuner.tune(n_trial=8, measure_option=measure_option) + assert tuner.counter == 8 + assert not tuner.has_next() + + +def test_random_tuner(): + """Test RandomTuner""" + + task, _ = get_sample_task() + measure_option = autotvm.measure_option(builder=autotvm.LocalBuilder(), runner=DummyRunner()) + + tuner = autotvm.tuner.RandomTuner(task, range_idx=(8, 15)) + assert tuner.range_length == 8 + assert tuner.index_offset == 8 + + # Tuner should only focus on the specified range and should visit all indices + tuner.tune(n_trial=8, measure_option=measure_option) + assert tuner.counter == 8 + assert not tuner.has_next() + visited = set() + for idx in tuner.visited: + assert idx not in visited + assert 8 <= idx <= 15 + + +if __name__ == '__main__': + test_gridsearch_tuner() + test_random_tuner() \ No newline at end of file diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index 29009487cd7e..48a1d31899e7 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -21,24 +21,14 @@ import numpy as np import tvm +from test_autotvm_common import DummyRunner, bad_matmul, get_sample_task from tvm import autotvm -from test_autotvm_common import get_sample_task, bad_matmul -from tvm.autotvm.measure.measure import Runner, MeasureResult, MeasureErrorNo +from tvm.autotvm.measure.measure import MeasureErrorNo, MeasureResult + def test_task_tuner_without_measurement(): """test task and tuner without measurement""" - task, target = get_sample_task() - - class DummyRunner(Runner): - def __init__(self): - super(DummyRunner, self).__init__(1, 1) - - def run(self, measure_inputs, build_results): - return [MeasureResult((np.random.random(),), 0, 0.2, time.time()) - for _ in range(len(measure_inputs))] - - def get_build_kwargs(self): - return {} + task, _ = get_sample_task() measure_option = autotvm.measure_option( builder=autotvm.LocalBuilder(), @@ -64,7 +54,7 @@ def test_check_correctness(): ) def _callback_correct(tuner, measure_inputs, measure_results): - for inp, res in zip(measure_inputs, measure_results): + for _, res in zip(measure_inputs, measure_results): assert res.error_no == 0 tuner = autotvm.tuner.RandomTuner(task) @@ -77,7 +67,7 @@ def _callback_correct(tuner, measure_inputs, measure_results): task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target) def _callback_wrong(tuner, measure_inputs, measure_results): - for inp, res in zip(measure_inputs, measure_results): + for _, res in zip(measure_inputs, measure_results): assert res.error_no == MeasureErrorNo.WRONG_ANSWER tuner = autotvm.tuner.RandomTuner(task) @@ -90,4 +80,3 @@ def 
_callback_wrong(tuner, measure_inputs, measure_results): test_task_tuner_without_measurement() test_check_correctness() - From 7e9ec7352d487fddb0be25f4b53cce3abfad71c9 Mon Sep 17 00:00:00 2001 From: masahi Date: Sun, 16 Feb 2020 03:32:58 +0900 Subject: [PATCH 03/73] improve antlr import error message (#4888) --- python/tvm/relay/_parser.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index 78ab8ff63ec3..49bdbb393c2e 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -47,13 +47,6 @@ def __new__(cls, *args, **kwds): from . import op PYTHON_VERSION = sys.version_info.major -try: - from .grammar.py3.RelayVisitor import RelayVisitor - from .grammar.py3.RelayParser import RelayParser - from .grammar.py3.RelayLexer import RelayLexer -except ImportError: - raise Exception("Couldn't find ANTLR parser. Try building with USE_ANTLR=ON.") - try: from antlr4 import InputStream, CommonTokenStream from antlr4.error.ErrorListener import ErrorListener @@ -62,6 +55,14 @@ def __new__(cls, *args, **kwds): "Try running `pip{version} install antlr4-python{version}-runtime`." .format(version=PYTHON_VERSION)) +try: + from .grammar.py3.RelayVisitor import RelayVisitor + from .grammar.py3.RelayParser import RelayParser + from .grammar.py3.RelayLexer import RelayLexer +except ImportError: + raise Exception("Couldn't find ANTLR parser. Try building with USE_ANTLR=ON.") + + sys.setrecursionlimit(10000) class ParseError(Exception): From d50ba721eb5f7c0dbeceeaa78335d6f4c8cf2973 Mon Sep 17 00:00:00 2001 From: wpan11nv <60017475+wpan11nv@users.noreply.github.com> Date: Sat, 15 Feb 2020 19:47:36 -0800 Subject: [PATCH 04/73] [CodeGen][CUDA] Fix issues in cuda codegen (#4876) - Do not emit __shared__ etc. as part of type for casting - Fix fp16 reduction kernels with compiler errors: "no operator "+" matches these operands, volatile half + volatile half This patch inserts casts to remove volatile type qualifier following volatile loads (fp16 only). CUDA fp16 library headers should add volatile member functions. - Update have_fp16 to include compute 6.1 GPUs, which do support fp16, although their fp16 throughput is low. Updated tests. Signed-off-by: Wei Pan --- python/tvm/contrib/nvcc.py | 6 +--- src/target/source/codegen_c.cc | 13 +++---- src/target/source/codegen_c.h | 34 +++++++++++++++++- src/target/source/codegen_cuda.cc | 28 +++++++-------- src/target/source/codegen_cuda.h | 9 +++++ tests/python/unittest/test_codegen_cuda.py | 41 ++++++++++++++++++---- topi/tests/python/test_topi_relu.py | 14 ++------ topi/tests/python/test_topi_tensor.py | 14 ++------ 8 files changed, 105 insertions(+), 54 deletions(-) diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index c50a9cef2889..8712f73c2343 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -232,11 +232,7 @@ def have_fp16(compute_version): # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#arithmetic-instructions if major == 5 and minor == 3: return True - # NOTE: exclude compute capability 6.1 devices although it is actually available - # to compute fp16, because these devices only have low-rate fp16 performance. 
- if major == 6 and minor != 1: - return True - if major == 7: + if major >= 6: return True return False diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index b871b26008eb..7f89307c04a3 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -153,14 +153,15 @@ std::string CodeGenC::GetBufferRef( if (alloc_storage_scope_.count(buffer)) { scope = alloc_storage_scope_.at(buffer); } - bool is_vol = volatile_buf_.count(buffer) != 0; + bool is_vol = IsVolatile(buffer); if (t.lanes() == 1) { if (!HandleTypeMatch(buffer, t) || is_vol) { os << "(("; if (is_vol) { os << "volatile "; } - if (scope.length() != 0) { + // Scope may not be part of type. + if (!scope.empty() && IsScopePartOfType()) { PrintStorageScope(scope, os); } os << ' '; @@ -189,7 +190,7 @@ std::string CodeGenC::GetBufferRef( if (is_vol) { os << "volatile "; } - if (scope.length() != 0) { + if (!scope.empty() && IsScopePartOfType()) { PrintStorageScope(scope, os); } os << ' '; @@ -197,7 +198,7 @@ std::string CodeGenC::GetBufferRef( os << "*)("; if (!HandleTypeMatch(buffer, t.element_of())) { os << '('; - if (scope.length() != 0) { + if (!scope.empty() && IsScopePartOfType()) { PrintStorageScope(scope, os); } os << ' '; @@ -620,14 +621,14 @@ void CodeGenC::VisitExpr_(const LoadNode* op, std::ostream& os) { // NOLINT(*) // delcare type. if (op->dtype.lanes() == 1) { std::string ref = GetBufferRef(op->dtype, op->buffer_var.get(), op->index); - os << ref; + HandleVolatileLoads(ref, op, os); } else { CHECK(is_one(op->predicate)) << "predicated load is not supported"; PrimExpr base; if (GetRamp1Base(op->index, op->dtype.lanes(), &base)) { std::string ref = GetVecLoad(op->dtype, op->buffer_var.get(), base); - os << ref; + HandleVolatileLoads(ref, op, os); } else { // The assignment below introduces side-effect, and the resulting value cannot // be reused across multiple expression, thus a new scope is needed diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h index 00ed91278c33..c6da1c4ceb9f 100644 --- a/src/target/source/codegen_c.h +++ b/src/target/source/codegen_c.h @@ -178,9 +178,36 @@ class CodeGenC : // Print reference to struct location std::string GetStructRef( DataType t, const PrimExpr& buffer, const PrimExpr& index, int kind); - // print reference to a buffer as type t in index. + // Print reference to a buffer as type t in index. virtual std::string GetBufferRef( DataType t, const VarNode* buffer, PrimExpr index); + + /*! + * \brief Handle volatile loads. + * + * This is to workaround a bug in CUDA cuda_fp16.h. Volatile accesses + * to shared memory are required for reductions. However, __half class + * does not implement volatile member functions. CUDA codegen will cast + * away volatile qualifier from CUDA __half types. + */ + virtual void HandleVolatileLoads(const std::string& value, const LoadNode* op, + std::ostream& os) { + // By default, do nothing but print the loaded value. + os << value; + } + + /*! + * \brief Check if scope is part of type in the target language. + * + * **NOTE** In OpenCL, __local is part of type, so "__local int *" + * is legal. This is not the case for CUDA, where "__shared__" + * or "__constant__" is not part of type but a storage class (like + * C/C++ static). + */ + virtual bool IsScopePartOfType() const { + return true; + } + /*! * \brief If buffer is allocated as type t. * \param buf_var The buffer variable. @@ -205,6 +232,11 @@ class CodeGenC : /*! 
\brief reserves common C keywords */ void ReserveKeywordsAsUnique(); + /*! \brief Check if buf_var is volatile or not. */ + bool IsVolatile(const VarNode *buf_var) const { + return volatile_buf_.count(buf_var) != 0; + } + private: /*! \brief whether to print in SSA form */ bool print_ssa_form_{false}; diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 0b2c54e592ce..889d8b6a62d4 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -57,20 +57,6 @@ std::string CodeGenCUDA::Finish() { << "{\n return __hgt(__half(a), __half(b)) ? a : b;\n}\n"; decl_stream << "__device__ half min(half a, half b)\n" << "{\n return __hlt(__half(a), __half(b)) ? a : b;\n}\n"; - // FIXME(tvm-team): "volatile" is used to enable cross thread reduction, - // which is needed by operations such as softmax. - // However, volatile overloading is not supported in NVRTC and CUDA < 9.2. - // We need to figure out a solution which can satisfy both scenario. - // decl_stream << "__device__ half operator<=" - // << "(const volatile __half &a, const volatile __half &b)\n" - // << "{\n return __hlt(a, b);\n}\n"; - // decl_stream << "__device__ half operator+" - // << "(const volatile __half &a, const volatile __half &b)\n" - // <<"{\n return __hadd(a, b);\n}\n"; - // decl_stream << "__device__ half operator*" - // << "(const volatile __half &a, const volatile __half &b)\n" - // << "{\n return __hmul(a, b);\n}\n"; - // otherwise simulate computation via float32 decl_stream << "#else\n"; decl_stream << _cuda_half_t_def; decl_stream << "#endif\n\n"; @@ -605,5 +591,19 @@ int32_t CodeGenCUDA::GetWmmaFragmentSize(const std::string &scope, return 0; } +void CodeGenCUDA::HandleVolatileLoads(const std::string& value, + const LoadNode* op, std::ostream& os) { + // Cast away volatile qualifier for fp16 types. That is, only loads and + // stores are volatile. The loaded objects are not marked as volatile. + // + if (op->dtype.is_float16() && IsVolatile(op->buffer_var.get())) { + os << "("; + PrintType(op->dtype, os); + os << ")(" << value << ")"; + } else { + os << value; + } +} + } // namespace codegen } // namespace tvm diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h index eca687152784..d0a98a69a19f 100644 --- a/src/target/source/codegen_cuda.h +++ b/src/target/source/codegen_cuda.h @@ -66,6 +66,15 @@ class CodeGenCUDA final : public CodeGenC { void VisitStmt_(const AttrStmtNode *op) final; private: + // Handle volatile loads + void HandleVolatileLoads(const std::string& value, const LoadNode* op, + std::ostream& os) final; + + // Whether scope such as "__shared__" or "__constant__" is part of type. + bool IsScopePartOfType() const final { + return false; + } + // Whether global barrier is needed. bool need_global_barrier_{false}; // Global barrier state diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py index 79b3544f46eb..ec36a5fa5a7a 100644 --- a/tests/python/unittest/test_codegen_cuda.py +++ b/tests/python/unittest/test_codegen_cuda.py @@ -17,8 +17,9 @@ # under the License. 
import tvm import numpy as np +import topi import unittest -from tvm.contrib.nvcc import parse_compute_version, have_int8 +from tvm.contrib.nvcc import have_fp16, have_int8 from tvm.contrib import nvcc tx = tvm.thread_axis("threadIdx.x") @@ -30,11 +31,8 @@ def check_cuda(dtype, n, lanes): if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): print("skip because cuda is not enabled..") return - if dtype == "float16": - major, minor = parse_compute_version(tvm.gpu(0).compute_version) - # fp16 starts from 5.3 - if major < 6 or (major == 5 and minor < 3): - print("skip because gpu does not support fp16") + if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): + print("Skip because gpu does not have fp16 support") return if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version): print("skip because gpu does not support int8") @@ -291,6 +289,36 @@ def test_cuda_const_float_to_half(): func(a, c) np.testing.assert_equal(c.asnumpy(), a_np > b.value) +def test_cuda_reduction(): + def check_cuda(dtype, m=32, n=32): + if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): + print("skip because cuda is not enabled..") + return + if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version): + print("Skip because gpu does not have fp16 support") + return + + a = tvm.placeholder((m, n), name="a", dtype=dtype) + b = tvm.placeholder((m, n), name="b", dtype=dtype) + c = a + b + d = a * b + e = topi.elemwise_sum([c, d]) + g = topi.sum(e) + with tvm.target.cuda(): + sg = topi.generic.schedule_reduce(g) + ctx = tvm.gpu(0) + func = tvm.build(sg, [a, b, g], 'cuda') + a_np = np.random.uniform(size=(m, n)).astype(a.dtype) + b_np = np.random.uniform(size=(m, n)).astype(b.dtype) + g_np = np.sum(np.add(a_np * b_np, a_np + b_np)) + a_nd = tvm.nd.array(a_np, ctx) + b_nd = tvm.nd.array(b_np, ctx) + g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx) + func(a_nd, b_nd, g_nd) + tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3) + + check_cuda("float32") + check_cuda("float16") if __name__ == "__main__": test_cuda_vectorize_add() @@ -302,3 +330,4 @@ def test_cuda_const_float_to_half(): test_cuda_reducition_binding() test_rfactor_predicates() test_cuda_const_float_to_half() + test_cuda_reduction() \ No newline at end of file diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index 414edbca4f0f..8868d4ebffe3 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -20,18 +20,9 @@ import tvm import topi from topi.util import get_const_tuple -from tvm.contrib.nvcc import parse_compute_version +from tvm.contrib.nvcc import have_fp16 from common import get_all_backend -def skip_test(dtype, device): - if dtype == "float16" and device == "cuda": - major, minor = parse_compute_version(tvm.gpu(0).compute_version) - # fp16 starts from 5.3 - if major < 6 or (major == 5 and minor < 3): - print("skip because gpu does not support fp16") - return True - return False - def verify_relu(m, n, dtype="float32"): A = tvm.placeholder((m, n), name='A', dtype=dtype) B = topi.nn.relu(A) @@ -44,7 +35,8 @@ def check_device(device): if not ctx.exist: print("Skip because %s is not enabled" % device) return - if skip_test(dtype, device): + if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version): + print("Skip because %s does not have fp16 support" % device) return print("Running on target: %s" % device) with tvm.target.create(device): diff --git a/topi/tests/python/test_topi_tensor.py 
b/topi/tests/python/test_topi_tensor.py
index 84718ff3a647..8e7073f4060b 100644
--- a/topi/tests/python/test_topi_tensor.py
+++ b/topi/tests/python/test_topi_tensor.py
@@ -19,16 +19,7 @@
 import tvm
 import topi
 from tvm.contrib.pickle_memoize import memoize
-from tvm.contrib.nvcc import parse_compute_version
-
-def skip_test(dtype, device):
-    if dtype == "float16" and device == "cuda":
-        major, minor = parse_compute_version(tvm.gpu(0).compute_version)
-        # fp16 starts from 5.3
-        if major < 6 or (major == 5 and minor < 3):
-            print("skip because gpu does not support fp16")
-            return True
-    return False
+from tvm.contrib.nvcc import have_fp16
 
 def verify_elemwise_sum(num_args, dtype):
     shape = (3,5,4)
@@ -99,7 +90,8 @@ def check_device(device):
         if not tvm.runtime.enabled(device):
             print("Skip because %s is not enabled" % device)
             return
-        if skip_test(dtype, device):
+        if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
+            print("Skip because gpu does not have fp16 support")
             return
         with tvm.target.create(device):
             ctx = tvm.context(device, 0)

From 529ee1feb6c96967d8ab28e08b72006c6d7e8887 Mon Sep 17 00:00:00 2001
From: masahi
Date: Sun, 16 Feb 2020 15:43:22 +0900
Subject: [PATCH 05/73] [Relay] Fix VM compiler for while loop with free vars (#4889)

* add additional switch to handle nested call node
* Fix VM compiler for while loop with free var
---
 src/relay/backend/vm/compiler.cc |  3 +++
 tests/python/relay/test_vm.py    | 27 +++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index 8d4f4addaca9..73a6450c16ec 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -637,6 +637,9 @@ class VMFunctionCompiler : ExprFunctor<void(const Expr& expr)> {
       // emit invoke closure here.
       VisitExpr(GetRef<Var>(var_node));
       Emit(Instruction::InvokeClosure(last_register_, args_registers, NewRegister()));
+    } else if (auto inner_call_node = op.as<CallNode>()) {
+      VisitExpr(GetRef<Call>(inner_call_node));
+      Emit(Instruction::InvokeClosure(last_register_, args_registers, NewRegister()));
     } else {
       // Finally if there are any other cases this is a bug.
LOG(FATAL) << "internal error: unreachable code," diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index c4cd616cdec0..8cac656ee5a1 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -23,6 +23,7 @@ from tvm.relay.scope_builder import ScopeBuilder from tvm.relay.testing.config import ctx_list from tvm.relay.prelude import Prelude +from tvm.relay.loops import while_loop from tvm.relay import testing def check_result(args, expected_result, mod=None): @@ -576,5 +577,31 @@ def test_vm_optimize(): comp = relay.vm.VMCompiler() opt_mod, _ = comp.optimize(mod, "llvm", params) +def test_loop_free_var(): + x = relay.var('x', shape=(), dtype='int32') + i = relay.var('i', shape=(), dtype='int32') + s = relay.var('s', shape=(), dtype='int32') + + def cond(i, _): + return i < relay.const(10, dtype='int32') + + def body_no_free_var(i, acc): + incr = relay.const(1, "int32") + return i + incr, acc + i + + def body_with_free_var(i, acc): + incr = relay.const(1, "int32") + return i + incr, acc + x + + for args, body, expected in zip([[], [1]], + [body_no_free_var, body_with_free_var], + [45, 10]): + loop = while_loop(cond, [i, s], body) + tup = loop(relay.const(0, dtype='int32'), relay.zeros(shape=(), dtype='int32')) + ret = relay.TupleGetItem(tup, 1) + mod = tvm.IRModule() + mod["main"] = relay.Function(relay.analysis.free_vars(ret), ret) + check_result(args, expected, mod=mod) + if __name__ == "__main__": pytest.main([__file__]) From e7be8bf43de4c1b19ea68134812ea7b0cd8e361f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 16 Feb 2020 15:02:39 -0800 Subject: [PATCH 06/73] [CI] Cleanup logfile before tutorial runs (#4896) --- tests/scripts/task_python_docs.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index 951d1a3a317b..d463a61e1802 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -27,6 +27,9 @@ rm -rf docs/_build/html/javadoc rm -rf docs/tutorials rm -rf docs/vta/tutorials +# cleanup stale log files +find . -type f -path "*.log" | xargs rm -f + # C++ doc make doc From 95de08ba4f0d90dde308f4b2b401da8aaa333d2b Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Sun, 16 Feb 2020 17:44:22 -0800 Subject: [PATCH 07/73] Fix alpha_equal bug (#4897) --- src/relay/ir/alpha_equal.cc | 2 +- tests/python/relay/test_ir_nodes.py | 2 + tests/python/relay/test_pass_alpha_equal.py | 25 ++++- tests/python/relay/test_pass_fuse_ops.py | 36 ++++++- .../python/relay/test_pass_merge_composite.py | 93 ++++++++++--------- 5 files changed, 109 insertions(+), 49 deletions(-) diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc index 48634bafa744..78688d7dc730 100644 --- a/src/relay/ir/alpha_equal.cc +++ b/src/relay/ir/alpha_equal.cc @@ -92,7 +92,7 @@ class AlphaEqualHandler: auto compute = [&]() { if (&lhs == &rhs) return true; if (auto lhsd = lhs.as()) { - auto rhsd = lhs.as(); + auto rhsd = rhs.as(); if (!rhsd) return false; if (lhsd->dict.size() != rhsd->dict.size()) return false; for (const auto& k : lhsd->dict) { diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py index ad1525576d08..bdda72ca8702 100644 --- a/tests/python/relay/test_ir_nodes.py +++ b/tests/python/relay/test_ir_nodes.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
""" test ir""" +import pytest import tvm from tvm import relay from tvm.tir.expr import * @@ -174,6 +175,7 @@ def test_function(): str(fn) check_json_roundtrip(fn) +@pytest.mark.skip(reason="AttrsEqualHandler doesn't handle Map so far.") def test_function_attrs(): param_names = ['a', 'b', 'c', 'd'] params = tvm.convert([relay.var(n, shape=(5, 2)) for n in param_names]) diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py index 5985273ce6de..0319d0b1a371 100644 --- a/tests/python/relay/test_pass_alpha_equal.py +++ b/tests/python/relay/test_pass_alpha_equal.py @@ -18,6 +18,7 @@ import tvm from tvm import relay from tvm.relay import analysis +from tvm.relay.testing import run_opt_pass def alpha_equal(x, y): """ @@ -313,7 +314,7 @@ def test_tuple_get_item_alpha_equal(): assert alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 1)) -def test_multi_node_subgraph(): +def test_function_attr(): x0 = relay.var('x0', shape=(10, 10)) w00 = relay.var('w00', shape=(10, 10)) w01 = relay.var('w01', shape=(10, 10)) @@ -607,6 +608,7 @@ def test_graph_equal(): z3 = relay.add(relay.add(x, x), relay.add(x, x)) + assert alpha_equal(z0, z1) assert alpha_equal(z0, z1) # z3's dataflow format is different from z0 @@ -649,6 +651,26 @@ def test_tuple_match(): assert analysis.structural_hash(x) == analysis.structural_hash(y) +def test_fn_attribute(): + # create function that performs add + a = relay.var('a', shape=(10, 10)) + b = relay.var('b', shape=(10, 10)) + add = relay.add(a, b) + add_fn = relay.Function([a, b], add) + add_fn = run_opt_pass(add_fn, relay.transform.InferType()) + + # create function that performs add with test attribute + c = relay.var('c', shape=(10, 10)) + d = relay.var('d', shape=(10, 10)) + add_1 = relay.add(c, d) + add_1_fn = relay.Function([c, d], add_1) + add_1_fn = add_1_fn.set_attribute("TestAttribute", tvm.tir.StringImm("test")) + add_1_fn = run_opt_pass(add_1_fn, relay.transform.InferType()) + + assert not relay.analysis.alpha_equal(add_1_fn, add_fn) + assert not relay.analysis.alpha_equal(add_fn, add_1_fn) + + if __name__ == "__main__": test_tensor_type_alpha_equal() test_incomplete_type_alpha_equal() @@ -672,3 +694,4 @@ def test_tuple_match(): test_var_alpha_equal() test_graph_equal() test_hash_unequal() + test_fn_attribute() diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 18916f758a6c..e11b6aeb0a2c 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -35,6 +35,7 @@ def expected(): z = relay.exp(y) w = relay.squeeze(z) f1 = relay.Function([x], w) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=(10, 20)) y = relay.Call(f1, [x]) return relay.Function([x], y) @@ -76,6 +77,8 @@ def expected(dshape): x = relay.var("p0", shape=dshape) y = relay.add(x, relay.const(1, "float32")) f0 = relay.Function([x], y) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) + # segment 1 x = relay.var("p0", shape=dshape) w = relay.var("p1") @@ -86,6 +89,8 @@ def expected(dshape): y1 = relay.add(relay.const(1, "float32"), y) y = relay.add(y, y1) f1 = relay.Function([x, w], y) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) + # segment 2 x = relay.var("p0", shape=dshape) w = relay.var("p1") @@ -94,6 +99,8 @@ def expected(dshape): padding=(1,1), channels=16) f2 = relay.Function([x, w], z2) + f2 = f2.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) + # segment 3 x = 
relay.var("p0", shape=dshape) w = relay.var("p1") @@ -104,6 +111,8 @@ def expected(dshape): channels=16) z3 = relay.add(z3, offset) f3 = relay.Function([x, w, offset], z3) + f3 = f3.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) + # compose x = relay.var("x", shape=dshape) y = relay.Call(f0, [x]) @@ -135,6 +144,7 @@ def expected(dshape): x = relay.var("x", shape=dshape) pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) f0 = relay.Function([x], pooled) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2)) p1 = relay.var("p1", shape=dshape) @@ -142,6 +152,7 @@ def expected(dshape): concat = relay.concatenate((upsampled, p1), axis=1) out = relay.add(concat, relay.const(1, "float32")) f1 = relay.Function([p0, p1], out) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=dshape) y = relay.Call(f0, [x]) @@ -172,10 +183,12 @@ def expected(dshape): x = relay.var("x", shape=dshape) pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) f0 = relay.Function([x], pooled) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2)) upsampled = relay.nn.upsampling(p0, scale_h=2, scale_w=2, layout="NCHW") f1 = relay.Function([p0], upsampled) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=dshape) y = relay.Call(f0, [x]) @@ -205,10 +218,12 @@ def expected(dshape): x = relay.var("p0", shape=dshape) y = relay.add(x, relay.const(1, "float32")) f1 = relay.Function([x], y) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("p01", shape=dshape) y = relay.exp(x) f2 = relay.Function([x], y) + f2 = f2.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=dshape) y = relay.Call(f1, [x]) @@ -242,6 +257,7 @@ def expected(dshape, dtype): p2 = relay.var('p2', shape=dshape, dtype=dtype) fused_gt = relay.Function([p1, p2], relay.op.greater(p1, p2)) + fused_gt = fused_gt.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) with sb.if_scope(fused_gt(x, y)): sb.ret(relay.Function([], x)) with sb.else_scope(): @@ -271,11 +287,13 @@ def expected(dim): p1 = relay.var("p1", shape=(3 * dim, dim)) matmul = relay.nn.dense(p0, p1) f0 = relay.Function([p0, p1], matmul) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p01 = relay.var("p01", shape=(1, 3 * dim)) splitted = relay.split(p01, indices_or_sections=3, axis=1) out = relay.sigmoid(splitted[0]) + relay.tanh(splitted[1]) * relay.exp(splitted[2]) f1 = relay.Function([p01], out) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) X = relay.var("X", shape=(1, dim)) W = relay.var("W", shape=(3 * dim, dim)) @@ -306,11 +324,13 @@ def expected(dim): splitted = relay.split(p0, indices_or_sections=3, axis=1) out = splitted[0] f0 = relay.Function([p0], out) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p01 = relay.var("p01", shape=(1, dim)) p1 = relay.var("p1", shape=(dim, dim)) out = relay.nn.dense(p01, p1) f1 = relay.Function([p01, p1], out) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) X = relay.var("X", shape=(1, 3 * dim)) W = relay.var("W", shape=(dim, dim)) @@ -346,8 +366,9 @@ def before(x): def expected(p0): f0 = before(p0) + f1 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=dshape) - y = relay.Call(f0, [x]) + 
y = relay.Call(f1, [x]) return relay.Function([x], y) dshape = (1, 16, 64, 64) @@ -388,15 +409,18 @@ def expected(dshape): p0 = relay.var("p0", shape=dshape) concat = gen_consecutive_tuple(p0) f0 = relay.Function([p0], concat) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p01 = relay.var("p01", shape=(1, dshape[1]*9, dshape[2], dshape[3])) pooled = relay.nn.max_pool2d(p01, pool_size=(2, 2), strides=(2, 2), padding=(0, 0)) out = relay.add(pooled, relay.const(1, "float32")) f1 = relay.Function([p01], out) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p02 = relay.var("p02", shape=(1, dshape[1]*9, dshape[2]//2, dshape[3]//2)) out = relay.add(p02, relay.const(1, "float32")) f2 = relay.Function([p02], out) + f2 = f2.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=dshape) y = relay.Call(f0, [x]) @@ -438,30 +462,36 @@ def expected(dshape): p0 = relay.var("p0", shape=dshape) c = conv(p0) f0 = relay.Function(relay.analysis.free_vars(c), c) + f0 = f0.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p01 = relay.var("p01", shape=dshape) c = conv(p01) f1 = relay.Function(relay.analysis.free_vars(c), c) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p02 = relay.var("p02", shape=dshape) p12 = relay.var("p12", shape=dshape) concat1 = relay.concatenate((p02, p12), axis=1) f_concat1 = relay.Function([p02, p12], concat1) + f_concat1 = f_concat1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) dshape2 = (dshape[0], dshape[1]*2, dshape[2], dshape[3]) p03 = relay.var("p03", shape=dshape2) c = conv(p03) f2 = relay.Function(relay.analysis.free_vars(c), c) + f2 = f2.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p04 = relay.var("p04", shape=dshape2) c = conv(p04) f3 = relay.Function(relay.analysis.free_vars(c), c) + f3 = f3.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) p05 = relay.var("p05", shape=dshape) p15 = relay.var("p15", shape=dshape) concat2 = relay.concatenate((p05, p15), axis=1) f_concat2 = relay.Function([p05, p15], concat2) + f_concat2 = f_concat2.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=dshape) c1 = relay.Call(f0, [x, relay.var("w1")]) @@ -499,6 +529,7 @@ def expected(): u = relay.transpose(y, axes=[0, 1]) w = relay.left_shift(z, u) f1 = relay.Function([x], w) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=(10, 20)) y = relay.Call(f1, [x]) return relay.Function([x], y) @@ -529,6 +560,7 @@ def expected(): z = relay.exp(y) w = relay.squeeze(z) f1 = relay.Function([x], w) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=(10, 20)) y = relay.Call(f1, [x]) mod = tvm.IRModule() @@ -570,6 +602,7 @@ def expected(): for i in range(max_fused_ops): y = relay.exp(y) f1 = relay.Function([x], y) + f1 = f1.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) x = relay.var("x", shape=(10, 20)) z = relay.Call(f1, [x]) xx = relay.var("pp", shape=(10, 20)) @@ -577,6 +610,7 @@ def expected(): for i in range(n-max_fused_ops): yy = relay.exp(yy) f2 = relay.Function([xx], yy) + f2 = f2.set_attribute("Primitive", tvm.tir.IntImm("int32", 1)) zz = relay.Call(f2, [z]) return relay.Function([x], zz) diff --git a/tests/python/relay/test_pass_merge_composite.py b/tests/python/relay/test_pass_merge_composite.py index 4f785d7c915e..4f5acc707a52 100644 --- a/tests/python/relay/test_pass_merge_composite.py +++ b/tests/python/relay/test_pass_merge_composite.py @@ -15,8 +15,8 @@ # specific language 
governing permissions and limitations # under the License. """Unit tests for merge composite.""" -from tvm import expr from tvm import relay +from tvm import tir from tvm.relay.testing import run_opt_pass """ @@ -144,6 +144,8 @@ def expected(): add_node = relay.add(in_1, in_2) relu_node = relay.nn.relu(add_node) add_relu = relay.Function([in_1, in_2], relu_node) + add_relu = add_relu.set_attribute("Primitive", tir.IntImm("int32", 1)) + add_relu = add_relu.set_attribute("Composite", tir.StringImm("add_relu")) # merged function r = relay.Call(add_relu, [a, b]) @@ -208,11 +210,27 @@ def expected(): sub_node = relay.subtract(in_1, in_2) mul_node = relay.multiply(add_node, sub_node) add_sub_mul = relay.Function([in_1, in_2], mul_node) + add_sub_mul = add_sub_mul.set_attribute("Primitive", + tir.IntImm("int32", 1)) + add_sub_mul = add_sub_mul.set_attribute("Composite", + tir.StringImm("add_sub_mul")) + + # add_sub_mul1 function + in_3 = relay.var('in_3', shape=(10, 10)) + in_4 = relay.var('in_4', shape=(10, 10)) + add_node_1 = relay.add(in_3, in_4) + sub_node_1 = relay.subtract(in_3, in_4) + mul_node_1 = relay.multiply(add_node_1, sub_node_1) + add_sub_mul_1 = relay.Function([in_3, in_4], mul_node_1) + add_sub_mul_1 = add_sub_mul_1.set_attribute("Primitive", + tir.IntImm("int32", 1)) + add_sub_mul_1 = add_sub_mul_1.set_attribute("Composite", + tir.StringImm("add_sub_mul")) # merged function - add_sub_mul_1 = relay.Call(add_sub_mul, [a, b]) - add_sub_mul_2 = relay.Call(add_sub_mul, [c, add_sub_mul_1]) - r = relay.nn.relu(add_sub_mul_2) + m_add_sub_mul_1 = relay.Call(add_sub_mul, [a, b]) + m_add_sub_mul_2 = relay.Call(add_sub_mul_1, [c, m_add_sub_mul_1]) + r = relay.nn.relu(m_add_sub_mul_2) return relay.Function([a, b, c], r) result = run_opt_pass(before(), relay.transform.MergeComposite(pattern_table)) @@ -291,6 +309,9 @@ def expected(): bias_node = relay.nn.bias_add(conv_node, in_3) r = relay.nn.relu(bias_node) conv_bias_add_relu = relay.Function([in_1, in_2, in_3], r) + conv_bias_add_relu = conv_bias_add_relu.set_attribute("Primitive", tir.IntImm("int32", 1)) + conv_bias_add_relu = conv_bias_add_relu.set_attribute("Composite", + tir.StringImm("conv2d_bias_relu")) # add_relu function in_4 = relay.var('in_4', shape=(1, 256, 28, 28)) @@ -298,6 +319,8 @@ def expected(): add_node = relay.add(in_4, in_5) r = relay.nn.relu(add_node) add_relu = relay.Function([in_4, in_5], r) + add_relu = add_relu.set_attribute("Primitive", tir.IntImm("int32", 1)) + add_relu = add_relu.set_attribute("Composite", tir.StringImm("add_relu")) # merged function conv_bias_add_relu_1 = relay.Call(conv_bias_add_relu, [data, kernel, bias]) @@ -357,7 +380,7 @@ def before(): out = relay.nn.relu(out) return relay.Function([input_1, input_2], out) - def after_A_priority(): + def after_A_priority(composite_name): input_1 = relay.var('input_1', shape=(10, 10)) input_2 = relay.var('input_2', shape=(10, 10)) x = relay.var('x') @@ -366,38 +389,12 @@ def after_A_priority(): out = relay.abs(out) out = relay.nn.relu(out) merged_func = relay.Function([x, y], out) - merged_func = merged_func.set_attribute('Primitive', expr.IntImm('int32', 1)) - merged_func = merged_func.set_attribute('Composite', expr.StringImm('A')) + merged_func = merged_func.set_attribute('Primitive', tir.IntImm('int32', 1)) + merged_func = merged_func.set_attribute('Composite', + tir.StringImm(composite_name)) ret = relay.Call(merged_func, [input_1, input_2]) return relay.Function([input_1, input_2], ret) - def after_B_priority(): - input_1 = relay.var('input_1', 
shape=(10, 10)) - input_2 = relay.var('input_2', shape=(10, 10)) - x = relay.var('x') - y = relay.var('y') - out = relay.add(x, y) - out = relay.abs(out) - merged_func = relay.Function([x, y], out) - merged_func = merged_func.set_attribute('Primitive', expr.IntImm('int32', 1)) - merged_func = merged_func.set_attribute('Composite', expr.StringImm('B')) - merged_call = relay.Call(merged_func, [input_1, input_2]) - ret = relay.nn.relu(merged_call) - return relay.Function([input_1, input_2], ret) - - def after_C_priority(): - input_1 = relay.var('input_1', shape=(10, 10)) - input_2 = relay.var('input_2', shape=(10, 10)) - add = relay.add(input_1, input_2) - x = relay.var('x') - out = relay.abs(x) - out = relay.nn.relu(out) - merged_func = relay.Function([x], out) - merged_func = merged_func.set_attribute('Primitive', expr.IntImm('int32', 1)) - merged_func = merged_func.set_attribute('Composite', expr.StringImm('C')) - ret = relay.Call(merged_func, [add]) - return relay.Function([input_1, input_2], ret) - # check A highest priority pattern_table = [ ("A", pattern_A()), @@ -406,7 +403,7 @@ def after_C_priority(): ] result = run_opt_pass(before(), relay.transform.MergeComposite(pattern_table)) assert not relay.analysis.free_vars(result) - expected = run_opt_pass(after_A_priority(), relay.transform.InferType()) + expected = run_opt_pass(after_A_priority("A"), relay.transform.InferType()) assert relay.analysis.alpha_equal(result, expected) # check B highest priority @@ -417,7 +414,7 @@ def after_C_priority(): ] result = run_opt_pass(before(), relay.transform.MergeComposite(pattern_table)) assert not relay.analysis.free_vars(result) - expected = run_opt_pass(after_A_priority(), relay.transform.InferType()) + expected = run_opt_pass(after_A_priority("B"), relay.transform.InferType()) assert relay.analysis.alpha_equal(result, expected) # check C highest priority @@ -428,7 +425,7 @@ def after_C_priority(): ] result = run_opt_pass(before(), relay.transform.MergeComposite(pattern_table)) assert not relay.analysis.free_vars(result) - expected = run_opt_pass(after_A_priority(), relay.transform.InferType()) + expected = run_opt_pass(after_A_priority("C"), relay.transform.InferType()) assert relay.analysis.alpha_equal(result, expected) @@ -459,11 +456,15 @@ def after(): y = relay.var('y') branch_1 = relay.multiply(relay.add(x, y), relay.subtract(x, y)) func_1 = relay.Function([x, y], branch_1) + func_1 = func_1.set_attribute('Primitive', tir.IntImm('int32', 1)) + func_1 = func_1.set_attribute('Composite', tir.StringImm("add_sub_mul")) call_1 = relay.Call(func_1, [input_1, input_2]) x1 = relay.var('x1') y1 = relay.var('y1') branch_2 = relay.multiply(relay.add(x1, y1), relay.subtract(x1, y1)) func_2 = relay.Function([x1, y1], branch_2) + func_2 = func_2.set_attribute('Primitive', tir.IntImm('int32', 1)) + func_2 = func_2.set_attribute('Composite', tir.StringImm("add_sub_mul")) call_2 = relay.Call(func_2, [input_1, input_2]) out = relay.multiply(call_1, call_2) return relay.Function([input_1, input_2], out) @@ -542,16 +543,16 @@ def after_A(): add_relu_1 = relay.add(x, y) add_relu_1 = relay.nn.relu(add_relu_1) add_relu_1 = relay.Function([x, y], add_relu_1) - add_relu_1 = add_relu_1.set_attribute('Primitive', expr.IntImm('int32', 1)) - add_relu_1 = add_relu_1.set_attribute('Composite', expr.StringImm('add_relu')) + add_relu_1 = add_relu_1.set_attribute('Primitive', tir.IntImm('int32', 1)) + add_relu_1 = add_relu_1.set_attribute('Composite', tir.StringImm('add_relu')) add_relu_call_1 = relay.Call(add_relu_1, 
[inputs[0], inputs[1]])
         x1 = relay.var('x1')
         y1 = relay.var('y1')
         add_relu_2 = relay.add(x1, y1)
         add_relu_2 = relay.nn.relu(add_relu_2)
         add_relu_2 = relay.Function([x1, y1], add_relu_2)
-        add_relu_2 = add_relu_2.set_attribute('Primitive', expr.IntImm('int32', 1))
-        add_relu_2 = add_relu_2.set_attribute('Composite', expr.StringImm('add_relu'))
+        add_relu_2 = add_relu_2.set_attribute('Primitive', tir.IntImm('int32', 1))
+        add_relu_2 = add_relu_2.set_attribute('Composite', tir.StringImm('add_relu'))
         add_relu_call_2 = relay.Call(add_relu_2, [inputs[2], inputs[3]])
         x2 = relay.var('x2')
         y2 = relay.var('y2')
@@ -559,8 +560,8 @@ def after_A():
         sub = relay.subtract(x2, y2)
         add_sub_mul = relay.multiply(add, sub)
         add_sub_mul = relay.Function([x2, y2], add_sub_mul)
-        add_sub_mul = add_sub_mul.set_attribute('Primitive', expr.IntImm('int32', 1))
-        add_sub_mul = add_sub_mul.set_attribute('Composite', expr.StringImm('add_sub_mul'))
+        add_sub_mul = add_sub_mul.set_attribute('Primitive', tir.IntImm('int32', 1))
+        add_sub_mul = add_sub_mul.set_attribute('Composite', tir.StringImm('add_sub_mul'))
         add_sub_mul_call = relay.Call(add_sub_mul, [add_relu_call_1, add_relu_call_2])
         return relay.Function(inputs, add_sub_mul_call)

@@ -573,8 +574,8 @@ def after_B():
             add_relu = relay.add(x, y)
             add_relu = relay.nn.relu(add_relu)
             add_relu = relay.Function([x, y], add_relu)
-            add_relu = add_relu.set_attribute('Primitive', expr.IntImm('int32', 1))
-            add_relu = add_relu.set_attribute('Composite', expr.StringImm('add_relu'))
+            add_relu = add_relu.set_attribute('Primitive', tir.IntImm('int32', 1))
+            add_relu = add_relu.set_attribute('Composite', tir.StringImm('add_relu'))
             add_relu_call = relay.Call(add_relu, [inputs[i*2], inputs[i*2+1]])
             add_relu_calls.append(add_relu_call)

@@ -606,4 +607,4 @@ def after_B():
     test_multiple_patterns()
     test_merge_order()
     test_parallel_merge()
-    test_multiple_input_subgraphs()
\ No newline at end of file
+    test_multiple_input_subgraphs()
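
A note on the attribute convention the updated tests above rely on: MergeComposite wraps
each matched subgraph in a relay.Function tagged with "Primitive" and "Composite"
attributes, which downstream passes and codegen key on. A minimal sketch of marking such
a function by hand, using exactly the API the diffs use (the shapes and the 'add_relu'
name are illustrative only):

    from tvm import relay, tir

    x = relay.var('x', shape=(10, 10))
    y = relay.var('y', shape=(10, 10))
    # the composite body: add followed by relu
    add_relu = relay.Function([x, y], relay.nn.relu(relay.add(x, y)))
    # mark it so later passes treat it as one fused, named unit
    add_relu = add_relu.set_attribute('Primitive', tir.IntImm('int32', 1))
    add_relu = add_relu.set_attribute('Composite', tir.StringImm('add_relu'))
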
From a43e326fb0250a46c61f726a4633633c2af2bf03 Mon Sep 17 00:00:00 2001
From: Baden Hughes <580499+badenh@users.noreply.github.com>
Date: Mon, 17 Feb 2020 11:56:25 +1000
Subject: [PATCH 08/73] Update faq.md (#4893)

various minor editorial updates - style, grammar, typos.
---
 docs/faq.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/faq.md b/docs/faq.md
index f070ed59a575..b5bf65eb52b0 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -26,24 +26,24 @@ See [Installation](http://docs.tvm.ai/install/)
 TVM's relation to Other IR/DSL Projects
 ---------------------------------------
 There are usually two levels of abstractions of IR in the deep learning systems.
-TensorFlow's XLA and Intel's ngraph uses computation graph representation.
+TensorFlow's XLA and Intel's ngraph both use a computation graph representation.
 This representation is high level, and can be helpful to perform generic optimizations
 such as memory reuse, layout transformation and automatic differentiation.

-TVM adopts a low level representation, that explicitly express the choice of memory
+TVM adopts a low-level representation that explicitly expresses the choice of memory
 layout, parallelization pattern, locality and hardware primitives etc.
 This level of IR is closer to directly target hardwares.
-The low level IR adopt ideas from existing image processing languages like Halide, darkroom
-and loop transformation tools like loopy and polyhedra based analysis.
-We specifically focus of expressing deep learning workloads(e.g. recurrence),
+The low-level IR adopts ideas from existing image processing languages like Halide, darkroom
+and loop transformation tools like loopy and polyhedra-based analysis.
+We specifically focus on expressing deep learning workloads (e.g. recurrence),
 optimization for different hardware backends and embedding with
 frameworks to provide end-to-end compilation stack.

-TVM's relation to libDNN cuDNN
+TVM's relation to libDNN, cuDNN
 ------------------------------
-TVM can incorporate these library as external calls. One goal of TVM is to be able to
-generate high performing kernels. We will evolve TVM an incremental manner as
-we learn from the technics of manual kernel crafting and add these as primitives in DSL.
+TVM can incorporate these libraries as external calls. One goal of TVM is to be able to
+generate high-performing kernels. We will evolve TVM in an incremental manner as
+we learn from the techniques of manual kernel crafting and add these as primitives in DSL.
 See also [TVM Operator Inventory](https://github.com/apache/incubator-tvm/tree/master/topi) for
 recipes of operators in TVM.

From 13140916eeb6fc33b962f3faf9dbe6b702057865 Mon Sep 17 00:00:00 2001
From: Alex Gladkov
Date: Mon, 17 Feb 2020 09:22:11 -0800
Subject: [PATCH 09/73] Fast exponent (#4790)

---
 topi/include/topi/elemwise.h        | 80 +++++++++++++++++++++++++++++
 topi/python/topi/math.py            | 16 ++++++
 topi/src/topi.cc                    |  5 ++
 topi/tests/python/test_topi_math.py | 38 ++++++++++++++
 4 files changed, 139 insertions(+)

diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h
index e3f4678c1163..e35e3e424d6e 100644
--- a/topi/include/topi/elemwise.h
+++ b/topi/include/topi/elemwise.h
@@ -377,5 +377,85 @@ inline Tensor full_like(const Tensor& x,
   }, name, tag);
 }

+/*!
+ * \brief Fast exponential function implementation
+ *
+ * \param _x The input tensor
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the exponential operation
+ *
+ * \note The function computes:
+ *   log2(e^x) = x * log2(e), hence e^x = 2^(x * log2(e)).
+ *   Split the power x * log2(e) into an integer part n and a remainder f:
+ *   n = floor(x * log2(e) + 1/2)
+ *   f = x - n * ln(2)
+ *   e^x = 2^n * e^f
+ *   The factor e^f is approximated by a degree-6 polynomial in f
+ *   with Cephes-style coefficients p[0]..p[5]:
+ *   e^f ~ (((((p[0]*f + p[1])*f + p[2])*f + p[3])*f + p[4])*f + p[5])*f*f + f + 1
+ */
+inline Tensor fast_exp_float32(const Tensor& _x,
+                               std::string name,
+                               std::string tag) {
+  auto x_hi = make_const(DataType::Float(32), 88.3762626647950f);
+  auto x_lo = make_const(DataType::Float(32), -88.3762626647949f);
+  auto log2e = make_const(DataType::Float(32), 1.44269504088896341f);
+  auto ln2 = make_const(DataType::Float(32), 0.6931471805599453f);
+  PrimExpr p[6] = {make_const(DataType::Float(32), 1.9875691500E-4f),
+                   make_const(DataType::Float(32), 1.3981999507E-3f),
+                   make_const(DataType::Float(32), 8.3334519073E-3f),
+                   make_const(DataType::Float(32), 4.1665795894E-2f),
+                   make_const(DataType::Float(32), 1.6666665459E-1f),
+                   make_const(DataType::Float(32), 5.0000001201E-1f)};
+  auto one = make_const(DataType::Float(32), 1.0f);
+  auto one_half = make_const(DataType::Float(32), 0.5f);
+  auto b = make_const(DataType::Float(32), 127.0f);
+
+  return compute(_x->shape,
+                 [&](const Array<Var>& i) {
+                   // clamp x
+                   auto x = ::tvm::max(::tvm::min(_x(i), x_hi), x_lo);
+                   // integer part
+                   auto n = ::tvm::floor(x * log2e + one_half);
+                   // fractional part
+                   auto f = x - n * ln2;
+                   auto y = (((((p[0] * f + p[1]) * f + p[2]) * f +
p[3]) * f + p[4]) * f +
+                             p[5]) * f * f + f + one;
+                   // Return 2^n * exp(f).
+                   auto ef = tvm::reinterpret(DataType::Float(32),
+                                              ::tvm::cast(DataType::Int(32), n + b) << 23);
+                   return ::tvm::max(ef * y, _x(i));  // NOLINT(*)
+                 },
+                 name, tag);
+}
+
+
+/*!
+ * \brief Fast exponential function implementation
+ *
+ * \param x The input tensor
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the exponential operation
+ *
+ */
+inline Tensor fast_exp(const Tensor& x,
+                       std::string name = "T_fast_exp",
+                       std::string tag = kElementWise) {
+  if (x->dtype == DataType::Float(32)) {
+    auto ret = fast_exp_float32(x, name, tag);
+    return ret;
+  } else {
+    return compute(x->shape, [&](const Array<Var>& i) {
+      return ::tvm::exp(x(i));
+    }, name, tag);
+  }
+}
+
 }  // namespace topi
 #endif  // TOPI_ELEMWISE_H_
diff --git a/topi/python/topi/math.py b/topi/python/topi/math.py
index c3e1a102471e..148d53a54cfe 100644
--- a/topi/python/topi/math.py
+++ b/topi/python/topi/math.py
@@ -451,3 +451,19 @@ def reinterpret(x, dtype):
         The result.
     """
     return cpp.reinterpret(x, dtype)
+
+
+def fast_exp(x):
+    """Take exponential of input x using fast_exp implementation
+
+    Parameters
+    ----------
+    x : tvm.Tensor
+        Input argument.
+
+    Returns
+    -------
+    y : tvm.Tensor
+        The result.
+    """
+    return cpp.fast_exp(x, x.dtype, tag.ELEMWISE)
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 2b2142bb5759..a7b916093d98 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -165,6 +165,11 @@ TVM_REGISTER_GLOBAL("topi.exp")
     *rv = exp(args[0]);
   });

+TVM_REGISTER_GLOBAL("topi.fast_exp")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    *rv = fast_exp(args[0]);
+  });
+
 TVM_REGISTER_GLOBAL("topi.erf")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
     *rv = erf(args[0]);
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index bb674364ff2e..5bb95ba10e3b 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -185,7 +185,45 @@ def verify(from_dtype, to_dtype, low=-100, high=100):
     verify("bool", "int32")


+def test_fastmath():
+    def test_apply(
+            func,
+            name,
+            f_numpy,
+            low,
+            high,
+            step,
+            dtype=tvm.float32
+    ):
+        a_np = np.arange(low, high, step).astype(dtype)
+        b_np = f_numpy(a_np)
+        A = tvm.placeholder(a_np.shape, dtype=dtype, name="A")
+        B = func(A)
+        assert tuple(B.shape) == tuple(A.shape)
+
+        def check_device(device):
+            ctx = tvm.context(device, 0)
+            if not ctx.exist:
+                print("Skip because %s is not enabled" % device)
+                return
+            with tvm.target.create(device):
+                s = topi.generic.schedule_injective(B)
+            func = tvm.build(s, [A, B], device, name=name)
+            a = tvm.nd.array(a_np, ctx)
+            b = tvm.nd.array(np.zeros_like(b_np), ctx)
+            func(a, b)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+
+        check_device('llvm')
+        check_device('llvm -device=arm-cpu')
+
+    test_apply(topi.fast_exp, "fast_exp", np.exp,
+               low=-88, high=88,
+               step=0.01)
+
 if __name__ == "__main__":
     test_util()
     test_ewise()
     test_cast()
+    test_fastmath()
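
For readers who want to sanity-check the fast-exp kernel above outside of TVM, here is a
minimal NumPy sketch of the same decomposition. The name fast_exp_ref and the tolerances
are illustrative, not part of the patch; it mirrors the math only, omitting the final
max() the C++ version applies:

    import numpy as np

    def fast_exp_ref(x):
        # clamp to the range where float32 exp neither overflows nor underflows
        x = np.clip(np.asarray(x, dtype=np.float32),
                    -88.3762626647949, 88.3762626647950)
        log2e = np.float32(1.44269504088896341)
        ln2 = np.float32(0.6931471805599453)
        # round x * log2(e) to the nearest integer n, so exp(x) = 2^n * exp(f)
        n = np.floor(x * log2e + np.float32(0.5))
        f = x - n * ln2
        # degree-6 polynomial approximation of exp(f) on |f| <= ln(2)/2
        p = [np.float32(c) for c in (1.9875691500e-4, 1.3981999507e-3,
                                     8.3334519073e-3, 4.1665795894e-2,
                                     1.6666665459e-1, 5.0000001201e-1)]
        y = ((((((p[0] * f + p[1]) * f + p[2]) * f + p[3]) * f + p[4]) * f
              + p[5]) * f * f + f + np.float32(1))
        # build 2^n by writing (n + 127) into the float32 exponent bits
        two_n = ((n + np.float32(127)).astype(np.int32) << 23).view(np.float32)
        return two_n * y

    x = np.arange(-88, 88, 0.01, dtype=np.float32)
    np.testing.assert_allclose(fast_exp_ref(x), np.exp(x), rtol=1e-5, atol=1e-5)

The bit trick in the last step is the same one the kernel uses: for a normal float32,
the exponent field stores n + 127, so shifting that biased exponent into bits 23..30
yields exactly 2^n without calling pow.
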
From 0b2d11a5745779ec139a05e8ece73c93fa6d7db8 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 17 Feb 2020 10:53:05 -0800
Subject: [PATCH 10/73] [DOCS] Introduce how to add hardware backend to FAQ
 (#4898)

---
 docs/api/python/target.rst                |  2 +-
 docs/dev/relay_bring_your_own_codegen.rst |  2 +
 docs/faq.md                               | 49 -----------------
 docs/faq.rst                              | 64 +++++++++++++++++++++++
 docs/install/index.rst                    |  2 +
 docs/vta/index.rst                        |  4 +-
 tutorials/autotvm/README.txt              |  5 +-
 tutorials/language/tensorize.py           |  2 +
 8 files changed, 77 insertions(+), 53 deletions(-)
 delete mode 100644 docs/faq.md
 create mode 100644 docs/faq.rst

diff --git a/docs/api/python/target.rst b/docs/api/python/target.rst
index 6851c04c5b6b..625b98e9de43 100644
--- a/docs/api/python/target.rst
+++ b/docs/api/python/target.rst
@@ -19,4 +19,4 @@ tvm.target
 ----------
 .. automodule:: tvm.target
     :members:
-    :imported-members:
+    :imported-members:
diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst
index b735bb809e08..3e3369dba52f 100644
--- a/docs/dev/relay_bring_your_own_codegen.rst
+++ b/docs/dev/relay_bring_your_own_codegen.rst
@@ -15,6 +15,8 @@
    specific language governing permissions and limitations
    under the License.

+.. _relay-bring-your-own-codegen:
+
 =============================
 Bring Your Own Codegen To TVM
 =============================
diff --git a/docs/faq.md b/docs/faq.md
deleted file mode 100644
index b5bf65eb52b0..000000000000
--- a/docs/faq.md
+++ /dev/null
@@ -1,49 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Frequently Asked Questions
-==========================
-This document contains frequently asked questions.
-
-How to Install
---------------
-See [Installation](http://docs.tvm.ai/install/)
-
-TVM's relation to Other IR/DSL Projects
----------------------------------------
-There are usually two levels of abstractions of IR in the deep learning systems.
-TensorFlow's XLA and Intel's ngraph both use a computation graph representation.
-This representation is high level, and can be helpful to perform generic optimizations
-such as memory reuse, layout transformation and automatic differentiation.
-
-TVM adopts a low-level representation that explicitly expresses the choice of memory
-layout, parallelization pattern, locality and hardware primitives etc.
-This level of IR is closer to directly target hardwares.
-The low-level IR adopts ideas from existing image processing languages like Halide, darkroom
-and loop transformation tools like loopy and polyhedra-based analysis.
-We specifically focus on expressing deep learning workloads (e.g. recurrence),
-optimization for different hardware backends and embedding with
-frameworks to provide end-to-end compilation stack.
-
-
-TVM's relation to libDNN, cuDNN
-------------------------------
-TVM can incorporate these libraries as external calls. One goal of TVM is to be able to
-generate high-performing kernels. We will evolve TVM in an incremental manner as
-we learn from the techniques of manual kernel crafting and add these as primitives in DSL.
-See also [TVM Operator Inventory](https://github.com/apache/incubator-tvm/tree/master/topi) for
-recipes of operators in TVM.
diff --git a/docs/faq.rst b/docs/faq.rst
new file mode 100644
index 000000000000..32714b0813dd
--- /dev/null
+++ b/docs/faq.rst
@@ -0,0 +1,64 @@
+..  Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+..  http://www.apache.org/licenses/LICENSE-2.0
+
+..  Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+
+Frequently Asked Questions
+==========================
+
+
+How to Install
+--------------
+See :ref:`installation`.
+
+
+How to add a new Hardware Backend
+---------------------------------
+
+- If the hardware backend has LLVM support,
+  then we can directly generate the code by setting the correct target triple as in :py:mod:`~tvm.target`.
+- If the target hardware is a GPU, try to use the cuda, opencl or vulkan backend.
+- If the target hardware is a special accelerator,
+  check out :ref:`vta-index` and :ref:`relay-bring-your-own-codegen`.
+- For all of the above cases, you may want to add target-specific
+  optimization templates using AutoTVM, see :ref:`tutorials-autotvm-sec`.
+- Besides using LLVM's vectorization, we can also embed micro-kernels to leverage hardware intrinsics,
+  see :ref:`tutorials-tensorize`.
+
+
+TVM's relation to Other IR/DSL Projects
+---------------------------------------
+There are usually two levels of abstractions of IR in the deep learning systems.
+TensorFlow's XLA and Intel's ngraph both use a computation graph representation.
+This representation is high level, and can be helpful to perform generic optimizations
+such as memory reuse, layout transformation and automatic differentiation.
+
+TVM adopts a low-level representation that explicitly expresses the choice of memory
+layout, parallelization pattern, locality and hardware primitives etc.
+This level of IR is closer to directly targeting hardware.
+The low-level IR adopts ideas from existing image processing languages like Halide, darkroom
+and loop transformation tools like loopy and polyhedra-based analysis.
+We specifically focus on expressing deep learning workloads (e.g. recurrence),
+optimization for different hardware backends and embedding with frameworks to provide
+end-to-end compilation stack.
+
+
+TVM's relation to libDNN, cuDNN
+-------------------------------
+TVM can incorporate these libraries as external calls. One goal of TVM is to be able to
+generate high-performing kernels. We will evolve TVM in an incremental manner as
+we learn from the techniques of manual kernel crafting and add these as primitives in DSL.
+See also topi for recipes of operators in TVM.
diff --git a/docs/install/index.rst b/docs/install/index.rst
index f1caec14e68b..5f739418add3 100644
--- a/docs/install/index.rst
+++ b/docs/install/index.rst
@@ -15,6 +15,8 @@
    specific language governing permissions and limitations
    under the License.

+.. _installation:
+
 Installation
 ============
 To install TVM, please read :ref:`install-from-source`.
diff --git a/docs/vta/index.rst b/docs/vta/index.rst
index b719e2ded962..d31e377e5de3 100644
--- a/docs/vta/index.rst
+++ b/docs/vta/index.rst
@@ -15,6 +15,8 @@
    specific language governing permissions and limitations
    under the License.

+.. _vta-index:
+
 VTA: Deep Learning Accelerator Stack
 ====================================

@@ -50,4 +52,4 @@ Literature
 - Read the VTA tech report: `An Open Hardware Software Stack for Deep Learning`_.

 .. _release blog post: https://tvm.apache.org/2018/07/12/vta-release-announcement
-.. _An Open Hardware Software Stack for Deep Learning: https://arxiv.org/abs/1807.04188
\ No newline at end of file
+.. _An Open Hardware Software Stack for Deep Learning: https://arxiv.org/abs/1807.04188
diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt
index c511381dd57d..38e3b3343f4e 100644
--- a/tutorials/autotvm/README.txt
+++ b/tutorials/autotvm/README.txt
@@ -1,3 +1,4 @@
-Auto tuning
--------------
+.. _tutorials-autotvm-sec:

+Auto tuning
+-----------
diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py
index 7fb2d5368c9c..afc708e5d1d3 100644
--- a/tutorials/language/tensorize.py
+++ b/tutorials/language/tensorize.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 """
+.. _tutorials-tensorize:
+
 Use Tensorize to Leverage Hardware Intrinsics
 =============================================
 **Author**: `Yizhi Liu `_
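
The first bullet of the new FAQ entry above is worth a concrete illustration. A minimal
sketch of targeting an LLVM-supported backend by picking the target triple; the triple
and the one-line workload are illustrative, while the API calls match this era of TVM:

    import tvm

    n = 1024
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B')
    s = tvm.create_schedule(B.op)
    # the LLVM target triple selects the backend, e.g. 64-bit ARM Linux;
    # no new codegen is needed as long as LLVM knows the triple
    f = tvm.build(s, [A, B], target='llvm -target=aarch64-linux-gnu')
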
From 27a02844cb52e883a4a66da68a527590d76f7d01 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Mon, 17 Feb 2020 12:18:15 -0800
Subject: [PATCH 11/73] [Relay][Pass] Fix bug in re-processing call node in
 MergeComposite pass (#4879)

* Fix bug in re-processing call node

* Add test

* Add to main

* temp changes to work from another machine

* fix rest of tests

* fix test_reuse_call_merge

* fix merge

Co-authored-by: Jon Soifer
---
 src/relay/pass/merge_composite.cc             | 25 ++++--
 .../python/relay/test_pass_merge_composite.py | 82 +++++++++++++++++++
 2 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/src/relay/pass/merge_composite.cc b/src/relay/pass/merge_composite.cc
index 28bf8fa8c33a..4e1094b617e9 100644
--- a/src/relay/pass/merge_composite.cc
+++ b/src/relay/pass/merge_composite.cc
@@ -87,7 +87,7 @@ class MergeCompositeWrapper : public ExprMutator {
    *  a new Relay expression ready to be wrapped into a composite function.
    */
   Expr ExtractPattern(const Call& pattern, const Call& root,
-                      Map<std::string, Array<Expr>>* var_map) {
+                      Map<std::string, Array<Expr>>* var_map, Map<Expr, Expr>* call_map) {
     // check to make sure both calls are to operators (not functions)
     if (!pattern->op->IsInstance<OpNode>() || !root->op->IsInstance<OpNode>())
       return Expr();
@@ -99,14 +99,20 @@ class MergeCompositeWrapper : public ExprMutator {
     for (const auto& arg : pattern->args) {
       Expr new_arg;
       if (arg->IsInstance<CallNode>()) {
-        // fail if the root argument is not also a call node
-        if (!root->args[i]->IsInstance<CallNode>()) {
-          return Expr();
+        // if we've already processed this call node, return the previous result
+        if (call_map->find(arg) != call_map->end()) {
+          new_arg = (*call_map)[arg];
+        } else {
+          // fail if the root argument is not also a call node
+          if (!root->args[i]->IsInstance<CallNode>()) {
+            return Expr();
+          }
+          // if it's a call node, recursively call this function
+          new_arg = ExtractPattern(Downcast<Call>(arg),
+                                   Downcast<Call>(root->args[i]),
+                                   var_map, call_map);
+          call_map->Set(arg, new_arg);
         }
-        // if it's a call node, recursively call this function
-        new_arg = ExtractPattern(Downcast<Call>(arg),
-                                 Downcast<Call>(root->args[i]),
-                                 var_map);
       } else if (arg->IsInstance<VarNode>()) {
         // if there's a var in the pattern, it must be a free var
         // so call the function to update the var_map
@@ -155,7 +161,8 @@ class MergeCompositeWrapper : public ExprMutator {
     Call pattern = Downcast<Call>(pattern_);
     CHECK(pattern.defined());
     Map<std::string, Array<Expr>> args_map;
-    auto extract = ExtractPattern(pattern, call, &args_map);
+    Map<Expr, Expr> call_map;
+    auto extract = ExtractPattern(pattern, call, &args_map, &call_map);
     if (extract.defined()) {
       auto free_vars = FreeVars(extract);
       // make the composite function
diff --git a/tests/python/relay/test_pass_merge_composite.py b/tests/python/relay/test_pass_merge_composite.py
index 4f5acc707a52..b96a89b1f483 100644
--- a/tests/python/relay/test_pass_merge_composite.py
+++ b/tests/python/relay/test_pass_merge_composite.py
@@ -110,6 +110,26 @@ def make_conv_bias_relu_pattern():
     return r


+def make_add_add_add_pattern():
+    """Create a pattern to match the following graph.
+       Useful for testing re-using a call node.
+
+        x    y
+       /  \ /
+       |  add
+        \  |  \
+        add    |
+          |   /
+          add
+    """
+    x = relay.var('x')
+    y = relay.var('y')
+    add_node = relay.add(x, y)
+    add_node_1 = relay.add(x, add_node)
+    r = relay.add(add_node_1, add_node)
+    return r
+
+
 def test_simple_merge():
     """Test composite function is correctly produced from simple graph.

@@ -239,6 +259,67 @@ def expected():
     assert relay.analysis.alpha_equal(result, expected)


+def test_reuse_call_merge():
+    """Test composite function is correctly produced from simple graph
+       which re-uses call nodes.
+
+    We would expect the pattern `make_add_add_add` to be merged
+    into a single op `add_add_add`.
+
+        x     y
+         \   / \
+          sub   |        x     y
+         /  |  /          \   /  |
+        |   add    ====>   sub   |
+         \  |  \            |   /
+         add    |       add_add_add
+           |   /
+           add
+
+    """
+    pattern_table = [
+        ("add_add_add", make_add_add_add_pattern())
+    ]
+
+    def before():
+        a = relay.var('a', shape=(10, 10))
+        b = relay.var('b', shape=(10, 10))
+        sub_node = relay.subtract(a, b)
+
+        # pattern
+        add_node = relay.add(sub_node, b)
+        add_node_1 = relay.add(sub_node, add_node)
+        r = relay.add(add_node_1, add_node)
+
+        return relay.Function([a, b], r)
+
+    def expected():
+        a = relay.var('a', shape=(10, 10))
+        b = relay.var('b', shape=(10, 10))
+
+        # add_add_add function
+        in_1 = relay.var('in_1', shape=(10, 10))
+        in_2 = relay.var('in_2', shape=(10, 10))
+        add_node = relay.add(in_1, in_2)
+        add_node_1 = relay.add(in_1, add_node)
+        add_node_2 = relay.add(add_node_1, add_node)
+        add_add_add = relay.Function([in_1, in_2], add_node_2)
+        add_add_add = add_add_add.set_attribute("Primitive",
+                                                tir.IntImm("int32", 1))
+        add_add_add = add_add_add.set_attribute("Composite",
+                                                tir.StringImm("add_add_add"))
+
+        # merged function
+        sub_node = relay.subtract(a, b)
+        call = relay.Call(add_add_add, [sub_node, b])
+        return relay.Function([a, b], call)
+
+    result = run_opt_pass(before(), relay.transform.MergeComposite(pattern_table))
+    assert not relay.analysis.free_vars(result)
+    expected = run_opt_pass(expected(), relay.transform.InferType())
+    assert relay.analysis.alpha_equal(result, expected)
+
+
 def test_multiple_patterns():
     """Test different patterns are merged correctly in the graph.
@@ -608,3 +689,4 @@ def after_B():
     test_merge_order()
     test_parallel_merge()
     test_multiple_input_subgraphs()
+    test_reuse_call_merge()
\ No newline at end of file
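
The essence of the call_map fix above, in a self-contained Python sketch: without a memo
table, ExtractPattern would visit a shared call node once per consumer and duplicate it;
with the table, the shared node is rewritten exactly once and sharing is preserved. The
Node class and rewrite() below are illustrative stand-ins, not TVM APIs:

    # memoized DAG rewrite: cache each node's result so shared subtrees stay shared
    class Node:
        def __init__(self, op, *args):
            self.op, self.args = op, args

    def rewrite(node, memo):
        # return the cached result if this node was already processed
        if id(node) in memo:
            return memo[id(node)]
        result = Node(node.op, *[rewrite(a, memo) if isinstance(a, Node) else a
                                 for a in node.args])
        memo[id(node)] = result
        return result

    # the add_add_add shape from the test: `add` feeds two consumers
    x, y = "x", "y"
    add = Node("add", x, y)
    add_1 = Node("add", x, add)
    top = Node("add", add_1, add)

    new_top = rewrite(top, {})
    # sharing preserved: both consumers see the same rewritten `add`
    assert new_top.args[1] is new_top.args[0].args[1]
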
From 08338dd5f8089b4fbf61ae8a63f02277dfcca713 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Mon, 17 Feb 2020 18:47:09 -0800
Subject: [PATCH 12/73] [REFACTOR][PY] Establish tvm.te and tvm.driver (#4900)

- Move the related files to tvm.te

- Move build_module.py to tvm.driver
---
 python/tvm/__init__.py                        |  27 +-
 python/tvm/api.py                             | 610 +-----------------
 python/tvm/arith.py                           |   5 +-
 python/tvm/autotvm/feature.py                 |   3 +-
 python/tvm/autotvm/task/topi_integration.py   |   7 +-
 python/tvm/driver/__init__.py                 |  18 +
 python/tvm/{ => driver}/build_module.py       | 272 +-------
 python/tvm/error.py                           |   2 +-
 python/tvm/hybrid/__init__.py                 |   2 +-
 python/tvm/hybrid/parser.py                   |  13 +-
 python/tvm/hybrid/util.py                     |   2 +-
 python/tvm/ir/expr.py                         |  24 +-
 python/tvm/relay/backend/_backend.py          |   7 +-
 python/tvm/relay/op/op.py                     |   2 +-
 python/tvm/relay/quantize/_calibrate.py       |   1 +
 python/tvm/target/__init__.py                 |   1 +
 python/tvm/target/build_config.py             | 254 ++++++++
 python/tvm/te/__init__.py                     |  27 +
 python/tvm/te/_ffi_api.py                     |  21 +
 python/tvm/te/operation.py                    | 406 ++++++++++++
 python/tvm/{ => te}/schedule.py               |  86 +--
 python/tvm/{ => te}/tag.py                    |   2 +-
 python/tvm/{ => te}/tensor.py                 |  20 +-
 python/tvm/{ => te}/tensor_intrin.py          |  40 +-
 python/tvm/testing.py                         |   5 +
 python/tvm/tir/__init__.py                    |   4 +-
 python/tvm/tir/expr.py                        |  51 ++
 python/tvm/tir/op.py                          | 188 +++++-
 src/api/api_arith.cc                          |   8 +-
 src/api/api_lang.cc                           | 110 ++--
 src/api/api_schedule.cc                       |   4 +-
 src/api/api_test.cc                           |  14 +-
 src/target/target.cc                          |  10 +-
 tests/python/unittest/test_lang_buffer.py     |   4 +-
 .../python/unittest/test_lang_constructor.py  |   4 +-
 tests/python/unittest/test_runtime_error.py   |  13 +-
 .../unittest/test_runtime_packed_func.py      |   4 +-
 tests/python/unittest/test_runtime_rpc.py     |   3 +-
 vta/python/vta/build_module.py                |   4 +-
 39 files changed, 1206 insertions(+), 1072 deletions(-)
 create mode 100644 python/tvm/driver/__init__.py
 rename python/tvm/{ => driver}/build_module.py (61%)
 create mode 100644 python/tvm/target/build_config.py
 create mode 100644 python/tvm/te/__init__.py
 create mode 100644 python/tvm/te/_ffi_api.py
 create mode 100644 python/tvm/te/operation.py
 rename python/tvm/{ => te}/schedule.py (86%)
 rename python/tvm/{ => te}/tag.py (98%)
 rename python/tvm/{ => te}/tensor.py (93%)
 rename python/tvm/{ => te}/tensor_intrin.py (81%)

diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index 7d9cc1a918d1..65cb67266de6 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -47,25 +47,30 @@
 # tvm.target
 from . import target
+from .target import build_config

-# others
-from . import tensor
-from . import arith
-from . import make
-from . import schedule
-from . import hybrid
+# tvm.te
+from .te import decl_tensor_intrin, create_schedule, tag_scope
+
+# tvm.testing
 from . import testing

-from .api import *
-from .tensor_intrin import decl_tensor_intrin
-from .schedule import create_schedule
-from .build_module import build, lower, build_config
-from .tag import tag_scope
+# tvm.driver
+from .driver import build, lower
+
+# tvm.hybrid
+from . import hybrid
+
+# others
+from . import arith

 # backward compact for topi, to be removed later
+from .api import *
 from .tir import expr, stmt, ir_builder, ir_pass, generic
+from .te import tensor, schedule
 from .tir.op import *
 from . import intrin
+from . 
import make # Contrib initializers from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel diff --git a/python/tvm/api.py b/python/tvm/api.py index 3a8eedcf8e2e..9afaf03ee255 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -16,623 +16,23 @@ # under the License. """Functions defined in TVM.""" # pylint: disable=invalid-name,unused-import,redefined-builtin -from numbers import Integral as _Integral - import tvm._ffi import tvm.ir +import tvm.tir from tvm.runtime import convert, const, DataType -from tvm.ir import container as _container -from tvm.tir import expr as _expr -from tvm.tir import stmt as _stmt +from tvm.ir import container as _container, Range from tvm.tir import decl_buffer, layout, bijective_layout -from tvm.tir import min_value, max_value, indexdiv, indexmod -import tvm.tir._ffi_api +from tvm.tir import min_value, max_value, indexdiv, indexmod, all, any +from tvm.te import placeholder, compute, scan, extern, var, size_var, thread_axis, reduce_axis + from ._ffi.base import string_types, TVMError from ._ffi.registry import register_func, get_global_func, extract_ext_funcs -from . import _api_internal from . import make as _make -from . import tensor as _tensor -from . import schedule as _schedule -from . import tag as _tag int8 = "int8" int32 = "int32" float32 = "float32" handle = "handle" - - -def var(name="tindex", dtype=int32): - """Create a new variable with specified name and dtype - - Parameters - ---------- - name : str - The name - - dtype : str - The data type - - Returns - ------- - var : Var - The result symbolic variable. - """ - return _expr.Var(name, dtype) - - -def size_var(name="size", dtype=int32): - """Create a new variable represents a tensor shape size, which is non-negative. - - Parameters - ---------- - name : str - The name - - dtype : str - The data type - - Returns - ------- - var : SizeVar - The result symbolic shape variable. - """ - return _expr.SizeVar(name, dtype) - - -def any(*args): - """Create a new experssion of the union of all conditions in the arguments - - Parameters - ---------- - args : list - List of symbolic boolean expressions - - Returns - ------- - expr: Expr - Expression - """ - if not args: - raise ValueError("Any must take at least 1 argument") - if len(args) == 1: - return args[0] - ret = tvm.tir._ffi_api._OpOr(args[0], args[1]) - for i in range(2, len(args)): - ret = tvm.tir._ffi_api._OpOr(ret, args[i]) - return ret - - -def all(*args): - """Create a new experssion of the intersection of all conditions in the - arguments - - Parameters - ---------- - args : list - List of symbolic boolean expressions - - Returns - ------- - expr: Expr - Expression - """ - if not args: - raise ValueError("Any must take at least 1 argument") - if len(args) == 1: - return args[0] - ret = tvm.tir._ffi_api._OpAnd(args[0], args[1]) - for i in range(2, len(args)): - ret = tvm.tir._ffi_api._OpAnd(ret, args[i]) - return ret - - -def placeholder(shape, dtype=None, name="placeholder"): - """Construct an empty tensor object. 
- - Parameters - ---------- - shape: Tuple of Expr - The shape of the tensor - - dtype: str, optional - The data type of the tensor - - name: str, optional - The name hint of the tensor - - Returns - ------- - tensor: Tensor - The created tensor - """ - shape = (shape,) if isinstance(shape, _expr.PrimExpr) else shape - dtype = float32 if dtype is None else dtype - return _api_internal._Placeholder( - shape, dtype, name) - - -def compute(shape, fcompute, name="compute", tag="", attrs=None): - """Construct a new tensor by computing over the shape domain. - - The compute rule is result[axis] = fcompute(axis) - - Parameters - ---------- - shape: Tuple of Expr - The shape of the tensor - - fcompute: lambda function of indices-> value - Specifies the input source expression - - name: str, optional - The name hint of the tensor - - tag: str, optional - Additional tag information about the compute. - - attrs: dict, optional - The additional auxiliary attributes about the compute. - - Returns - ------- - tensor: Tensor - The created tensor - """ - if _tag.TagScope.get_current() is not None: - if tag != "": - raise ValueError("nested tag is not allowed for now") - tag = _tag.TagScope.get_current().tag - shape = (shape,) if isinstance(shape, _expr.PrimExpr) else shape - # for python3 - shape = tuple([int(s) if isinstance(s, float) else s for s in shape]) - ndim = len(shape) - code = fcompute.__code__ - - out_ndim = ndim - if code.co_argcount == 0: - arg_names = ["i%d" % i for i in range(ndim)] - else: - arg_names = code.co_varnames[:code.co_argcount] - out_ndim = code.co_argcount - - if out_ndim != len(arg_names): - raise ValueError("fcompute do not match dimension, ndim=%d" % ndim) - - dim_var = [_IterVar((0, s), x, 0) for x, s in zip(arg_names, shape[:out_ndim])] - body = fcompute(*[v.var for v in dim_var]) - - if isinstance(body, _tensor.TensorIntrinCall): - for i, s in enumerate(shape[out_ndim:]): - var_name = "ax" + str(i) - dim_var.append(_IterVar((0, s), var_name, 4)) - op_node = _api_internal._TensorComputeOp(name, - tag, - dim_var, - body.reduce_axis, - out_ndim, - body.intrin, - body.tensors, - body.regions, - body.scalar_inputs) - else: - if not isinstance(body, (list, tuple)): - body = [body] - body = convert(body) - op_node = _api_internal._ComputeOp( - name, tag, attrs, dim_var, body) - - num = op_node.num_outputs - outputs = tuple(op_node.output(i) for i in range(num)) - return outputs[0] if num == 1 else outputs - - -def scan(init, update, state_placeholder, inputs=None, name="scan", tag="", attrs=None): - """Construct new tensors by scanning over axis. - - Parameters - ---------- - init: Tensor or list of Tensor - The initial condition of first init.shape[0] timestamps - - update: Tensor or list of Tensor - The update rule of the scan given by symbolic tensor. - - state_placeholder: Tensor or list of Tensor - The placeholder variables used by update. - - inputs: Tensor or list of Tensor, optional - The list of inputs to the scan. This is not required, but can - be useful for the compiler to detect scan body faster. - - name: str, optional - The name hint of the tensor - - tag: str, optional - Additonal tag information about the compute. - - attrs: dict, optional - The additional auxiliary attributes about the compute. - - Returns - ------- - tensor: Tensor or list of Tensors - The created tensor or tuple of tensors it it contains multiple outputs. - - Example - ------- - .. 
code-block:: python - - # The following code is equivalent to numpy.cumsum - m = tvm.var("m") - n = tvm.var("n") - X = tvm.placeholder((m, n), name="X") - s_state = tvm.placeholder((m, n)) - s_init = tvm.compute((1, n), lambda _, i: X[0, i]) - s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) - res = tvm.scan(s_init, s_update, s_state, X) - """ - if _tag.TagScope.get_current() is not None: - if tag != "": - raise ValueError("nested tag is not allowed for now") - tag = _tag.TagScope.get_current().tag - if isinstance(init, _tensor.Tensor): - init = [init] - if isinstance(update, _tensor.Tensor): - update = [update] - if isinstance(state_placeholder, _tensor.Tensor): - state_placeholder = [state_placeholder] - if isinstance(inputs, _tensor.Tensor): - inputs = [inputs] - if inputs is None: - inputs = [] - if len(init) != len(update) or len(init) != len(state_placeholder): - raise ValueError("init, update, state_placeholder must have same length") - axis = _IterVar((init[0].shape[0], update[0].shape[0]), "%s.idx" % name, 3) - op = _api_internal._ScanOp(name, tag, attrs, - axis, init, update, - state_placeholder, inputs) - res = [op.output(i) for i in range(len(update))] - return res[0] if len(res) == 1 else res - - -def extern(shape, - inputs, - fcompute, - name="extern", - dtype=None, - in_buffers=None, - out_buffers=None, - tag="", - attrs=None): - """Compute several tensor via extern function. - - Parameters - ---------- - shape: tuple or list of tuples. - The shape of the outputs. - - inputs: list of Tensor - The inputs - - fcompute: lambda function of inputs, outputs-> stmt - Specifies the IR statement to do the computation. - See the following note for function signature of fcompute - - .. note:: - **Parameters** - - - **ins** (list of :any:`Buffer`) - Placeholder for each inputs - - **outs** (list of :any:`Buffer`) - Placeholder for each outputs - - **Returns** - - - **stmt** (:any:`Stmt`) - The statement that carries out array computation. - - name: str, optional - The name hint of the tensor - - dtype: str or list of str, optional - The data types of outputs, - by default dtype will be same as inputs. - - in_buffers: Buffer or list of Buffer, optional - Input buffers. - - out_buffers: Buffer or list of Buffers, optional - Output buffers. - - - tag: str, optional - Additonal tag information about the compute. - - attrs: dict, optional - The additional auxiliary attributes about the compute. - - Returns - ------- - tensor: Tensor or list of Tensors - The created tensor or tuple of tensors it it contains multiple outputs. - - Example - ------- - In the code below, C is generated by calling external PackedFunc - `tvm.contrib.cblas.matmul` - - .. code-block:: python - - A = tvm.placeholder((n, l), name='A') - B = tvm.placeholder((l, m), name='B') - C = tvm.extern((n, m), [A, B], - lambda ins, outs: tvm.call_packed( - "tvm.contrib.cblas.matmul", - ins[0], ins[1], outs[0], 0, 0), name="C") - """ - if _tag.TagScope.get_current() is not None: - if tag != "": - raise ValueError("nested tag is not allowed for now") - tag = _tag.TagScope.get_current().tag - shape = (shape,) if isinstance(shape, (_expr.PrimExpr, _Integral)) else shape - if shape == () or isinstance(shape[0], (_expr.PrimExpr, _Integral)): - shape = [shape] - if in_buffers is not None: - in_buffers = [in_buffers] if not isinstance(in_buffers, list) else in_buffers - if len(inputs) != len(in_buffers): - raise RuntimeError("Number of inputs and in_buffers mismatch: %d vs %d." 
- % (len(inputs), len(in_buffers))) - if out_buffers is not None: - out_buffers = [out_buffers] if not isinstance(out_buffers, list) else out_buffers - if len(shape) != len(out_buffers): - raise RuntimeError("Number of outputs and out_buffers mismatch: %d vs %d." - % (len(shape), len(out_buffers))) - input_placeholders = in_buffers or [] - output_placeholders = out_buffers or [] - types = set() - for t in inputs: - if not isinstance(t, _tensor.Tensor): - raise ValueError("expect inputs to be tensor") - if in_buffers is None: - input_placeholders.append( - decl_buffer(t.shape, t.dtype, t.op.name)) - types.add(t.dtype) - - if dtype is None: - if len(types) != 1: - raise ValueError("Cannot infer output type, please provide dtype argument") - infered_type = types.pop() - dtype = [infered_type for _ in shape] - if isinstance(dtype, str): - dtype = [dtype] - - if out_buffers is None: - for shp, dt in zip(shape, dtype): - output_placeholders.append(decl_buffer(shp, dt, name)) - body = fcompute(input_placeholders, output_placeholders) - if isinstance(body, _expr.PrimExpr): - body = _stmt.Evaluate(body) - - op = _api_internal._ExternOp(name, tag, attrs, - inputs, input_placeholders, - output_placeholders, body) - res = [op.output(i) for i in range(len(output_placeholders))] - return res[0] if len(res) == 1 else res - - -def _IterVar(dom, name, iter_type, thread_tag=''): - """Internal function to create IterVar - - Parameters - ---------- - dom : Range - The domain of iteration. - - name : str - The name of iteration variable. - - iter_type : int - The type of iteration. - - thread_tag : str - The thread tag of the iteration variable. - - Returns - ------- - iter_var : IterVar - The result itervar - """ - if dom is not None: - if isinstance(dom, (list, tuple)): - if len(dom) != 2: - raise TypeError("need to be list of ranges") - dom = Range(dom[0], dom[1]) - - if not isinstance(dom, tvm.ir.Range): - raise TypeError("dom need to be Range") - name = name if name else 'iter' - v = var(name) - return _api_internal._IterVar(dom, v, iter_type, thread_tag) - - -def thread_axis(dom=None, tag='', name=''): - """Create a new IterVar to represent thread index. - - Parameters - ---------- - dom : Range or str - The domain of iteration - When str is passed, dom is set to None and str is used as tag - - tag : str, optional - The thread tag - - name : str, optional - The name of the var. - - Returns - ------- - axis : IterVar - The thread itervar. - """ - if isinstance(dom, string_types): - tag, dom = dom, None - if not tag: - raise ValueError("tag must be given as Positional or keyword argument") - name = name if name else tag - return _IterVar(dom, name, 1, tag) - - -def reduce_axis(dom, name="rv"): - """Create a new IterVar for reduction. - - Parameters - ---------- - dom : Range - The domain of iteration. - - name : str - The name of the variable. - - Returns - ------- - axis : IterVar - An iteration variable representing the value. - """ - return _IterVar(dom, name, 2) - - -def comm_reducer(fcombine, fidentity, name="reduce"): - """Create a commutative reducer for reduction. - - Parameters - ---------- - fcombine : function(Expr -> Expr -> Expr) - A binary function which takes two Expr as input to return a Expr. - - fidentity : function(str -> Expr) - A function which takes a type string as input to return a const Expr. - - Returns - ------- - reducer : function - A function which creates a reduce expression over axis. - There are two ways to use it: - - 1. 
accept (expr, axis, where) to produce an Reduce Expr on - specified axis; - 2. simply use it with multiple Exprs. - - Example - ------- - .. code-block:: python - - n = tvm.var('n') - m = tvm.var('m') - mysum = tvm.comm_reducer(lambda x, y: x+y, - lambda t: tvm.const(0, dtype=t), name="mysum") - A = tvm.placeholder((n, m), name='A') - k = tvm.reduce_axis((0, m), name='k') - B = tvm.compute((n,), lambda i: mysum(A[i, k], axis=k), name='B') - """ - def _reduce_directly(*args): - num = len(args) - # process `where` is None - if num == 3 and args[2] is None: - num = 2 - res = args[0] - for i in range(num-1): - res = fcombine(res, args[i+1]) - return res - - def _make_reduce(expr, axis, where=None): - code = fcombine.__code__ - assert fcombine.__code__.co_argcount == 2 - expr = convert(expr) - if isinstance(expr, _container.Array): - size = len(expr) - larr = [] - rarr = [] - dtypes = [] - for i in range(size): - dtype = expr[i].dtype - dtypes.append(dtype) - lname = code.co_varnames[0] + '_' + str(i) - larr.append(var(lname, dtype)) - rname = code.co_varnames[1] + '_' + str(i) - rarr.append(var(rname, dtype)) - lhs = convert(larr) - rhs = convert(rarr) - result = fcombine(lhs, rhs) - id_elem = fidentity(*dtypes) - else: - assert isinstance(expr, _expr.PrimExpr) - size = 1 - dtype = expr.dtype - lvar = var(code.co_varnames[0], dtype) - rvar = var(code.co_varnames[1], dtype) - result = [fcombine(lvar, rvar)] - id_elem = [fidentity(dtype)] - lhs = convert([lvar]) - rhs = convert([rvar]) - expr = convert([expr]) - result = convert(result) - id_elem = convert(id_elem) - combiner = _expr.CommReducer(lhs, rhs, result, id_elem) - axis = convert(axis if isinstance(axis, (list, tuple)) else [axis]) - if where is None: - where = convert(True) - outputs = tuple(_expr.Reduce(combiner, expr, axis, where, i) - for i in range(size)) - return outputs[0] if size == 1 else outputs - - # pylint: disable=keyword-arg-before-vararg - def reducer(expr, axis, where=None, *args): - if isinstance(axis, (_schedule.IterVar, list, tuple)): - assert not args - return _make_reduce(expr, axis, where) - if where is None: - assert not args - return _reduce_directly(expr, axis) - return _reduce_directly(expr, axis, where, *args) - - doc_str = """Create a {0} expression over axis. - - Parameters - ---------- - expr : Expr - The source expression. - axis : IterVar - The reduction IterVar axis - where : optional, Expr - Filtering predicate of the reduction. - Returns - ------- - value : Expr - The result value. - - Example - ------- - .. code-block:: python - - m = tvm.var("m") - n = tvm.var("n") - A = tvm.placeholder((m, n), name="A") - k = tvm.reduce_axis((0, n), name="k") - - # there are two way to use this {0} reducer: - # mode 1, accept (expr, axis, where) to produce an Reduce Expr - B = tvm.compute((m,), lambda i: tvm.{0}(A[i, k], axis=k), name="B") - - # mode 2, simply use it with multiple Exprs: - {0}_res = tvm.{0}(m, n) - """ - reducer.__doc__ = doc_str.format(name) - return reducer - -# pylint: disable=unnecessary-lambda -sum = comm_reducer(lambda x, y: x+y, lambda t: const(0, dtype=t), name="sum") -min = comm_reducer(lambda x, y: tvm.tir._ffi_api._OpMin(x, y), max_value, name='min') -max = comm_reducer(lambda x, y: tvm.tir._ffi_api._OpMax(x, y), min_value, name='max') - -tvm._ffi._init_api("tvm.api") diff --git a/python/tvm/arith.py b/python/tvm/arith.py index 35cd6a38f4b4..b67e99c204ba 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -18,17 +18,16 @@ import tvm._ffi from tvm.runtime import Object -from . 
import _api_internal class IntSet(Object): """Represent a set of integer in one dimension.""" def is_nothing(self): """Whether the set represent nothing""" - return _api_internal._IntSetIsNothing(self) + return _IntSetIsNothing(self) def is_everything(self): """Whether the set represent everything""" - return _api_internal._IntSetIsEverything(self) + return _IntSetIsEverything(self) @tvm._ffi.register_object("arith.IntervalSet") diff --git a/python/tvm/autotvm/feature.py b/python/tvm/autotvm/feature.py index 56c52a52f218..b7d1c44117a7 100644 --- a/python/tvm/autotvm/feature.py +++ b/python/tvm/autotvm/feature.py @@ -29,7 +29,8 @@ import struct import numpy as np -from tvm import schedule, ir_pass, build_module, get_global_func, target as _target +from tvm import schedule, ir_pass, get_global_func, target as _target +from tvm.driver import build_module def ana_lower(sch, args, binds=None, diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 3b788fddcc90..3d3a1d3d3a4e 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -26,8 +26,9 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ +import tvm.te._ffi_api -from ... import _api_internal, tensor, placeholder +from ... import tensor, placeholder from .task import args_to_workload, dispatcher, register from ..util import get_const_tuple @@ -420,10 +421,10 @@ def template_call(cfg, *args, **kwargs): attrs[k] = v attrs['workload'] = args_to_workload(args, topi_compute) if isinstance(op, tensor.ComputeOp): - op = _api_internal._ComputeOp( + op = tvm.te._ffi_api.ComputeOp( op.name, op.tag, attrs, op.axis, op.body) elif isinstance(op, tensor.ExternOp): - op = _api_internal._ExternOp( + op = tvm.te._ffi_api.ExternOp( op.name, op.tag, attrs, op.inputs, op.input_placeholders, op.output_placeholders, op.body) diff --git a/python/tvm/driver/__init__.py b/python/tvm/driver/__init__.py new file mode 100644 index 000000000000..75e94cc91c83 --- /dev/null +++ b/python/tvm/driver/__init__.py @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Namespace for driver APIs""" +from .build_module import lower, build diff --git a/python/tvm/build_module.py b/python/tvm/driver/build_module.py similarity index 61% rename from python/tvm/build_module.py rename to python/tvm/driver/build_module.py index de78a3ee2700..f529ee26b58f 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/driver/build_module.py @@ -20,253 +20,18 @@ LoweredFunc and compiled Module. 
""" import warnings -import tvm._ffi -import tvm.runtime -from tvm.runtime import Object, ndarray +import tvm.tir + +from tvm.runtime import ndarray from tvm.ir import container -from tvm.target import codegen -from tvm.tir import expr +from tvm.target import codegen, BuildConfig from tvm.tir import ir_pass -from tvm.tir import Stmt from tvm.tir.stmt import LoweredFunc +from tvm.te import tensor +from tvm.te import schedule +from tvm import target as _target -from . import target as _target - -from . import api -from . import _api_internal -from . import tensor -from . import schedule -from . import make - - - -class DumpIR(object): - """ - Dump IR for each pass. - With it, you can dump ir just like gcc/llvm. - - How to use: - ----------- - .. code-block:: python - - with tvm.build_config(dump_pass_ir=True) - run() - """ - scope_level = 0 - def __init__(self): - self._pass_id = 0 - self._recover_list = [] - - def decorate(self, func): - """ decorate the pass function""" - def dump(*args, **kwargs): - """dump function""" - retv = func(*args, **kwargs) - if not isinstance(retv, (Stmt, LoweredFunc, container.Array)): - return retv - fname = func.func_name if hasattr(func, 'func_name') else func.__name__ - pname = str(self._pass_id) + "_" + fname + "_ir.cc" - with open(pname, "a") as f: - out = retv.body if isinstance(retv, LoweredFunc) else retv - f.write(str(out)) - if isinstance(retv, container.Array): - for x in retv: - out = x.body if isinstance(x, LoweredFunc) else x - f.write("---------%s\n%s\n-----------\n"%(x.name, str(out))) - self._pass_id += 1 - return retv - return dump - - def decorate_irpass(self): - """decorate ir_pass and ScheduleOps""" - self._old_sgpass = schedule.ScheduleOps - schedule.ScheduleOps = self.decorate(schedule.ScheduleOps) - vset = vars(ir_pass) - k = v = 0 - def recover(): - vset[k] = v - for k, v in vset.items(): - self._recover_list.append(recover) - vset[k] = self.decorate(v) if isinstance(v, Function) else v - - def decorate_custompass(self, custom_pass): - """decorate given list of custom passes, and return decorated passes""" - custom_pass = custom_pass if custom_pass else [] - pass_list = [] - for idx, x in enumerate(custom_pass): - x[1].__name__ = "custom{}_phase{}".format(idx, x[0]) - pass_list += [(x[0], self.decorate(x[1]))] - return pass_list - - def enter(self): - """only decorate outermost nest""" - if DumpIR.scope_level > 0: - return - self.decorate_irpass() - self._pass_id = 0 - DumpIR.scope_level += 1 - - def exit(self): - """recover outermost nest""" - if DumpIR.scope_level > 1: - return - # recover decorated functions - for f in self._recover_list: - f() - schedule.ScheduleOps = self._old_sgpass - DumpIR.scope_level -= 1 - - -@tvm._ffi.register_object -class BuildConfig(Object): - """Configuration scope to set a build config option. - - Note - ---- - This object is backed by object protocol in C++, with arguments that can be - exchanged between python and C++. - - Do not construct directly, use build_config instead. - - The fields that are backed by the C++ object are immutable once an instance - is constructed. See _object_defaults for the fields. 
- """ - - _object_defaults = { - "auto_unroll_max_step": 0, - "auto_unroll_max_depth": 8, - "auto_unroll_max_extent": 0, - "unroll_explicit": True, - "detect_global_barrier": False, - "partition_const_loop": False, - "offset_factor": 0, - "data_alignment": -1, - "restricted_func": True, - "double_buffer_split_loop": 1, - "dump_pass_ir": False, - "instrument_bound_checkers": False, - "disable_select_rewriting": False, - "disable_vectorize": False, - "disable_assert": False - } - _dump_ir = DumpIR() - - # pylint: disable=no-member - def __init__(self, handle): - """Initialize the function with handle - - Parameters - ---------- - handle : SymbolHandle - the handle to the underlying C++ Symbol - """ - super(BuildConfig, self).__init__(handle) - self.handle = handle - - @property - def add_lower_pass(self): - size = _api_internal._BuildConfigGetAddLowerPassInfo(self) - result = [] - for i in range(size): - phase = _api_internal._BuildConfigGetAddLowerPassInfo(self, i, True) - func = _api_internal._BuildConfigGetAddLowerPassInfo(self, i, False) - result += [(phase, func)] - return result - - @add_lower_pass.setter - def add_lower_pass(self, value): - add_lower_pass_args = [] - for x in value: - add_lower_pass_args += [x[0], x[1]] - _api_internal._BuildConfigSetAddLowerPass(self, *add_lower_pass_args) - - def __enter__(self): - # pylint: disable=protected-access - _api_internal._EnterBuildConfigScope(self) - if self.dump_pass_ir: - BuildConfig._dump_ir.enter() - return self - - def __exit__(self, ptype, value, trace): - if self.dump_pass_ir: - BuildConfig._dump_ir.exit() - _api_internal._ExitBuildConfigScope(self) - - def __setattr__(self, name, value): - if name in BuildConfig._object_defaults: - raise AttributeError( - "'%s' object cannot set attribute '%s'" % (str(type(self)), name)) - return super(BuildConfig, self).__setattr__(name, value) - - -def current_build_config(): - """Get the current build configuration.""" - return _api_internal._GetCurrentBuildConfig() - - -def build_config(**kwargs): - """Configure the build behavior by setting config variables. - - Parameters - ---------- - auto_unroll_max_step: int, default=0 - Threshold of number of steps in the loop to be automatically unrolled. - This takes inner loop count into consideration. - - auto_unroll_max_depth: int, default=8 - The maximum nested level of loops that can be automatically unrolled. - - unroll_explicit: bool, default=True - Whether explicitly unroll the loop, if set false, the unroll hint will - be passed to the CodeGen phase, which may generate pragma unroll hint. - Set this to be true if CodeGen support unroll pragma and - when we want to be more readable. - - detect_global_barrier: bool, default=True - Whether detect global barrier. - - partition_const_loop: bool, default=False - Whether partition const loop - - data_alignment: int, optional - The alignment of data pointer in bytes. - If -1 is passed, the alignment will be set to TVM's internal default. - - offset_factor: int, default=0 - The factor used in default buffer declaration. - If specified as 0, offset field is not used. - - restricted_func: bool, default=True - Whether build restricted function. - That is each buffer argument to the function are guaranteed - not to overlap. This enables more optimization. - Corresponds to restricted keyword in C99 - - double_buffer_split_loop: int, default=2 - Whether split the loop with factor. If it is zero, no splitting will happen. 
- It it is bigger than one, the logic will do a split with factor equals the integer - and unroll the inner loop. This allows the buffer fetching won't contain condition. - - add_lower_pass: list of tuple (phase, function(Stmt->Stmt)), default=None - phase contains an integer on which optimization pass we apply the pass. - Additional lowering passes to be applied before make_api. - - dump_pass_ir: dump ir of each pass into file idx_passname_ir.cc, default=False - - Returns - ------- - config: BuildConfig - The build configuration - """ - node_args = {k: v if k not in kwargs else kwargs[k] - for k, v in BuildConfig._object_defaults.items()} - config = make.node("BuildConfig", **node_args) - - if "add_lower_pass" in kwargs: - config.add_lower_pass = kwargs["add_lower_pass"] - - return config def get_binds(args, compact=False, binds=None): """Internal function to get binds and arg_list given arguments. @@ -293,26 +58,27 @@ def get_binds(args, compact=False, binds=None): The list of symbolic buffers of arguments. """ binds = {} if binds is None else binds.copy() - cfg = current_build_config() + cfg = BuildConfig.current() arg_list = [] for x in args: if isinstance(x, tensor.Tensor): - any_dim = any(isinstance(i, expr.Var) for i in x.shape) + any_dim = any(isinstance(i, tvm.tir.Var) for i in x.shape) buffer_type = "auto_broadcast" if any_dim and not compact else "" if x not in binds: - buf = api.decl_buffer(x.shape, - dtype=x.dtype, - name=x.name, - data_alignment=cfg.data_alignment, - offset_factor=cfg.offset_factor, - buffer_type=buffer_type) + buf = tvm.tir.decl_buffer( + x.shape, + dtype=x.dtype, + name=x.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor, + buffer_type=buffer_type) binds[x] = buf arg_list.append(buf) else: arg_list.append(binds[x]) elif isinstance(x, schedule.Buffer): arg_list.append(x) - elif isinstance(x, expr.Var): + elif isinstance(x, tvm.tir.Var): arg_list.append(x) else: raise ValueError("args must be Tensor, Buffer or Var") @@ -371,7 +137,7 @@ def lower(sch, The result function, if with_api_wrapper=False Then the Stmt before make api is returned. """ - cfg = current_build_config() + cfg = BuildConfig.current() add_lower_pass = cfg.add_lower_pass if cfg.add_lower_pass else [] if cfg.dump_pass_ir: add_lower_pass = BuildConfig._dump_ir.decorate_custompass(add_lower_pass) @@ -465,7 +231,7 @@ def _build_for_device(flist, target, target_host): "Direct host side access to device memory is detected in %s. " "Did you forget to bind?" % func.name) if func.func_type == LoweredFunc.MixedFunc: - if current_build_config().detect_global_barrier: + if BuildConfig.current().detect_global_barrier: func = ir_pass.ThreadSync(func, "global") func = ir_pass.ThreadSync(func, "shared") func = ir_pass.ThreadSync(func, "warp") diff --git a/python/tvm/error.py b/python/tvm/error.py index a6d4f701d2a6..083c7e825255 100644 --- a/python/tvm/error.py +++ b/python/tvm/error.py @@ -21,7 +21,7 @@ To make the code more readable, we recommended developers to copy the examples and raise errors with the same message convention. """ -from ._ffi.base import register_error, TVMError +from tvm._ffi.base import register_error, TVMError @register_error class InternalError(TVMError): diff --git a/python/tvm/hybrid/__init__.py b/python/tvm/hybrid/__init__.py index 988e5a67d7d2..68294965da6f 100644 --- a/python/tvm/hybrid/__init__.py +++ b/python/tvm/hybrid/__init__.py @@ -30,9 +30,9 @@ # 2. 
Support multi-level HalideIR import inspect import tvm._ffi +from tvm.driver.build_module import form_body from .._ffi.base import decorate -from ..build_module import form_body from .module import HybridModule from .parser import source_to_op diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index 6be0006a851f..cf8584a1e999 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -26,19 +26,20 @@ from enum import Enum from tvm.ir import Array, Range import tvm.tir +import tvm.te._ffi_api + from tvm.tir import expr as _expr from tvm.tir import stmt as _stmt from tvm.tir import ir_pass as _ir_pass +from tvm.te.tensor import Tensor, Operation +from tvm.tir import all as _all +from tvm.tir import any as _any from .util import _internal_assert from . import calls from . import util from .preprocessor import determine_variable_usage -from ..api import all as _all -from ..api import any as _any -from ..tensor import Tensor, Operation -from .. import _api_internal as _tvm_internal from .. import api as _api @@ -653,7 +654,7 @@ def get_input_tensors(arg): for i in args: get_input_tensors(i) - op = _tvm_internal._HybridOp(parser.func_name, "HybridOp", None, input_tensors, - parser.outputs, parser.parsed_body) + op = tvm.te._ffi_api.HybridOp(parser.func_name, "HybridOp", None, input_tensors, + parser.outputs, parser.parsed_body) res = [op.output(i) for i in range(len(parser.outputs))] return res[0] if len(res) == 1 else res diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py index 2f449dc8f69e..2b6795652878 100644 --- a/python/tvm/hybrid/util.py +++ b/python/tvm/hybrid/util.py @@ -27,9 +27,9 @@ from tvm.tir import expr as _expr from tvm.tir import stmt as _stmt +from tvm.te.tensor import Tensor from .. import api as _api -from ..tensor import Tensor #pylint: disable=invalid-name diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py index d29e73a9b10e..feced8da0538 100644 --- a/python/tvm/ir/expr.py +++ b/python/tvm/ir/expr.py @@ -17,10 +17,10 @@ """Common expressions data structures in the IR.""" import tvm._ffi - from .base import Node from . import _ffi_api + class BaseExpr(Node): """Base class of all the expressions.""" @@ -98,7 +98,29 @@ class Range(Node): You do not need to create a Range explicitly. Python lists and tuples will be converted automatically to a Range in API functions. + + Parameters + ---------- + begin : PrimExpr + The begin value of the range when end is None. + Otherwise it is the length of the range. + + end : Optional[PrimExpr] + The end value of the range. + + Note + ---- + The constructor creates the range `[begin, end)` + if the end argument is not None. Otherwise, it creates `[0, begin)`. """ + def __init__(self, begin, end=None): + if end is None: + self.__init_handle_by_constructor__( + _ffi_api.Range, 0, begin) + else: + self.__init_handle_by_constructor__( + _ffi_api.Range, begin, end) + @staticmethod def make_by_min_extent(min_value, extent): """Construct a Range by min and extent. diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index c2f1df915509..9169ef49210d 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -16,10 +16,9 @@ # under the License. """The interface of expr function exposed from C++.""" import tvm._ffi +import tvm.driver from tvm.ir import container as _container -from ... 
import build_module as _build - @tvm._ffi.register_func("relay.backend.lower") def lower(sch, inputs, func_name, source_func): @@ -48,7 +47,7 @@ def lower(sch, inputs, func_name, source_func): import traceback try: - f = _build.lower(sch, inputs, name=func_name) + f = tvm.driver.lower(sch, inputs, name=func_name) # logging.debug("lower function %s", func_name) # logging.debug("%s", _build.lower(sch, inputs, simple_mode=True)) except Exception: @@ -85,7 +84,7 @@ def build(funcs, target, target_host=None): """ if target_host == "": target_host = None - return _build.build(funcs, target=target, target_host=target_host) + return tvm.driver.build(funcs, target=target, target_host=target_host) @tvm._ffi.register_func("relay._tensor_value_repr") diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index c74201ef9c1f..c6d301213e98 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -18,11 +18,11 @@ """The base node types for the Relay language.""" import topi import tvm._ffi +from tvm.driver import lower, build from ..base import register_relay_node from ..expr import RelayExpr from ...api import register_func -from ...build_module import lower, build from . import _make @register_relay_node diff --git a/python/tvm/relay/quantize/_calibrate.py b/python/tvm/relay/quantize/_calibrate.py index 8f83bfbf0659..9794698a0447 100644 --- a/python/tvm/relay/quantize/_calibrate.py +++ b/python/tvm/relay/quantize/_calibrate.py @@ -20,6 +20,7 @@ import multiprocessing as mp import numpy as np import tvm +import tvm.driver from tvm.ir import IRModule from . import _quantize diff --git a/python/tvm/target/__init__.py b/python/tvm/target/__init__.py index 287649670fb0..3975f30e644a 100644 --- a/python/tvm/target/__init__.py +++ b/python/tvm/target/__init__.py @@ -61,3 +61,4 @@ from . import datatype from . import codegen from .intrin import register_intrin_rule +from .build_config import BuildConfig, build_config diff --git a/python/tvm/target/build_config.py b/python/tvm/target/build_config.py new file mode 100644 index 000000000000..8782d24d2da9 --- /dev/null +++ b/python/tvm/target/build_config.py @@ -0,0 +1,254 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Target dependent BuildConfig for low-level passes.""" +# TODO(tvm-team) consolidate with PassContext +import tvm._ffi +import tvm.ir + +from tvm.runtime import Object +from tvm.ir import container +from tvm.tir import Stmt +from tvm.tir.stmt import LoweredFunc +from . import _ffi_api + + +class DumpIR(object): + """ + Dump IR for each pass. + With it, you can dump ir just like gcc/llvm. + + How to use: + ----------- + .. 
code-block:: python + + with tvm.build_config(dump_pass_ir=True) + run() + """ + scope_level = 0 + def __init__(self): + self._pass_id = 0 + self._recover_list = [] + + def decorate(self, func): + """ decorate the pass function""" + def dump(*args, **kwargs): + """dump function""" + retv = func(*args, **kwargs) + if not isinstance(retv, (Stmt, LoweredFunc, container.Array)): + return retv + fname = func.func_name if hasattr(func, 'func_name') else func.__name__ + pname = str(self._pass_id) + "_" + fname + "_ir.cc" + with open(pname, "a") as f: + out = retv.body if isinstance(retv, LoweredFunc) else retv + f.write(str(out)) + if isinstance(retv, container.Array): + for x in retv: + out = x.body if isinstance(x, LoweredFunc) else x + f.write("---------%s\n%s\n-----------\n"%(x.name, str(out))) + self._pass_id += 1 + return retv + return dump + + def decorate_irpass(self): + """decorate ir_pass and ScheduleOps""" + self._old_sgpass = schedule.ScheduleOps + schedule.ScheduleOps = self.decorate(schedule.ScheduleOps) + vset = vars(ir_pass) + k = v = 0 + def recover(): + vset[k] = v + for k, v in vset.items(): + self._recover_list.append(recover) + vset[k] = self.decorate(v) if isinstance(v, Function) else v + + def decorate_custompass(self, custom_pass): + """decorate given list of custom passes, and return decorated passes""" + custom_pass = custom_pass if custom_pass else [] + pass_list = [] + for idx, x in enumerate(custom_pass): + x[1].__name__ = "custom{}_phase{}".format(idx, x[0]) + pass_list += [(x[0], self.decorate(x[1]))] + return pass_list + + def enter(self): + """only decorate outermost nest""" + if DumpIR.scope_level > 0: + return + self.decorate_irpass() + self._pass_id = 0 + DumpIR.scope_level += 1 + + def exit(self): + """recover outermost nest""" + if DumpIR.scope_level > 1: + return + # recover decorated functions + for f in self._recover_list: + f() + schedule.ScheduleOps = self._old_sgpass + DumpIR.scope_level -= 1 + + +@tvm._ffi.register_object +class BuildConfig(Object): + """Configuration scope to set a build config option. + + Note + ---- + This object is backed by object protocol in C++, with arguments that can be + exchanged between python and C++. + + Do not construct directly, use build_config instead. + + The fields that are backed by the C++ object are immutable once an instance + is constructed. See _object_defaults for the fields. 
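The `DumpIR` helper above is driven entirely by the `dump_pass_ir` flag: entering the config scope decorates every pass so that each one appends its output IR to a numbered file. A small sketch of triggering it, assuming the `tvm.te` namespace introduced later in this patch and that `tvm.lower` remains re-exported at the top level:

.. code-block:: python

    import tvm
    from tvm import te
    from tvm.target import build_config

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)

    # Entering the scope calls BuildConfig._dump_ir.enter(); each pass
    # then appends its result to a file named <idx>_<passname>_ir.cc.
    with build_config(dump_pass_ir=True):
        tvm.lower(s, [A, B], simple_mode=True)
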
+ """ + + _object_defaults = { + "auto_unroll_max_step": 0, + "auto_unroll_max_depth": 8, + "auto_unroll_max_extent": 0, + "unroll_explicit": True, + "detect_global_barrier": False, + "partition_const_loop": False, + "offset_factor": 0, + "data_alignment": -1, + "restricted_func": True, + "double_buffer_split_loop": 1, + "dump_pass_ir": False, + "instrument_bound_checkers": False, + "disable_select_rewriting": False, + "disable_vectorize": False, + "disable_assert": False + } + _dump_ir = DumpIR() + + # pylint: disable=no-member + def __init__(self, handle): + """Initialize the function with handle + + Parameters + ---------- + handle : SymbolHandle + the handle to the underlying C++ Symbol + """ + super(BuildConfig, self).__init__(handle) + self.handle = handle + + @property + def add_lower_pass(self): + size = _ffi_api.BuildConfigGetAddLowerPassInfo(self) + result = [] + for i in range(size): + phase = _ffi_api.BuildConfigGetAddLowerPassInfo(self, i, True) + func = _ffi_api.BuildConfigGetAddLowerPassInfo(self, i, False) + result += [(phase, func)] + return result + + @add_lower_pass.setter + def add_lower_pass(self, value): + add_lower_pass_args = [] + for x in value: + add_lower_pass_args += [x[0], x[1]] + _ffi_api.BuildConfigSetAddLowerPass(self, *add_lower_pass_args) + + def __enter__(self): + # pylint: disable=protected-access + _ffi_api.EnterBuildConfigScope(self) + if self.dump_pass_ir: + BuildConfig._dump_ir.enter() + return self + + def __exit__(self, ptype, value, trace): + if self.dump_pass_ir: + BuildConfig._dump_ir.exit() + _ffi_api.ExitBuildConfigScope(self) + + def __setattr__(self, name, value): + if name in BuildConfig._object_defaults: + raise AttributeError( + "'%s' object cannot set attribute '%s'" % (str(type(self)), name)) + return super(BuildConfig, self).__setattr__(name, value) + + @staticmethod + def current(): + """Get the current build configuration.""" + return _ffi_api.GetCurrentBuildConfig() + + +def build_config(**kwargs): + """Configure the build behavior by setting config variables. + + Parameters + ---------- + auto_unroll_max_step: int, default=0 + Threshold of number of steps in the loop to be automatically unrolled. + This takes inner loop count into consideration. + + auto_unroll_max_depth: int, default=8 + The maximum nested level of loops that can be automatically unrolled. + + unroll_explicit: bool, default=True + Whether explicitly unroll the loop, if set false, the unroll hint will + be passed to the CodeGen phase, which may generate pragma unroll hint. + Set this to be true if CodeGen support unroll pragma and + when we want to be more readable. + + detect_global_barrier: bool, default=True + Whether detect global barrier. + + partition_const_loop: bool, default=False + Whether partition const loop + + data_alignment: int, optional + The alignment of data pointer in bytes. + If -1 is passed, the alignment will be set to TVM's internal default. + + offset_factor: int, default=0 + The factor used in default buffer declaration. + If specified as 0, offset field is not used. + + restricted_func: bool, default=True + Whether build restricted function. + That is each buffer argument to the function are guaranteed + not to overlap. This enables more optimization. + Corresponds to restricted keyword in C99 + + double_buffer_split_loop: int, default=2 + Whether split the loop with factor. If it is zero, no splitting will happen. + It it is bigger than one, the logic will do a split with factor equals the integer + and unroll the inner loop. 
This allows the buffer fetching won't contain condition. + + add_lower_pass: list of tuple (phase, function(Stmt->Stmt)), default=None + phase contains an integer on which optimization pass we apply the pass. + Additional lowering passes to be applied before make_api. + + dump_pass_ir: dump ir of each pass into file idx_passname_ir.cc, default=False + + Returns + ------- + config: BuildConfig + The build configuration + """ + node_args = {k: v if k not in kwargs else kwargs[k] + for k, v in BuildConfig._object_defaults.items()} + config = tvm.ir.make_node("BuildConfig", **node_args) + + if "add_lower_pass" in kwargs: + config.add_lower_pass = kwargs["add_lower_pass"] + + return config diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py new file mode 100644 index 000000000000..0564fff0e5cb --- /dev/null +++ b/python/tvm/te/__init__.py @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=unused-import, redefined-builtin, wildcard-import +"""Namespace for Tensor-level IR""" +# expose all operators in tvm tir.op +from tvm.tir.op import * + +from .schedule import Schedule, create_schedule +from .tensor import TensorSlice, Tensor +from .tensor_intrin import decl_tensor_intrin +from .tag import tag_scope +from .operation import placeholder, compute, scan, extern, var, size_var +from .operation import thread_axis, reduce_axis diff --git a/python/tvm/te/_ffi_api.py b/python/tvm/te/_ffi_api.py new file mode 100644 index 000000000000..ac814c844724 --- /dev/null +++ b/python/tvm/te/_ffi_api.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for tvm.te""" +import tvm._ffi + + +tvm._ffi._init_api("te", __name__) diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py new file mode 100644 index 000000000000..3c5b610e99be --- /dev/null +++ b/python/tvm/te/operation.py @@ -0,0 +1,406 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Operation class for computation declaration.""" +# pylint: disable=invalid-name +from numbers import Integral as _Integral + +import tvm._ffi +import tvm.tir +import tvm.tir._ffi_api + +from tvm._ffi.base import string_types +from tvm.runtime import convert + +from . import tag as _tag +from . import tensor as _tensor +from . import _ffi_api + + +def placeholder(shape, dtype=None, name="placeholder"): + """Construct an empty tensor object. + + Parameters + ---------- + shape: Tuple of Expr + The shape of the tensor + + dtype: str, optional + The data type of the tensor + + name: str, optional + The name hint of the tensor + + Returns + ------- + tensor: Tensor + The created tensor + """ + shape = (shape,) if isinstance(shape, tvm.tir.PrimExpr) else shape + dtype = "float32" if dtype is None else dtype + return _ffi_api.Placeholder( + shape, dtype, name) + + +def compute(shape, fcompute, name="compute", tag="", attrs=None): + """Construct a new tensor by computing over the shape domain. + + The compute rule is result[axis] = fcompute(axis) + + Parameters + ---------- + shape: Tuple of Expr + The shape of the tensor + + fcompute: lambda function of indices-> value + Specifies the input source expression + + name: str, optional + The name hint of the tensor + + tag: str, optional + Additional tag information about the compute. + + attrs: dict, optional + The additional auxiliary attributes about the compute. 
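The compute rule `result[axis] = fcompute(axis)` described above maps one lambda argument to each output dimension; the arity is read from `fcompute.__code__.co_argcount`, as the body below this docstring shows. A minimal sketch using the `te` namespace added by this patch:

.. code-block:: python

    from tvm import te

    n = te.var("n")
    A = te.placeholder((n, n), name="A")
    # fcompute receives one index variable per output axis, here (i, j).
    B = te.compute((n, n), lambda i, j: A[i, j] * 2.0, name="B")
    assert len(B.op.axis) == 2
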
+ + Returns + ------- + tensor: Tensor + The created tensor + """ + if _tag.TagScope.get_current() is not None: + if tag != "": + raise ValueError("nested tag is not allowed for now") + tag = _tag.TagScope.get_current().tag + shape = (shape,) if isinstance(shape, tvm.tir.PrimExpr) else shape + # for python3 + shape = tuple([int(s) if isinstance(s, float) else s for s in shape]) + ndim = len(shape) + code = fcompute.__code__ + + out_ndim = ndim + if code.co_argcount == 0: + arg_names = ["i%d" % i for i in range(ndim)] + else: + arg_names = code.co_varnames[:code.co_argcount] + out_ndim = code.co_argcount + + if out_ndim != len(arg_names): + raise ValueError("fcompute do not match dimension, ndim=%d" % ndim) + + dim_var = [tvm.tir.IterVar((0, s), x, 0) for x, s in zip(arg_names, shape[:out_ndim])] + body = fcompute(*[v.var for v in dim_var]) + + if isinstance(body, _tensor.TensorIntrinCall): + for i, s in enumerate(shape[out_ndim:]): + var_name = "ax" + str(i) + dim_var.append(tvm.tir.IterVar((0, s), var_name, 4)) + op_node = _ffi_api.TensorComputeOp(name, + tag, + dim_var, + body.reduce_axis, + out_ndim, + body.intrin, + body.tensors, + body.regions, + body.scalar_inputs) + else: + if not isinstance(body, (list, tuple)): + body = [body] + body = convert(body) + op_node = _ffi_api.ComputeOp( + name, tag, attrs, dim_var, body) + + num = op_node.num_outputs + outputs = tuple(op_node.output(i) for i in range(num)) + return outputs[0] if num == 1 else outputs + + +def scan(init, update, state_placeholder, inputs=None, name="scan", tag="", attrs=None): + """Construct new tensors by scanning over axis. + + Parameters + ---------- + init: Tensor or list of Tensor + The initial condition of first init.shape[0] timestamps + + update: Tensor or list of Tensor + The update rule of the scan given by symbolic tensor. + + state_placeholder: Tensor or list of Tensor + The placeholder variables used by update. + + inputs: Tensor or list of Tensor, optional + The list of inputs to the scan. This is not required, but can + be useful for the compiler to detect scan body faster. + + name: str, optional + The name hint of the tensor + + tag: str, optional + Additonal tag information about the compute. + + attrs: dict, optional + The additional auxiliary attributes about the compute. + + Returns + ------- + tensor: Tensor or list of Tensors + The created tensor or tuple of tensors it it contains multiple outputs. + + Example + ------- + .. 
code-block:: python + + # The following code is equivalent to numpy.cumsum + m = tvm.var("m") + n = tvm.var("n") + X = tvm.placeholder((m, n), name="X") + s_state = tvm.placeholder((m, n)) + s_init = tvm.compute((1, n), lambda _, i: X[0, i]) + s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) + res = tvm.scan(s_init, s_update, s_state, X) + """ + if _tag.TagScope.get_current() is not None: + if tag != "": + raise ValueError("nested tag is not allowed for now") + tag = _tag.TagScope.get_current().tag + if isinstance(init, _tensor.Tensor): + init = [init] + if isinstance(update, _tensor.Tensor): + update = [update] + if isinstance(state_placeholder, _tensor.Tensor): + state_placeholder = [state_placeholder] + if isinstance(inputs, _tensor.Tensor): + inputs = [inputs] + if inputs is None: + inputs = [] + if len(init) != len(update) or len(init) != len(state_placeholder): + raise ValueError("init, update, state_placeholder must have same length") + axis = tvm.tir.IterVar((init[0].shape[0], update[0].shape[0]), "%s.idx" % name, 3) + op = _ffi_api.ScanOp(name, tag, attrs, + axis, init, update, + state_placeholder, inputs) + res = [op.output(i) for i in range(len(update))] + return res[0] if len(res) == 1 else res + + +def extern(shape, + inputs, + fcompute, + name="extern", + dtype=None, + in_buffers=None, + out_buffers=None, + tag="", + attrs=None): + """Compute several tensor via extern function. + + Parameters + ---------- + shape: tuple or list of tuples. + The shape of the outputs. + + inputs: list of Tensor + The inputs + + fcompute: lambda function of inputs, outputs-> stmt + Specifies the IR statement to do the computation. + See the following note for function signature of fcompute + + .. note:: + **Parameters** + + - **ins** (list of :any:`Buffer`) - Placeholder for each inputs + - **outs** (list of :any:`Buffer`) - Placeholder for each outputs + + **Returns** + + - **stmt** (:any:`Stmt`) - The statement that carries out array computation. + + name: str, optional + The name hint of the tensor + + dtype: str or list of str, optional + The data types of outputs, + by default dtype will be same as inputs. + + in_buffers: Buffer or list of Buffer, optional + Input buffers. + + out_buffers: Buffer or list of Buffers, optional + Output buffers. + + + tag: str, optional + Additonal tag information about the compute. + + attrs: dict, optional + The additional auxiliary attributes about the compute. + + Returns + ------- + tensor: Tensor or list of Tensors + The created tensor or tuple of tensors it it contains multiple outputs. + + Example + ------- + In the code below, C is generated by calling external PackedFunc + `tvm.contrib.cblas.matmul` + + .. code-block:: python + + A = tvm.placeholder((n, l), name="A") + B = tvm.placeholder((l, m), name="B") + C = tvm.extern((n, m), [A, B], + lambda ins, outs: tvm.call_packed( + "tvm.contrib.cblas.matmul", + ins[0], ins[1], outs[0], 0, 0), name="C") + """ + if _tag.TagScope.get_current() is not None: + if tag != "": + raise ValueError("nested tag is not allowed for now") + tag = _tag.TagScope.get_current().tag + shape = (shape,) if isinstance(shape, (tvm.tir.PrimExpr, _Integral)) else shape + if shape == () or isinstance(shape[0], (tvm.tir.PrimExpr, _Integral)): + shape = [shape] + if in_buffers is not None: + in_buffers = [in_buffers] if not isinstance(in_buffers, list) else in_buffers + if len(inputs) != len(in_buffers): + raise RuntimeError("Number of inputs and in_buffers mismatch: %d vs %d." 
+ % (len(inputs), len(in_buffers))) + if out_buffers is not None: + out_buffers = [out_buffers] if not isinstance(out_buffers, list) else out_buffers + if len(shape) != len(out_buffers): + raise RuntimeError("Number of outputs and out_buffers mismatch: %d vs %d." + % (len(shape), len(out_buffers))) + input_placeholders = in_buffers or [] + output_placeholders = out_buffers or [] + types = set() + for t in inputs: + if not isinstance(t, _tensor.Tensor): + raise ValueError("expect inputs to be tensor") + if in_buffers is None: + input_placeholders.append( + tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name)) + types.add(t.dtype) + + if dtype is None: + if len(types) != 1: + raise ValueError("Cannot infer output type, please provide dtype argument") + infered_type = types.pop() + dtype = [infered_type for _ in shape] + if isinstance(dtype, str): + dtype = [dtype] + + if out_buffers is None: + for shp, dt in zip(shape, dtype): + output_placeholders.append(tvm.tir.decl_buffer(shp, dt, name)) + body = fcompute(input_placeholders, output_placeholders) + if isinstance(body, tvm.tir.PrimExpr): + body = tvm.tir.Evaluate(body) + + op = _ffi_api.ExternOp(name, tag, attrs, + inputs, input_placeholders, + output_placeholders, body) + res = [op.output(i) for i in range(len(output_placeholders))] + return res[0] if len(res) == 1 else res + + +def var(name="tindex", dtype="int32"): + """Create a new variable with specified name and dtype + + Parameters + ---------- + name : str + The name + + dtype : str + The data type + + Returns + ------- + var : Var + The result symbolic variable. + """ + return tvm.tir.Var(name, dtype) + + +def size_var(name="size", dtype="int32"): + """Create a new variable represents a tensor shape size, which is non-negative. + + Parameters + ---------- + name : str + The name + + dtype : str + The data type + + Returns + ------- + var : SizeVar + The result symbolic shape variable. + """ + return tvm.tir.SizeVar(name, dtype) + + +def thread_axis(dom=None, tag="", name=""): + """Create a new IterVar to represent thread index. + + Parameters + ---------- + dom : Range or str + The domain of iteration + When str is passed, dom is set to None and str is used as tag + + tag : str, optional + The thread tag + + name : str, optional + The name of the var. + + Returns + ------- + axis : IterVar + The thread itervar. + """ + if isinstance(dom, string_types): + tag, dom = dom, None + if not tag: + raise ValueError("tag must be given as Positional or keyword argument") + name = name if name else tag + return tvm.tir.IterVar(dom, name, 1, tag) + + +def reduce_axis(dom, name="rv"): + """Create a new IterVar for reduction. + + Parameters + ---------- + dom : Range + The domain of iteration. + + name : str + The name of the variable. + + Returns + ------- + axis : IterVar + An iteration variable representing the value. + """ + return tvm.tir.IterVar(dom, name, 2) diff --git a/python/tvm/schedule.py b/python/tvm/te/schedule.py similarity index 86% rename from python/tvm/schedule.py rename to python/tvm/te/schedule.py index cd48d495a74a..d160f78d7c89 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/te/schedule.py @@ -21,10 +21,10 @@ from tvm.runtime import Object, convert from tvm.ir import container as _container -from tvm.tir import expr as _expr, Buffer +from tvm.tir import IterVar, Buffer -from . import _api_internal from . import tensor as _tensor +from . 
import _ffi_api @tvm._ffi.register_object @@ -42,31 +42,6 @@ class Singleton(Object): """Singleton axis.""" -@tvm._ffi.register_object -class IterVar(Object, _expr.ExprOp): - """Represent iteration variable. - - IterVar is normally created by Operation, to represent - axis iterations in the computation. - It can also created by schedule primitives like :any:`tvm.schedule.Stage.split`. - - See Also - -------- - tvm.thread_axis: Create thread axis IterVar. - tvm.reduce_axis: Create reduce axis IterVar. - """ - DataPar = 0 - ThreadIndex = 1 - CommReduce = 2 - Ordered = 3 - DimInfo = 4 - Unrolled = 5 - Vectorized = 6 - Parallelized = 7 - Tensorized = 8 - -_tensor.iter_var_cls = IterVar - def create_schedule(ops): """Create a schedule for list of ops @@ -82,7 +57,7 @@ def create_schedule(ops): """ if not isinstance(ops, (list, _container.Array)): ops = [ops] - return _api_internal._CreateSchedule(ops) + return _ffi_api.CreateSchedule(ops) @tvm._ffi.register_object @@ -108,7 +83,7 @@ def normalize(self): sch : Schedule The normalized schedule. """ - return _api_internal._ScheduleNormalize(self) + return _ffi_api.ScheduleNormalize(self) def create_group(self, outputs, inputs, include_inputs=False): """Create stage group by giving output and input boundary. @@ -137,7 +112,7 @@ def create_group(self, outputs, inputs, include_inputs=False): outputs = [outputs] if isinstance(inputs, _tensor.Tensor): inputs = [inputs] - return _api_internal._ScheduleCreateGroup( + return _ffi_api.ScheduleCreateGroup( self, outputs, inputs, include_inputs) def cache_read(self, tensor, scope, readers): @@ -164,7 +139,7 @@ def cache_read(self, tensor, scope, readers): if isinstance(readers, (_tensor.Tensor, _tensor.Operation)): readers = [readers] readers = [t.op if isinstance(t, _tensor.Tensor) else t for t in readers] - return _api_internal._ScheduleCacheRead(self, tensor, scope, readers) + return _ffi_api.ScheduleCacheRead(self, tensor, scope, readers) def cache_write(self, tensor, scope): """Create a cache write of original tensor, before storing into tensor. @@ -192,7 +167,7 @@ def cache_write(self, tensor, scope): cache : Tensor The created cache tensor. """ - return _api_internal._ScheduleCacheWrite(self, tensor, scope) + return _ffi_api.ScheduleCacheWrite(self, tensor, scope) def rfactor(self, tensor, axis, factor_axis=0): """ Factor a reduction axis in tensor's schedule to be an explicit axis. @@ -215,7 +190,7 @@ def rfactor(self, tensor, axis, factor_axis=0): tfactor : Tensor or Array of Tensor The created factored tensor. """ - factored = _api_internal._ScheduleRFactor(self, tensor, axis, factor_axis) + factored = _ffi_api.ScheduleRFactor(self, tensor, axis, factor_axis) return factored[0] if len(factored) == 1 else factored @@ -247,11 +222,11 @@ def split(self, parent, factor=None, nparts=None): if nparts is not None: if factor is not None: raise ValueError("Do not need to provide both outer and nparts") - outer, inner = _api_internal._StageSplitByNParts(self, parent, nparts) + outer, inner = _ffi_api.StageSplitByNParts(self, parent, nparts) else: if factor is None: raise ValueError("Either nparts or factor need to be provided") - outer, inner = _api_internal._StageSplitByFactor(self, parent, factor) + outer, inner = _ffi_api.StageSplitByFactor(self, parent, factor) return outer, inner def fuse(self, *args): @@ -270,7 +245,7 @@ def fuse(self, *args): fused : IterVar The fused variable of iteration. 
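`split` and `fuse` above are complementary primitives: one breaks an axis into an (outer, inner) pair, the other collapses consecutive axes back into one. A short sketch, assuming the `te` exports from earlier in this patch:

.. code-block:: python

    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)

    # factor=32 fixes the inner extent; nparts would fix the outer
    # extent instead (the two options are mutually exclusive).
    xo, xi = s[B].split(B.op.axis[0], factor=32)
    fused = s[B].fuse(xo, xi)
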
""" - fused = _api_internal._StageFuse(self, args) + fused = _ffi_api.StageFuse(self, args) return fused def set_scope(self, scope): @@ -281,7 +256,7 @@ def set_scope(self, scope): scope : str The thread scope of this stage """ - return _api_internal._StageSetScope(self, scope) + return _ffi_api.StageSetScope(self, scope) def bind(self, ivar, thread_ivar): """Bind ivar to thread index thread_ivar @@ -294,7 +269,7 @@ def bind(self, ivar, thread_ivar): thread_ivar : IterVar The thread to be binded. """ - _api_internal._StageBind(self, ivar, thread_ivar) + _ffi_api.StageBind(self, ivar, thread_ivar) def env_threads(self, threads): """Mark threads to be launched at the outer scope of composed op. @@ -306,7 +281,7 @@ def env_threads(self, threads): """ if isinstance(threads, IterVar): threads = [threads] - _api_internal._StageEnvThreads(self, threads) + _ffi_api.StageEnvThreads(self, threads) def set_store_predicate(self, predicate): """Set predicate under which store to the array can be performed. @@ -319,7 +294,7 @@ def set_store_predicate(self, predicate): predicate : Expr The guard condition fo store. """ - _api_internal._StageSetStorePredicate(self, predicate) + _ffi_api.StageSetStorePredicate(self, predicate) def compute_at(self, parent, scope): """Attach the stage at parent's scope @@ -332,7 +307,7 @@ def compute_at(self, parent, scope): scope : IterVar The loop scope t be attached to. """ - _api_internal._StageComputeAt(self, parent, scope) + _ffi_api.StageComputeAt(self, parent, scope) def compute_inline(self): """Mark stage as inline @@ -342,7 +317,7 @@ def compute_inline(self): parent : Stage The parent stage """ - _api_internal._StageComputeInline(self) + _ffi_api.StageComputeInline(self) def compute_root(self): """Attach the stage at parent, and mark it as root @@ -352,7 +327,7 @@ def compute_root(self): parent : Stage The parent stage """ - _api_internal._StageComputeRoot(self) + _ffi_api.StageComputeRoot(self) def reorder(self, *args): """reorder the arguments in the specified order. @@ -362,7 +337,7 @@ def reorder(self, *args): args : list of IterVar The order to be ordered """ - _api_internal._StageReorder(self, args) + _ffi_api.StageReorder(self, args) def tile(self, x_parent, y_parent, x_factor, y_factor): """ Perform tiling on two dimensions @@ -392,7 +367,7 @@ def tile(self, x_parent, y_parent, x_factor, y_factor): p_y_inner : IterVar Inner axis of y dimension """ - x_outer, y_outer, x_inner, y_inner = _api_internal._StageTile( + x_outer, y_outer, x_inner, y_inner = _ffi_api.StageTile( self, x_parent, y_parent, x_factor, y_factor) return x_outer, y_outer, x_inner, y_inner @@ -404,7 +379,7 @@ def vectorize(self, var): var : IterVar The iteration to be vectorize """ - _api_internal._StageVectorize(self, var) + _ffi_api.StageVectorize(self, var) def tensorize(self, var, tensor_intrin): """Tensorize the computation enclosed by var with tensor_intrin @@ -417,7 +392,7 @@ def tensorize(self, var, tensor_intrin): tensor_intrin : TensorIntrin The tensor intrinsic used for computation. """ - _api_internal._StageTensorize(self, var, tensor_intrin) + _ffi_api.StageTensorize(self, var, tensor_intrin) def unroll(self, var): """Unroll the iteration. @@ -427,7 +402,7 @@ def unroll(self, var): var : IterVar The iteration to be unrolled. """ - _api_internal._StageUnroll(self, var) + _ffi_api.StageUnroll(self, var) def parallel(self, var): """Parallelize the iteration. @@ -437,7 +412,7 @@ def parallel(self, var): var : IterVar The iteration to be parallelized. 
""" - _api_internal._StageParallel(self, var) + _ffi_api.StageParallel(self, var) def pragma(self, var, pragma_type, pragma_value=None): """Annotate the iteration with pragma @@ -489,7 +464,7 @@ def pragma(self, var, pragma_type, pragma_value=None): """ if isinstance(pragma_value, string_types): pragma_value = convert(pragma_value) - _api_internal._StagePragma(self, var, pragma_type, pragma_value) + _ffi_api.StagePragma(self, var, pragma_type, pragma_value) def prefetch(self, tensor, var, offset): """Prefetch the specified variable @@ -503,7 +478,7 @@ def prefetch(self, tensor, var, offset): offset : Expr The number of iterations to be prefetched before actual execution """ - _api_internal._StagePrefetch(self, tensor, var, offset) + _ffi_api.StagePrefetch(self, tensor, var, offset) def storage_align(self, axis, factor, offset): """Set alignment requirement for specific axis @@ -523,7 +498,7 @@ def storage_align(self, axis, factor, offset): offset : int The offset in the alignment specification. """ - _api_internal._StageStorageAlign(self, axis, factor, offset) + _ffi_api.StageStorageAlign(self, axis, factor, offset) def double_buffer(self): """Compute the current stage via double buffering. @@ -532,13 +507,14 @@ def double_buffer(self): This will double the storage cost of the current stage. Can be useful to hide load latency. """ - _api_internal._StageDoubleBuffer(self) + _ffi_api.StageDoubleBuffer(self) def opengl(self): """The special OpenGL schedule Maps each output element to a pixel. """ - _api_internal._StageOpenGL(self) + _ffi_api.StageOpenGL(self) + -tvm._ffi._init_api("tvm.schedule") +tvm._ffi._init_api("schedule", __name__) diff --git a/python/tvm/tag.py b/python/tvm/te/tag.py similarity index 98% rename from python/tvm/tag.py rename to python/tvm/te/tag.py index d6e48e3a4c73..189076d03cc3 100644 --- a/python/tvm/tag.py +++ b/python/tvm/te/tag.py @@ -16,7 +16,7 @@ # under the License. """Tag class for TVM operators.""" import warnings -from ._ffi.base import decorate +from tvm._ffi.base import decorate class TagScope(object): """Tag scope object to set tag for operators, working as context diff --git a/python/tvm/tensor.py b/python/tvm/te/tensor.py similarity index 93% rename from python/tvm/tensor.py rename to python/tvm/te/tensor.py index 00bd9d146b36..fcbb68f33f22 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/te/tensor.py @@ -14,15 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Tensor and Operation class for computation declaration.""" +"""Tensor class for computation declaration.""" # pylint: disable=invalid-name import tvm._ffi from tvm.runtime import Object, ObjectGeneric, convert_to_object from tvm.tir import expr as _expr -from . import _api_internal - +from . 
import _ffi_api class TensorSlice(ObjectGeneric, _expr.ExprOp): """Auxiliary data structure for enable slicing syntax from tensor.""" @@ -52,9 +51,6 @@ class TensorIntrinCall(Object): """Intermediate structure for calling a tensor intrinsic.""" -itervar_cls = None - - @tvm._ffi.register_object class Tensor(Object, _expr.ExprOp): """Tensor object, to construct, see function.Tensor""" @@ -68,7 +64,7 @@ def __call__(self, *indices): for x in indices: if isinstance(x, _expr.PrimExpr): args.append(x) - elif isinstance(x, iter_var_cls): + elif isinstance(x, _expr.IterVar): args.append(x.var) else: raise ValueError("The indices must be expression") @@ -81,7 +77,7 @@ def __getitem__(self, indices): return TensorSlice(self, indices) def __hash__(self): - return _api_internal._TensorHash(self) + return _ffi_api.TensorHash(self) def __eq__(self, other): if not isinstance(other, Tensor): @@ -92,7 +88,7 @@ def __eq__(self, other): raise ValueError("Equal == comparison among rank-0 tensor is ambiguous, " "use Tensor.equal for content expression equvalence, " "use Tensor.same_as for exact reference comparison") - return _api_internal._TensorEqual(self, other) + return _ffi_api.TensorEqual(self, other) @property def ndim(self): @@ -143,17 +139,17 @@ def output(self, index): out : Tensor The i-th output. """ - return _api_internal._OpGetOutput(self, index) + return _ffi_api.OpGetOutput(self, index) @property def num_outputs(self): """Number of outputs from this op.""" - return _api_internal._OpNumOutputs(self) + return _ffi_api.OpNumOutputs(self) @property def input_tensors(self): """List of input tensors to this op.""" - return _api_internal._OpInputTensors(self) + return _ffi_api.OpInputTensors(self) @tvm._ffi.register_object diff --git a/python/tvm/tensor_intrin.py b/python/tvm/te/tensor_intrin.py similarity index 81% rename from python/tvm/tensor_intrin.py rename to python/tvm/te/tensor_intrin.py index 1fd8bee720ba..c5c2afef1c93 100644 --- a/python/tvm/tensor_intrin.py +++ b/python/tvm/te/tensor_intrin.py @@ -16,17 +16,15 @@ # under the License. """Tensor intrinsics""" import tvm._ffi +import tvm.tir -from tvm.runtime import Object +from tvm.runtime import Object, convert from tvm.ir import Range -from tvm.tir import expr as _expr -from tvm.tir import stmt as _stmt +from tvm.target import BuildConfig +from .tensor import PlaceholderOp -from . import _api_internal -from . import api as _api from . import tensor as _tensor -from . import schedule as _schedule -from .build_module import current_build_config +from . import _ffi_api def _get_region(tslice): @@ -34,15 +32,16 @@ def _get_region(tslice): for idx in tslice.indices: if isinstance(idx, slice): assert idx.step is None - region.append(_api.Range(idx.start, idx.stop)) + region.append(Range(idx.start, idx.stop)) else: - if isinstance(idx, _schedule.IterVar): + if isinstance(idx, tvm.tir.IterVar): begin = idx.var else: begin = idx region.append(Range.make_by_min_extent(begin, 1)) return region + @tvm._ffi.register_object class TensorIntrin(Object): """Tensor intrinsic functions for certain computation. 
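The `Tensor` protocol above now routes hashing, equality, and the operation/output queries through the `te` FFI instead of `_api_internal`. A small sketch of the round trip, assuming the `te` namespace from this patch:

.. code-block:: python

    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

    op = B.op
    assert op.num_outputs == 1
    # Tensor.__eq__ on non-rank-0 tensors dispatches to te.TensorEqual.
    assert op.output(0) == B
    assert len(op.input_tensors) == 1
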
@@ -60,10 +59,11 @@ def __call__(self, *args, **kwargs): reduce_axis = kwargs["reduce_axis"] if not isinstance(reduce_axis, (list, tuple)): reduce_axis = [reduce_axis] - reduce_axis = _api.convert(reduce_axis) + reduce_axis = convert(reduce_axis) if scalar_inputs: - scalar_inputs = _api.convert(scalar_inputs) - return _api_internal._TensorIntrinCall(self, tensors, regions, reduce_axis, scalar_inputs) + scalar_inputs = convert(scalar_inputs) + return _ffi_api.TensorIntrinCall(self, tensors, regions, reduce_axis, scalar_inputs) + def decl_tensor_intrin(op, fcompute, @@ -119,15 +119,15 @@ def decl_tensor_intrin(op, binds_list = [] for t in inputs: - if not isinstance(t.op, _tensor.PlaceholderOp): + if not isinstance(t.op, PlaceholderOp): raise ValueError("Do not yet support composition op") - cfg = current_build_config() + cfg = BuildConfig.current() for t in tensors: buf = (binds[t] if t in binds else - _api.decl_buffer(t.shape, t.dtype, t.op.name, - data_alignment=cfg.data_alignment, - offset_factor=cfg.offset_factor)) + tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor)) binds_list.append(buf) if scalar_params: @@ -135,10 +135,10 @@ def decl_tensor_intrin(op, else: body = fcompute(binds_list[:len(inputs)], binds_list[len(inputs):]) scalar_params = [] - if isinstance(body, (_expr.PrimExpr, _stmt.Stmt)): + if isinstance(body, (tvm.tir.PrimExpr, tvm.tir.Stmt)): body = [body] - body = [_stmt.Evaluate(x) if isinstance(x, _expr.PrimExpr) else x for x in body] + body = [tvm.tir.Evaluate(x) if isinstance(x, tvm.tir.PrimExpr) else x for x in body] if len(body) < 3: body += [None] * (3 - len(body)) - return _api_internal._TensorIntrin( + return _ffi_api.TensorIntrin( name, op, inputs, binds_list, scalar_params, *body) diff --git a/python/tvm/testing.py b/python/tvm/testing.py index 20cbe2431e6d..077ac35f69a0 100644 --- a/python/tvm/testing.py +++ b/python/tvm/testing.py @@ -17,6 +17,8 @@ """ TVM testing utilities """ import logging import numpy as np +import tvm._ffi + def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7): """ Version of np.testing.assert_allclose with `atol` and `rtol` fields set @@ -161,3 +163,6 @@ def compare_derivative(j, n_der, grad): logging.info("Numerical grad test wrt '%s' of shape %s passes, " "dist = %f, max_diff = %f, avg_diff = %f", x_name, grad.shape, dist, max_diff, avg_diff) + + +tvm._ffi._init_api("testing", __name__) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 8621540f7223..ab78ca6d6d63 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -23,16 +23,18 @@ from .expr import Add, Sub, Mul, Div, Mod, FloorDiv, FloorMod from .expr import Min, Max, EQ, NE, LT, LE, GT, GE, And, Or, Not from .expr import Select, Load, Ramp, Broadcast, Shuffle, Call, Let +from .expr import IterVar from .stmt import Stmt, LetStmt, AssertStmt, ProducerConsumer, For from .stmt import Store, Provide, Allocate, AttrStmt, Free, Realize, SeqStmt from .stmt import IfThenElse, Evaluate, Prefetch, LoweredFunc, stmt_seq, stmt_list from .op import call_packed, call_pure_intrin, call_intrin, call_pure_extern, call_extern -from .op import call_llvm_intrin, min_value, max_value +from .op import call_llvm_intrin, all, any, min_value, max_value from .op import exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil from .op import trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else from .op import div, indexdiv, indexmod, truncdiv, truncmod, 
floordiv, floormod +from .op import comm_reducer, min, max, sum from . import ir_builder from . import ir_pass diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index 92d6fbe42f17..e36ca2c1dede 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -308,6 +308,57 @@ def __init__(self, name, dtype): _ffi_api.SizeVar, name, dtype) +@tvm._ffi.register_object +class IterVar(Object, ExprOp): + """Represent iteration variable. + + IterVar represents axis iterations in the computation. + + Parameters + ---------- + dom : Range + The domain of the iteration. + + var : Union[Var, str] + The internal variable that is used for iteration. + + iter_type : int + The iteration type. + + thread_tag : str + The thread type tag. + + See Also + -------- + tvm.thread_axis: Create thread axis IterVar. + tvm.reduce_axis: Create reduce axis IterVar. + """ + DataPar = 0 + ThreadIndex = 1 + CommReduce = 2 + Ordered = 3 + DimInfo = 4 + Unrolled = 5 + Vectorized = 6 + Parallelized = 7 + Tensorized = 8 + + def __init__(self, dom, var, iter_type, thread_tag=""): + if dom is not None: + if isinstance(dom, (list, tuple)): + if len(dom) != 2: + raise TypeError("need to be list of ranges") + dom = tvm.ir.Range(dom[0], dom[1]) + + if not isinstance(dom, tvm.ir.Range): + raise TypeError("dom need to be Range") + + name = var if var is not None else "iter" + var = Var(name, dtype="int32") if not isinstance(var, Var) else var + self.__init_handle_by_constructor__( + _ffi_api.IterVar, dom, var, iter_type, thread_tag) + + @tvm._ffi.register_object class CommReducer(Object): """Communicative reduce operator diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index a10fe695c245..66e70c508438 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -14,13 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=redefined-builtin +# pylint: disable=redefined-builtin, invalid-name """Operators used in TIR expression.""" import tvm._ffi from tvm.runtime import convert, const -from tvm.schedule import Buffer +from tvm.ir import Array -from .expr import Call +from .buffer import Buffer +from .expr import Call, Var, CommReducer from . import _ffi_api @@ -196,6 +197,53 @@ def call_llvm_intrin(dtype, name, *args): return call_pure_intrin(dtype, 'llvm_intrin', tvm.const(llvm_id, 'uint32'), *args) +def any(*args): + """Create a new experssion of the union of all conditions in the arguments + + Parameters + ---------- + args : list + List of symbolic boolean expressions + + Returns + ------- + expr: Expr + Expression + """ + if not args: + raise ValueError("Any must take at least 1 argument") + if len(args) == 1: + return args[0] + ret = _ffi_api._OpOr(args[0], args[1]) + for i in range(2, len(args)): + ret = _ffi_api._OpOr(ret, args[i]) + return ret + + +def all(*args): + """Create a new experssion of the intersection of all conditions in the + arguments + + Parameters + ---------- + args : list + List of symbolic boolean expressions + + Returns + ------- + expr: Expr + Expression + """ + if not args: + raise ValueError("Any must take at least 1 argument") + if len(args) == 1: + return args[0] + ret = _ffi_api._OpAnd(args[0], args[1]) + for i in range(2, len(args)): + ret = _ffi_api._OpAnd(ret, args[i]) + return ret + + @tvm._ffi.register_func("tvm.default_trace_action") def _tvm_default_trace_action(*args): print(list(args)) @@ -780,3 +828,137 @@ def floormod(a, b): The result expression. 
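The `all` and `any` helpers added earlier in this hunk simply left-fold `_OpAnd`/`_OpOr` over their argument lists. A sketch using the `tir` exports from this patch:

.. code-block:: python

    from tvm import tir

    i = tir.Var("i", "int32")
    j = tir.Var("j", "int32")

    # tir.all(a, b, c) builds And(And(a, b), c); tir.any is the Or fold.
    cond = tir.all(i >= 0, i < 16, j >= 0, j < 16)
    expr = tir.if_then_else(cond, i + j, 0)
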
""" return _ffi_api._OpFloorMod(a, b) + + +def comm_reducer(fcombine, fidentity, name="reduce"): + """Create a commutative reducer for reduction. + + Parameters + ---------- + fcombine : function(Expr -> Expr -> Expr) + A binary function which takes two Expr as input to return a Expr. + + fidentity : function(str -> Expr) + A function which takes a type string as input to return a const Expr. + + Returns + ------- + reducer : function + A function which creates a reduce expression over axis. + There are two ways to use it: + + 1. accept (expr, axis, where) to produce an Reduce Expr on + specified axis; + 2. simply use it with multiple Exprs. + + Example + ------- + .. code-block:: python + + n = tvm.var("n") + m = tvm.var("m") + mysum = tvm.comm_reducer(lambda x, y: x+y, + lambda t: tvm.const(0, dtype=t), name="mysum") + A = tvm.placeholder((n, m), name="A") + k = tvm.reduce_axis((0, m), name="k") + B = tvm.compute((n,), lambda i: mysum(A[i, k], axis=k), name="B") + """ + def _reduce_directly(*args): + num = len(args) + # process `where` is None + if num == 3 and args[2] is None: + num = 2 + res = args[0] + for i in range(num-1): + res = fcombine(res, args[i+1]) + return res + + def _make_reduce(expr, axis, where=None): + code = fcombine.__code__ + assert fcombine.__code__.co_argcount == 2 + expr = convert(expr) + if isinstance(expr, Array): + size = len(expr) + larr = [] + rarr = [] + dtypes = [] + for i in range(size): + dtype = expr[i].dtype + dtypes.append(dtype) + lname = code.co_varnames[0] + "_" + str(i) + larr.append(Var(lname, dtype)) + rname = code.co_varnames[1] + "_" + str(i) + rarr.append(Var(rname, dtype)) + lhs = convert(larr) + rhs = convert(rarr) + result = fcombine(lhs, rhs) + id_elem = fidentity(*dtypes) + else: + assert isinstance(expr, tvm.ir.PrimExpr) + size = 1 + dtype = expr.dtype + lvar = Var(code.co_varnames[0], dtype) + rvar = Var(code.co_varnames[1], dtype) + result = [fcombine(lvar, rvar)] + id_elem = [fidentity(dtype)] + lhs = convert([lvar]) + rhs = convert([rvar]) + expr = convert([expr]) + result = convert(result) + id_elem = convert(id_elem) + combiner = CommReducer(lhs, rhs, result, id_elem) + axis = convert(axis if isinstance(axis, (list, tuple)) else [axis]) + if where is None: + where = convert(True) + outputs = tuple(tvm.tir.Reduce(combiner, expr, axis, where, i) + for i in range(size)) + return outputs[0] if size == 1 else outputs + + # pylint: disable=keyword-arg-before-vararg + def reducer(expr, axis, where=None, *args): + if isinstance(axis, (tvm.tir.IterVar, list, tuple)): + assert not args + return _make_reduce(expr, axis, where) + if where is None: + assert not args + return _reduce_directly(expr, axis) + return _reduce_directly(expr, axis, where, *args) + + doc_str = """Create a {0} expression over axis. + + Parameters + ---------- + expr : PrimExpr + The source expression. + axis : IterVar + The reduction IterVar axis + where : optional, Expr + Filtering predicate of the reduction. + Returns + ------- + value : PrimExpr + The result value. + + Example + ------- + .. 
code-block:: python + + m = tvm.var("m") + n = tvm.var("n") + A = tvm.placeholder((m, n), name="A") + k = tvm.reduce_axis((0, n), name="k") + + # there are two way to use this {0} reducer: + # mode 1, accept (expr, axis, where) to produce an Reduce Expr + B = tvm.compute((m,), lambda i: tvm.{0}(A[i, k], axis=k), name="B") + + # mode 2, simply use it with multiple Exprs: + {0}_res = tvm.{0}(m, n) + """ + reducer.__doc__ = doc_str.format(name) + return reducer + +# pylint: disable=unnecessary-lambda +sum = comm_reducer(lambda x, y: x+y, lambda t: const(0, dtype=t), name="sum") +min = comm_reducer(lambda x, y: _ffi_api._OpMin(x, y), max_value, name="min") +max = comm_reducer(lambda x, y: _ffi_api._OpMax(x, y), min_value, name="max") diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index 6ac12b12da7e..f996bdbfcbbe 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -64,16 +64,16 @@ TVM_REGISTER_GLOBAL("arith.DeduceBound") TVM_REGISTER_GLOBAL("arith.DomainTouched") .set_body_typed(DomainTouched); -TVM_REGISTER_GLOBAL("_IntervalSetGetMin") +TVM_REGISTER_GLOBAL("arith._IntervalSetGetMin") .set_body_method(&IntSet::min); -TVM_REGISTER_GLOBAL("_IntervalSetGetMax") +TVM_REGISTER_GLOBAL("arith._IntervalSetGetMax") .set_body_method(&IntSet::max); -TVM_REGISTER_GLOBAL("_IntSetIsNothing") +TVM_REGISTER_GLOBAL("arith._IntSetIsNothing") .set_body_method(&IntSet::is_nothing); -TVM_REGISTER_GLOBAL("_IntSetIsEverything") +TVM_REGISTER_GLOBAL("arith._IntSetIsEverything") .set_body_method(&IntSet::is_everything); ConstIntBound MakeConstIntBound(int64_t min_value, int64_t max_value) { diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index d2f2cb69b721..613b82311aed 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -40,115 +40,113 @@ TVM_REGISTER_GLOBAL("tir.min_value") TVM_REGISTER_GLOBAL("tir.max_value") .set_body_typed(max_value); -TVM_REGISTER_GLOBAL("Range") +TVM_REGISTER_GLOBAL("ir.Range") .set_body([](TVMArgs args, TVMRetValue* ret) { - if (args.size() == 1) { - *ret = Range(0, args[0]); - } else { - *ret = Range(args[0], args[1]); - } + *ret = Range(args[0], args[1]); }); +namespace tir { +TVM_REGISTER_GLOBAL("tir.IterVar") +.set_body_typed([](Range dom, Var var, int iter_type, std::string thread_tag) { + return IterVarNode::make( + dom, var, + static_cast(iter_type), + thread_tag); +}); +} + namespace te { -TVM_REGISTER_GLOBAL("_Tensor") +TVM_REGISTER_GLOBAL("te.Tensor") .set_body_typed(TensorNode::make); -TVM_REGISTER_GLOBAL("_TensorIntrin") +TVM_REGISTER_GLOBAL("te.TensorIntrin") .set_body_typed(TensorIntrinNode::make); -TVM_REGISTER_GLOBAL("_TensorIntrinCall") +TVM_REGISTER_GLOBAL("te.TensorIntrinCall") .set_body_typed(TensorIntrinCallNode::make); -TVM_REGISTER_GLOBAL("_TensorEqual") +TVM_REGISTER_GLOBAL("te.TensorEqual") .set_body_method(&Tensor::operator==); -TVM_REGISTER_GLOBAL("_TensorHash") +TVM_REGISTER_GLOBAL("te.TensorHash") .set_body_typed([](Tensor tensor) -> int64_t { return static_cast(std::hash()(tensor)); }); -TVM_REGISTER_GLOBAL("_Placeholder") +TVM_REGISTER_GLOBAL("te.Placeholder") .set_body_typed([](Array shape, DataType dtype, std::string name) { return placeholder(shape, dtype, name); }); -TVM_REGISTER_GLOBAL("_ComputeOp") +TVM_REGISTER_GLOBAL("te.ComputeOp") .set_body_typed(ComputeOpNode::make); -TVM_REGISTER_GLOBAL("_ScanOp") +TVM_REGISTER_GLOBAL("te.ScanOp") .set_body_typed(ScanOpNode::make); -TVM_REGISTER_GLOBAL("_TensorComputeOp") +TVM_REGISTER_GLOBAL("te.TensorComputeOp") .set_body_typed(TensorComputeOpNode::make); 
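The C++ renames in this hunk move the bare underscore-prefixed globals into per-namespace keys ("te.", "ir.", "tir."); on the Python side `_init_api` strips the prefix, which is why the new `te/_ffi_api.py` is a three-line file. A sketch of resolving one renamed entry point directly, assuming `tvm._ffi.get_global_func` keeps its current signature:

.. code-block:: python

    import tvm

    # Before this patch the schedule constructor was registered as
    # "_CreateSchedule"; it now lives under the namespaced key.
    f = tvm._ffi.get_global_func("te.CreateSchedule")

    # tvm._ffi._init_api("te", __name__) strips the "te." prefix, so
    # the same packed function also surfaces as
    # tvm.te._ffi_api.CreateSchedule.
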
-TVM_REGISTER_GLOBAL("_ExternOp") +TVM_REGISTER_GLOBAL("te.ExternOp") .set_body_typed(ExternOpNode::make); -TVM_REGISTER_GLOBAL("_HybridOp") +TVM_REGISTER_GLOBAL("te.HybridOp") .set_body_typed(HybridOpNode::make); -TVM_REGISTER_GLOBAL("_OpGetOutput") +TVM_REGISTER_GLOBAL("te.OpGetOutput") .set_body_typed([](Operation op, int64_t output) { return op.output(static_cast(output)); }); -TVM_REGISTER_GLOBAL("_OpNumOutputs") +TVM_REGISTER_GLOBAL("te.OpNumOutputs") .set_body_method(&OperationNode::num_outputs); -TVM_REGISTER_GLOBAL("_OpInputTensors") +TVM_REGISTER_GLOBAL("te.OpInputTensors") .set_body_method(&OperationNode::InputTensors); -TVM_REGISTER_GLOBAL("_IterVar") -.set_body_typed([](Range dom, Var var, int iter_type, std::string thread_tag) { - return IterVarNode::make( - dom, var, - static_cast(iter_type), - thread_tag); -}); - -TVM_REGISTER_GLOBAL("_CreateSchedule") +TVM_REGISTER_GLOBAL("te.CreateSchedule") .set_body_typed(create_schedule); -TVM_REGISTER_GLOBAL("_StageSetScope") +TVM_REGISTER_GLOBAL("te.StageSetScope") .set_body_method(&Stage::set_scope); -TVM_REGISTER_GLOBAL("_StageBind") +TVM_REGISTER_GLOBAL("te.StageBind") .set_body_method(&Stage::bind); -TVM_REGISTER_GLOBAL("_StageSplitByFactor") +TVM_REGISTER_GLOBAL("te.StageSplitByFactor") .set_body_typed([](Stage stage, IterVar parent, PrimExpr factor) { IterVar outer, inner; stage.split(parent, factor, &outer, &inner); return Array({outer, inner}); }); -TVM_REGISTER_GLOBAL("_StageSplitByNParts") +TVM_REGISTER_GLOBAL("te.StageSplitByNParts") .set_body_typed([](Stage stage, IterVar parent, PrimExpr nparts) { IterVar outer, inner; stage.split_by_nparts(parent, nparts, &outer, &inner); return Array({outer, inner}); }); -TVM_REGISTER_GLOBAL("_StageFuse") +TVM_REGISTER_GLOBAL("te.StageFuse") .set_body_typed([](Stage stage, Array axes) { IterVar fused; stage.fuse(axes, &fused); return fused; }); -TVM_REGISTER_GLOBAL("_StageComputeAt") +TVM_REGISTER_GLOBAL("te.StageComputeAt") .set_body_method(&Stage::compute_at); -TVM_REGISTER_GLOBAL("_StageComputeInline") +TVM_REGISTER_GLOBAL("te.StageComputeInline") .set_body_method(&Stage::compute_inline); -TVM_REGISTER_GLOBAL("_StageComputeRoot") +TVM_REGISTER_GLOBAL("te.StageComputeRoot") .set_body_method(&Stage::compute_root); -TVM_REGISTER_GLOBAL("_StageReorder") +TVM_REGISTER_GLOBAL("te.StageReorder") .set_body_method(&Stage::reorder); -TVM_REGISTER_GLOBAL("_StageTile") +TVM_REGISTER_GLOBAL("te.StageTile") .set_body_typed([]( Stage stage, IterVar x_parent, IterVar y_parent, @@ -162,49 +160,49 @@ TVM_REGISTER_GLOBAL("_StageTile") return Array({x_outer, y_outer, x_inner, y_inner}); }); -TVM_REGISTER_GLOBAL("_StageEnvThreads") +TVM_REGISTER_GLOBAL("te.StageEnvThreads") .set_body_method(&Stage::env_threads); -TVM_REGISTER_GLOBAL("_StageSetStorePredicate") +TVM_REGISTER_GLOBAL("te.StageSetStorePredicate") .set_body_method(&Stage::set_store_predicate); -TVM_REGISTER_GLOBAL("_StageUnroll") +TVM_REGISTER_GLOBAL("te.StageUnroll") .set_body_method(&Stage::unroll); -TVM_REGISTER_GLOBAL("_StageVectorize") +TVM_REGISTER_GLOBAL("te.StageVectorize") .set_body_method(&Stage::vectorize); -TVM_REGISTER_GLOBAL("_StageTensorize") +TVM_REGISTER_GLOBAL("te.StageTensorize") .set_body_method(&Stage::tensorize); -TVM_REGISTER_GLOBAL("_StageParallel") +TVM_REGISTER_GLOBAL("te.StageParallel") .set_body_method(&Stage::parallel); -TVM_REGISTER_GLOBAL("_StagePragma") +TVM_REGISTER_GLOBAL("te.StagePragma") .set_body_method(&Stage::pragma); -TVM_REGISTER_GLOBAL("_StagePrefetch") +TVM_REGISTER_GLOBAL("te.StagePrefetch") 
.set_body_method(&Stage::prefetch); -TVM_REGISTER_GLOBAL("_StageStorageAlign") +TVM_REGISTER_GLOBAL("te.StageStorageAlign") .set_body_method(&Stage::storage_align); -TVM_REGISTER_GLOBAL("_StageDoubleBuffer") +TVM_REGISTER_GLOBAL("te.StageDoubleBuffer") .set_body_method(&Stage::double_buffer); -TVM_REGISTER_GLOBAL("_StageOpenGL") +TVM_REGISTER_GLOBAL("te.StageOpenGL") .set_body_method(&Stage::opengl); -TVM_REGISTER_GLOBAL("_ScheduleNormalize") +TVM_REGISTER_GLOBAL("te.ScheduleNormalize") .set_body_method(&Schedule::normalize); -TVM_REGISTER_GLOBAL("_ScheduleCreateGroup") +TVM_REGISTER_GLOBAL("te.ScheduleCreateGroup") .set_body_method(&Schedule::create_group); -TVM_REGISTER_GLOBAL("_ScheduleCacheRead") +TVM_REGISTER_GLOBAL("te.ScheduleCacheRead") .set_body_method(&Schedule::cache_read); -TVM_REGISTER_GLOBAL("_ScheduleCacheWrite") +TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite") .set_body([](TVMArgs args, TVMRetValue* ret) { if (args[1].IsObjectRef()) { *ret = args[0].operator Schedule() @@ -215,11 +213,11 @@ TVM_REGISTER_GLOBAL("_ScheduleCacheWrite") } }); -TVM_REGISTER_GLOBAL("_ScheduleRFactor") +TVM_REGISTER_GLOBAL("te.ScheduleRFactor") .set_body_method(&Schedule::rfactor); } // namespace te -TVM_REGISTER_GLOBAL("_CommReducerCombine") +TVM_REGISTER_GLOBAL("te.CommReducerCombine") .set_body_method(&tir::CommReducerNode::operator()); } // namespace tvm diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc index 4a57376fbcac..a53c6e99a999 100644 --- a/src/api/api_schedule.cc +++ b/src/api/api_schedule.cc @@ -47,9 +47,9 @@ TVM_REGISTER_GLOBAL("schedule.ScheduleOps") *ret = ScheduleOps(args[0], args[1], args[2]); }); -#define REGISTER_SCHEDULE_PASS(PassName) \ +#define REGISTER_SCHEDULE_PASS(PassName) \ TVM_REGISTER_GLOBAL("schedule."#PassName) \ - .set_body_typed(PassName); \ + .set_body_typed(PassName); \ REGISTER_SCHEDULE_PASS(InferBound); diff --git a/src/api/api_test.cc b/src/api/api_test.cc index 9fbe04e96d9e..2a1e60539bdf 100644 --- a/src/api/api_test.cc +++ b/src/api/api_test.cc @@ -54,11 +54,11 @@ struct TestAttrs : public AttrsNode { TVM_REGISTER_NODE_TYPE(TestAttrs); -TVM_REGISTER_GLOBAL("_nop") +TVM_REGISTER_GLOBAL("testing.nop") .set_body([](TVMArgs args, TVMRetValue *ret) { }); -TVM_REGISTER_GLOBAL("_test_wrap_callback") +TVM_REGISTER_GLOBAL("testing.test_wrap_callback") .set_body([](TVMArgs args, TVMRetValue *ret) { PackedFunc pf = args[0]; *ret = runtime::TypedPackedFunc([pf](){ @@ -66,7 +66,7 @@ TVM_REGISTER_GLOBAL("_test_wrap_callback") }); }); -TVM_REGISTER_GLOBAL("_test_raise_error_callback") +TVM_REGISTER_GLOBAL("testing.test_raise_error_callback") .set_body([](TVMArgs args, TVMRetValue *ret) { std::string msg = args[0]; *ret = runtime::TypedPackedFunc([msg](){ @@ -74,7 +74,7 @@ TVM_REGISTER_GLOBAL("_test_raise_error_callback") }); }); -TVM_REGISTER_GLOBAL("_test_check_eq_callback") +TVM_REGISTER_GLOBAL("testing.test_check_eq_callback") .set_body([](TVMArgs args, TVMRetValue *ret) { std::string msg = args[0]; *ret = runtime::TypedPackedFunc([msg](int x, int y){ @@ -82,7 +82,7 @@ TVM_REGISTER_GLOBAL("_test_check_eq_callback") }); }); -TVM_REGISTER_GLOBAL("_context_test") +TVM_REGISTER_GLOBAL("testing.context_test") .set_body([](TVMArgs args, TVMRetValue *ret) { DLContext ctx = args[0]; int dtype = args[1]; @@ -103,11 +103,11 @@ void ErrorTest(int x, int y) { } } -TVM_REGISTER_GLOBAL("_ErrorTest") +TVM_REGISTER_GLOBAL("testing.ErrorTest") .set_body_typed(ErrorTest); // internal function used for debug and testing purposes -TVM_REGISTER_GLOBAL("_ndarray_use_count") 
+TVM_REGISTER_GLOBAL("testing.ndarray_use_count") .set_body([](TVMArgs args, TVMRetValue *ret) { runtime::NDArray nd = args[0]; // substract the current one diff --git a/src/target/target.cc b/src/target/target.cc index 05253a5a2bc9..ab2077db584c 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -403,7 +403,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << ")"; }); -TVM_REGISTER_GLOBAL("_GetCurrentBuildConfig") +TVM_REGISTER_GLOBAL("target.GetCurrentBuildConfig") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = BuildConfig::Current(); }); @@ -418,13 +418,13 @@ class BuildConfig::Internal { } }; -TVM_REGISTER_GLOBAL("_EnterBuildConfigScope") +TVM_REGISTER_GLOBAL("target.EnterBuildConfigScope") .set_body_typed(BuildConfig::Internal::EnterScope); -TVM_REGISTER_GLOBAL("_ExitBuildConfigScope") +TVM_REGISTER_GLOBAL("target.ExitBuildConfigScope") .set_body_typed(BuildConfig::Internal::ExitScope); -TVM_REGISTER_GLOBAL("_BuildConfigSetAddLowerPass") +TVM_REGISTER_GLOBAL("target.BuildConfigSetAddLowerPass") .set_body([](TVMArgs args, TVMRetValue* ret) { BuildConfig cfg = args[0]; std::vector< std::pair > add_lower_pass; @@ -437,7 +437,7 @@ TVM_REGISTER_GLOBAL("_BuildConfigSetAddLowerPass") cfg->add_lower_pass = add_lower_pass; }); -TVM_REGISTER_GLOBAL("_BuildConfigGetAddLowerPassInfo") +TVM_REGISTER_GLOBAL("target.BuildConfigGetAddLowerPassInfo") .set_body([](TVMArgs args, TVMRetValue* ret) { // Return one of the following: // * Size of add_lower_pass if num_args == 1 diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py index 9700bbce22b9..7568814fbfe6 100644 --- a/tests/python/unittest/test_lang_buffer.py +++ b/tests/python/unittest/test_lang_buffer.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm -from tvm.schedule import Buffer +from tvm.tir import Buffer import numpy as np def test_buffer(): @@ -25,7 +25,7 @@ def test_buffer(): Ab = tvm.decl_buffer((m, n), tvm.float32) Bb = tvm.decl_buffer((n, l), tvm.float32) - assert isinstance(Ab, tvm.schedule.Buffer) + assert isinstance(Ab, tvm.tir.Buffer) assert Ab.dtype == tvm.float32 assert tuple(Ab.shape) == (m, n) diff --git a/tests/python/unittest/test_lang_constructor.py b/tests/python/unittest/test_lang_constructor.py index 4ce7e872dc36..797a04fa4574 100644 --- a/tests/python/unittest/test_lang_constructor.py +++ b/tests/python/unittest/test_lang_constructor.py @@ -22,8 +22,8 @@ def test_expr_constructor(): assert x.name == "xx" x = tvm.tir.Reduce(None, [1], - [tvm.api._IterVar((0, 1), "x", 2)], - None, 0) + [tvm.tir.IterVar((0, 1), "x", 2)], + None, 0) assert isinstance(x, tvm.tir.Reduce) assert x.combiner == None assert x.value_index == 0 diff --git a/tests/python/unittest/test_runtime_error.py b/tests/python/unittest/test_runtime_error.py index 10a76019de47..d1a2d983ff25 100644 --- a/tests/python/unittest/test_runtime_error.py +++ b/tests/python/unittest/test_runtime_error.py @@ -16,9 +16,10 @@ # under the License. 
"""Test runtime error handling""" import tvm +import tvm.testing def test_op_translation(): - ferror = tvm._api_internal._test_raise_error_callback( + ferror = tvm.testing.test_raise_error_callback( "OpNotImplemented: myop") try: ferror() @@ -28,7 +29,7 @@ def test_op_translation(): assert isinstance(e, NotImplementedError) assert msg.find("api_test.cc") != -1 - fchk_eq = tvm._api_internal._test_check_eq_callback( + fchk_eq = tvm.testing.test_check_eq_callback( "InternalError: myop") try: fchk_eq(0, 1) @@ -38,7 +39,7 @@ def test_op_translation(): assert msg.find("api_test.cc") != -1 try: - tvm._api_internal._ErrorTest(0, 1) + tvm.testing.ErrorTest(0, 1) assert False except ValueError as e: msg = str(e) @@ -48,13 +49,13 @@ def test_op_translation(): def test_deep_callback(): def error_callback(): raise ValueError("callback error") - wrap1 = tvm._api_internal._test_wrap_callback(error_callback) + wrap1 = tvm.testing.test_wrap_callback(error_callback) def flevel2(): wrap1() - wrap2 = tvm._api_internal._test_wrap_callback(flevel2) + wrap2 = tvm.testing.test_wrap_callback(flevel2) def flevel3(): wrap2() - wrap3 = tvm._api_internal._test_wrap_callback(flevel3) + wrap3 = tvm.testing.test_wrap_callback(flevel3) try: wrap3() diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py index 2c229bc98f4e..4f7377008c76 100644 --- a/tests/python/unittest/test_runtime_packed_func.py +++ b/tests/python/unittest/test_runtime_packed_func.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +import tvm.testing import numpy as np def test_get_global(): @@ -93,7 +94,7 @@ def test_ctx_func(ctx): x = test_ctx_func(tvm.gpu(7)) assert x == tvm.cpu(0) x = tvm.opencl(10) - x = tvm._api_internal._context_test(x, x.device_type, x.device_id) + x = tvm.testing.context_test(x, x.device_type, x.device_id) assert x == tvm.opencl(10) def test_trace_default_action(): @@ -282,4 +283,3 @@ def check_assign(dtype): test_trace_default_action() test_trace_can_change_traced_value_int() test_trace_can_change_traced_value_float() - diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index ff5f46536d83..75169da9a2ce 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
 import tvm
+import tvm.testing
 import os
 import logging
 import time
@@ -210,7 +211,7 @@ def my_module(name):
         if name == "get_arr":
             return lambda : nd
         elif name == "ref_count":
-            return lambda : tvm._api_internal._ndarray_use_count(nd)
+            return lambda : tvm.testing.ndarray_use_count(nd)
         elif name == "get_elem":
             return lambda idx: nd.asnumpy()[idx]
         elif name == "get_arr_elem":
diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py
index f3683626384f..4a62d354eb41 100644
--- a/vta/python/vta/build_module.py
+++ b/vta/python/vta/build_module.py
@@ -96,7 +96,7 @@ def lower(*args, **kwargs):
     --------
     tvm.lower : The original TVM's lower function
     """
-    cfg = tvm.build_module.current_build_config()
+    cfg = tvm.target.BuildConfig.current()
     if not cfg.add_lower_pass:
         with build_config():
             return tvm.lower(*args, **kwargs)
@@ -113,7 +113,7 @@ def build(*args, **kwargs):
     --------
     tvm.build : The original TVM's build function
     """
-    cfg = tvm.build_module.current_build_config()
+    cfg = tvm.target.BuildConfig.current()
     if not cfg.add_lower_pass:
         with build_config():
             return tvm.build(*args, **kwargs)

From 976c08ad61cca9989331bfa57e83bcf92ed20798 Mon Sep 17 00:00:00 2001
From: pankratz <35379668+dpankratz@users.noreply.github.com>
Date: Mon, 17 Feb 2020 19:48:05 -0700
Subject: [PATCH 13/73] Fixed bugs that occurred when using bitwise operators
 on floating-point expressions, plus crashes when using the <<, >>, and %
 operators. Added regression tests for both types of bug. (#4892)

---
 python/tvm/tir/expr.py                   | 16 ++++++++++++++++
 src/tir/ir/op.cc                         | 10 ++++++++++
 tests/python/unittest/test_lang_basic.py | 22 ++++++++++++++++++++++
 3 files changed, 48 insertions(+)

diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py
index e36ca2c1dede..aeda603e19aa 100644
--- a/python/tvm/tir/expr.py
+++ b/python/tvm/tir/expr.py
@@ -52,6 +52,11 @@ def _dtype_is_int(value):
     return (isinstance(value, ExprOp) and
             DataType(value.dtype).type_code == TypeCode.INT)
 
+def _dtype_is_float(value):
+    if isinstance(value, float):
+        return True
+    return (isinstance(value, ExprOp) and
+            DataType(value.dtype).type_code == TypeCode.FLOAT)
 
 class ExprOp(object):
     """Operator overloading for Expr like expressions."""
@@ -102,6 +107,9 @@ def __rfloordiv__(self, other):
     def __mod__(self, other):
         return _ffi_api._OpFloorMod(self, other)
 
+    def __rmod__(self, other):
+        return _ffi_api._OpFloorMod(other, self)
+
     def __neg__(self):
         neg_one = const(-1, self.dtype)
         return self.__mul__(neg_one)
@@ -109,9 +117,15 @@ def __neg__(self):
     def __lshift__(self, other):
         return _ffi_api.left_shift(self, other)
 
+    def __rlshift__(self, other):
+        return _ffi_api.left_shift(other, self)
+
     def __rshift__(self, other):
         return _ffi_api.right_shift(self, other)
 
+    def __rrshift__(self, other):
+        return _ffi_api.right_shift(other, self)
+
     def __and__(self, other):
         return _ffi_api.bitwise_and(self, other)
 
@@ -131,6 +145,8 @@ def __rxor__(self, other):
         return _ffi_api.bitwise_xor(other, self)
 
     def __invert__(self):
+        if _dtype_is_float(self):
+            raise RuntimeError("Cannot use ~ operator on float type Expr.")
         return _ffi_api.Call(self.dtype, "bitwise_not", [self], Call.PureIntrinsic, None, 0)
 
     def __lt__(self, other):
diff --git a/src/tir/ir/op.cc b/src/tir/ir/op.cc
index d046f5d9df4e..58f8b6b76da8 100644
--- a/src/tir/ir/op.cc
+++ b/src/tir/ir/op.cc
@@ -417,6 +417,8 @@ PrimExpr operator!(PrimExpr a) {
 }
 
 PrimExpr operator>>(PrimExpr a, PrimExpr b) {
+  CHECK(a.dtype().is_int() || a.dtype().is_uint());
+  CHECK(b.dtype().is_int() || b.dtype().is_uint());
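The user-visible effect of the Python-side additions is that reflected operators now resolve for integer expressions while bitwise operators on floats fail fast; a short sketch, grounded in the `test_lang_basic.py` assertions added below:

```python
import tvm

x = tvm.var("x", "int32")
assert str(10 % x) == 'floormod(10, x)'     # resolved via the new __rmod__
assert str(10 << x) == 'shift_left(10, x)'  # resolved via the new __rlshift__

f = tvm.const(1.5, "float32")
try:
    ~f          # now guarded: bitwise-not is undefined for float exprs
    assert False
except RuntimeError:
    pass
```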
BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -430,6 +432,8 @@ PrimExpr operator>>(PrimExpr a, PrimExpr b) { } PrimExpr operator<<(PrimExpr a, PrimExpr b) { + CHECK(a.dtype().is_int() || a.dtype().is_uint()); + CHECK(b.dtype().is_int() || b.dtype().is_uint()); BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -443,6 +447,8 @@ PrimExpr operator<<(PrimExpr a, PrimExpr b) { } PrimExpr operator&(PrimExpr a, PrimExpr b) { + CHECK(a.dtype().is_int() || a.dtype().is_uint()); + CHECK(b.dtype().is_int() || b.dtype().is_uint()); BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -453,6 +459,8 @@ PrimExpr operator&(PrimExpr a, PrimExpr b) { } PrimExpr operator|(PrimExpr a, PrimExpr b) { + CHECK(a.dtype().is_int() || a.dtype().is_uint()); + CHECK(b.dtype().is_int() || b.dtype().is_uint()); BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); @@ -463,6 +471,8 @@ PrimExpr operator|(PrimExpr a, PrimExpr b) { } PrimExpr operator^(PrimExpr a, PrimExpr b) { + CHECK(a.dtype().is_int() || a.dtype().is_uint()); + CHECK(b.dtype().is_int() || b.dtype().is_uint()); BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ const DataType& rtype = a.dtype(); diff --git a/tests/python/unittest/test_lang_basic.py b/tests/python/unittest/test_lang_basic.py index 733992595562..3b1431a54d36 100644 --- a/tests/python/unittest/test_lang_basic.py +++ b/tests/python/unittest/test_lang_basic.py @@ -178,11 +178,32 @@ def test_bitwise(): assert str(10 & x) == 'bitwise_and(10, x)' assert str(10 | x) == 'bitwise_or(10, x)' assert str(10 ^ x) == 'bitwise_xor(10, x)' + assert str(10 >> x) == 'shift_right(10, x)' + assert str(10 << x) == 'shift_left(10, x)' + assert str(10 % x) == 'floormod(10, x)' assert str(~x) == 'bitwise_not(x)' assert(tvm.const(1, "int8x2") >> 1).dtype == "int8x2" assert(x >> tvm.const(1, "int32x2")).dtype == "int32x2" assert(tvm.var("z", "int8x2") << tvm.const(1, "int8x2")).dtype == "int8x2" +def test_float_bitwise(): + t = tvm.const(1.5,dtype='float32') + for test in [lambda lhs, rhs : lhs << rhs, + lambda lhs, rhs : lhs >> rhs, + lambda lhs, rhs : lhs | rhs, + lambda lhs, rhs : lhs ^ rhs, + lambda lhs, rhs : lhs & rhs + ]: + try: + test(t,10.0) + assert False + except tvm.TVMError: + pass + try: + ~t + assert False + except RuntimeError: + pass def test_isnan(): x = tvm.var('x', 'float32') @@ -227,6 +248,7 @@ def test_equality_string_imm(): test_any() test_all() test_bitwise() + test_float_bitwise() test_isnan() test_equality() test_equality_string_imm() From 8310b2526e69d1761a67f6a8566691a0eeb2e652 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 17 Feb 2020 20:56:45 -0800 Subject: [PATCH 14/73] [CI] Update ci docker to add autodocsumm (#4903) --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0230a1a6f905..bb57abb32095 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -45,8 +45,8 @@ // ci_lint = "tvmai/ci-lint:v0.60" -ci_gpu = "tvmai/ci-gpu:v0.60" -ci_cpu = "tvmai/ci-cpu:v0.55" +ci_gpu = "tvmai/ci-gpu:v0.61" +ci_cpu = "tvmai/ci-cpu:v0.60" ci_i386 = "tvmai/ci-i386:v0.52" // tvm libraries From 38d1dd24a005e2b6902eec7fafeb9297eeb7b996 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 17 Feb 2020 21:40:19 -0800 Subject: [PATCH 15/73] [CI] Add autodocsum as dep (#4902) --- docker/install/ubuntu_install_sphinx.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh index bbc60d90e598..2555164e2292 100755 --- a/docker/install/ubuntu_install_sphinx.sh +++ b/docker/install/ubuntu_install_sphinx.sh @@ -20,4 +20,4 @@ set -e set -u set -o pipefail -pip3 install sphinx sphinx-gallery==0.4.0 sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image commonmark>=0.7.3 docutils>=0.11 +pip3 install sphinx sphinx-gallery==0.4.0 autodocsumm sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image commonmark>=0.7.3 docutils>=0.11 From d1e1ac49b37210334e543f6c4cd8813cbe80e26d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 18 Feb 2020 08:14:12 -0800 Subject: [PATCH 16/73] [REFACTOR][PY] Establish tvm.arith (#4904) --- python/tvm/arith/__init__.py | 22 +++++ python/tvm/arith/_ffi_api.py | 21 +++++ python/tvm/{arith.py => arith/analyzer.py} | 72 +++++++---------- python/tvm/arith/bound.py | 39 +++++++++ python/tvm/arith/int_set.py | 80 +++++++++++++++++++ python/tvm/arith/pattern.py | 60 ++++++++++++++ src/api/api_arith.cc | 18 +++-- src/arith/int_set.cc | 2 +- .../unittest/test_arith_deduce_bound.py | 56 ++++++------- .../unittest/test_arith_detect_clip_bound.py | 6 +- .../test_arith_detect_linear_equation.py | 24 +++--- .../unittest/test_arith_domain_touched.py | 11 ++- tests/python/unittest/test_arith_intset.py | 6 +- vta/python/vta/ir_pass.py | 12 +-- 14 files changed, 322 insertions(+), 107 deletions(-) create mode 100644 python/tvm/arith/__init__.py create mode 100644 python/tvm/arith/_ffi_api.py rename python/tvm/{arith.py => arith/analyzer.py} (83%) create mode 100644 python/tvm/arith/bound.py create mode 100644 python/tvm/arith/int_set.py create mode 100644 python/tvm/arith/pattern.py diff --git a/python/tvm/arith/__init__.py b/python/tvm/arith/__init__.py new file mode 100644 index 000000000000..40e977e61d75 --- /dev/null +++ b/python/tvm/arith/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Integer bound analysis, simplification and pattern detection.""" + +from .int_set import IntSet, IntervalSet +from .analyzer import ModularSet, ConstIntBound, Analyzer +from .bound import deduce_bound +from .pattern import detect_linear_equation, detect_clip_bound diff --git a/python/tvm/arith/_ffi_api.py b/python/tvm/arith/_ffi_api.py new file mode 100644 index 000000000000..c551e5651563 --- /dev/null +++ b/python/tvm/arith/_ffi_api.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for tvm.arith""" +import tvm._ffi + + +tvm._ffi._init_api("arith", __name__) diff --git a/python/tvm/arith.py b/python/tvm/arith/analyzer.py similarity index 83% rename from python/tvm/arith.py rename to python/tvm/arith/analyzer.py index b67e99c204ba..382a7e033e75 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith/analyzer.py @@ -17,34 +17,7 @@ """Arithmetic data structure and utility""" import tvm._ffi from tvm.runtime import Object - - -class IntSet(Object): - """Represent a set of integer in one dimension.""" - def is_nothing(self): - """Whether the set represent nothing""" - return _IntSetIsNothing(self) - - def is_everything(self): - """Whether the set represent everything""" - return _IntSetIsEverything(self) - - -@tvm._ffi.register_object("arith.IntervalSet") -class IntervalSet(IntSet): - """Represent set of continuous interval [min_value, max_value] - - Parameters - ---------- - min_value : Expr - The minimum value in the interval. - - max_value : Expr - The maximum value in the interval. - """ - def __init__(self, min_value, max_value): - self.__init_handle_by_constructor__( - _make_IntervalSet, min_value, max_value) +from . import _ffi_api @tvm._ffi.register_object("arith.ModularSet") @@ -52,7 +25,7 @@ class ModularSet(Object): """Represent range of (coeff * x + base) for x in Z """ def __init__(self, coeff, base): self.__init_handle_by_constructor__( - _make_ModularSet, coeff, base) + _ffi_api.ModularSet, coeff, base) @tvm._ffi.register_object("arith.ConstIntBound") @@ -72,7 +45,7 @@ class ConstIntBound(Object): def __init__(self, min_value, max_value): self.__init_handle_by_constructor__( - _make_ConstIntBound, min_value, max_value) + _ffi_api.ConstIntBound, min_value, max_value) class ConstraintScope: @@ -105,11 +78,12 @@ class Analyzer: be used to perform various symbolic integer analysis. """ def __init__(self): - _mod = _CreateAnalyzer() + _mod = _ffi_api.CreateAnalyzer() self._const_int_bound = _mod("const_int_bound") self._const_int_bound_update = _mod("const_int_bound_update") self._bind = _mod("bind") self._modular_set = _mod("modular_set") + self._simplify = _mod("Simplify") self._rewrite_simplify = _mod("rewrite_simplify") self._canonical_simplify = _mod("canonical_simplify") self._int_set = _mod("int_set") @@ -120,7 +94,7 @@ def const_int_bound(self, expr): Parameters ---------- - expr : tvm.Expr + expr : PrimExpr The expression. Returns @@ -135,7 +109,7 @@ def modular_set(self, expr): Parameters ---------- - expr : tvm.Expr + expr : PrimExpr The expression. Returns @@ -145,12 +119,27 @@ def modular_set(self, expr): """ return self._modular_set(expr) + def simplify(self, expr): + """Simplify expression via both rewrite and canonicalization. + + Parameters + ---------- + expr : PrimExpr + The expression. + + Returns + ------- + result : Expr + The result. + """ + return self._simplify(expr) + def rewrite_simplify(self, expr): """Simplify expression via rewriting rules. 
Parameters ---------- - expr : tvm.Expr + expr : PrimExpr The expression. Returns @@ -165,7 +154,7 @@ def canonical_simplify(self, expr): Parameters ---------- - expr : tvm.Expr + expr : PrimExpr The expression. Returns @@ -180,7 +169,7 @@ def int_set(self, expr, dom_map): Parameters ---------- - expr : tvm.Expr + expr : PrimExpr The expression. dom_map : Dict[Var, tvm.arith.IntSet] @@ -198,10 +187,10 @@ def bind(self, var, expr): Parameters ---------- - var : tvm.Var + var : tvm.tir.Var The variable. - expr : tvm.Expr + expr : PrimExpr The expression. """ return self._bind(var, expr) @@ -211,7 +200,7 @@ def constraint_scope(self, constraint): Parameters ---------- - constraint : tvm.Expr + constraint : PrimExpr The constraint expression. returns @@ -240,7 +229,7 @@ def update(self, var, info, override=False): Parameters ---------- - var : tvm.Var + var : tvm.tir.Var The variable. info : tvm.Object @@ -254,6 +243,3 @@ def update(self, var, info, override=False): else: raise TypeError( "Do not know how to handle type {}".format(type(info))) - - -tvm._ffi._init_api("tvm.arith") diff --git a/python/tvm/arith/bound.py b/python/tvm/arith/bound.py new file mode 100644 index 000000000000..6f4b220a378e --- /dev/null +++ b/python/tvm/arith/bound.py @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Bound deduction.""" +from . import _ffi_api + + +def deduce_bound(var, cond, hint_map, relax_map): + """Deduce the bound of the target variable in the cond. + + Parameters + ---------- + var : Var + The target variable to be deduced. + + cond : PrimExpr + The condition + + hint_map : Map[Var, IntSet] + Domain of variables used to help deduction. + + relax_map : Map[Var, IntSet] + The fomain of the variables to be relaxed + using the provided domain. + """ + return _ffi_api.DeduceBound(var, cond, hint_map, relax_map) diff --git a/python/tvm/arith/int_set.py b/python/tvm/arith/int_set.py new file mode 100644 index 000000000000..838e8e5227ca --- /dev/null +++ b/python/tvm/arith/int_set.py @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Integer set.""" +import tvm._ffi +from tvm.runtime import Object +from . import _ffi_api + + +class IntSet(Object): + """Represent a set of integer in one dimension.""" + def is_nothing(self): + """Whether the set represent nothing""" + return _ffi_api.IntSetIsNothing(self) + + def is_everything(self): + """Whether the set represent everything""" + return _ffi_api.IntSetIsEverything(self) + + @staticmethod + def vector(vec): + """Construct an integer set that covers the vector expr + + Parameters + ---------- + vec : PrimExpr + The vector expression. + + Returns + ------- + rset : IntSet + The result set. + """ + return _ffi_api.intset_vector(vec) + + @staticmethod + def single_point(point): + """Construct a point set. + + Parameters + ---------- + point : PrimExpr + The vector expression. + + Returns + ------- + rset : IntSet + The result set. + """ + return _ffi_api.intset_single_point(point) + + +@tvm._ffi.register_object("arith.IntervalSet") +class IntervalSet(IntSet): + """Represent set of continuous interval [min_value, max_value] + + Parameters + ---------- + min_value : PrimExpr + The minimum value in the interval. + + max_value : PrimExpr + The maximum value in the interval. + """ + def __init__(self, min_value, max_value): + self.__init_handle_by_constructor__( + _ffi_api.IntervalSet, min_value, max_value) diff --git a/python/tvm/arith/pattern.py b/python/tvm/arith/pattern.py new file mode 100644 index 000000000000..22810882701e --- /dev/null +++ b/python/tvm/arith/pattern.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Detect common patterns.""" +from . import _ffi_api + + +def detect_linear_equation(expr, var_list): + """Match `expr = sum_{i=0}^{n-1} var[i] * coeff[i] + coeff[n]` + + Where coeff[i] and base are invariant of var[j] for all i and j. + + Parameters + ---------- + expr : PrimExpr + The expression to be matched. + + var_list : List[tvm.tir.Var] + A list of variables. + + Returns + ------- + coeff : List[PrimExpr] + A list of co-efficients if the match is successful. + An empty list if the match failed. + """ + return _ffi_api.DetectLinearEquation(expr, var_list) + + +def detect_clip_bound(expr, var_list): + """ Detect if expression corresponds to clip bound of the vars + + Parameters + ---------- + expr : PrimExpr + The expression to be matched. + + var_list : List[tvm.tir.Var] + A list of variables. + + Returns + ------- + coeff : List[PrimExpr] + `concat([min_value[i], max_value[i]] for i, v in enumerate(var_list))` + An empty list if the match failed. 
+ """ + return _ffi_api.DetectClipBound(expr, var_list) diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index f996bdbfcbbe..3942f6ef0f20 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -64,33 +64,33 @@ TVM_REGISTER_GLOBAL("arith.DeduceBound") TVM_REGISTER_GLOBAL("arith.DomainTouched") .set_body_typed(DomainTouched); -TVM_REGISTER_GLOBAL("arith._IntervalSetGetMin") +TVM_REGISTER_GLOBAL("arith.IntervalSetGetMin") .set_body_method(&IntSet::min); -TVM_REGISTER_GLOBAL("arith._IntervalSetGetMax") +TVM_REGISTER_GLOBAL("arith.IntervalSetGetMax") .set_body_method(&IntSet::max); -TVM_REGISTER_GLOBAL("arith._IntSetIsNothing") +TVM_REGISTER_GLOBAL("arith.IntSetIsNothing") .set_body_method(&IntSet::is_nothing); -TVM_REGISTER_GLOBAL("arith._IntSetIsEverything") +TVM_REGISTER_GLOBAL("arith.IntSetIsEverything") .set_body_method(&IntSet::is_everything); ConstIntBound MakeConstIntBound(int64_t min_value, int64_t max_value) { return ConstIntBound(min_value, max_value); } -TVM_REGISTER_GLOBAL("arith._make_ConstIntBound") +TVM_REGISTER_GLOBAL("arith.ConstIntBound") .set_body_typed(MakeConstIntBound); ModularSet MakeModularSet(int64_t coeff, int64_t base) { return ModularSet(coeff, base); } -TVM_REGISTER_GLOBAL("arith._make_ModularSet") +TVM_REGISTER_GLOBAL("arith.ModularSet") .set_body_typed(MakeModularSet); -TVM_REGISTER_GLOBAL("arith._CreateAnalyzer") +TVM_REGISTER_GLOBAL("arith.CreateAnalyzer") .set_body([](TVMArgs args, TVMRetValue* ret) { using runtime::PackedFunc; using runtime::TypedPackedFunc; @@ -108,6 +108,10 @@ TVM_REGISTER_GLOBAL("arith._CreateAnalyzer") return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { self->const_int_bound.Update(args[0], args[1], args[2]); }); + } else if (name == "Simplify") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->Simplify(args[0]); + }); } else if (name == "rewrite_simplify") { return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { *ret = self->rewrite_simplify(args[0]); diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index 728cca1b5705..adb38799fdf2 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -54,7 +54,7 @@ IntervalSet MakeIntervalSet(PrimExpr min_value, PrimExpr max_value) { return IntervalSet(min_value, max_value); } -TVM_REGISTER_GLOBAL("arith._make_IntervalSet") +TVM_REGISTER_GLOBAL("arith.IntervalSet") .set_body_typed(MakeIntervalSet); diff --git a/tests/python/unittest/test_arith_deduce_bound.py b/tests/python/unittest/test_arith_deduce_bound.py index 787dfe80d536..5e08635cd53f 100644 --- a/tests/python/unittest/test_arith_deduce_bound.py +++ b/tests/python/unittest/test_arith_deduce_bound.py @@ -38,90 +38,90 @@ def test_deduce(): fdiv = tvm.floordiv e0 = (-b)*a+c-d - res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) + res0 = tvm.arith.deduce_bound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = fdiv(d - c, b*-1) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs - res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) + res0 = tvm.arith.deduce_bound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e0 = d*a+c-d - res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) + res0 = tvm.arith.deduce_bound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) ans0 = fdiv(d-c, d) assert_expr_equal(res0.max_value, ans0) # expression containing variable a is on rhs - res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) + res0 = 
tvm.arith.deduce_bound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res0.max_value, ans0) e1 = (a*4+b < c) - res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) + res1 = tvm.arith.deduce_bound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) ans1 = fdiv(c-1-b, 4) assert_expr_equal(res1.max_value, ans1) # expression containing variable a is on rhs e1 = (c > a*4+b) - res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) + res1 = tvm.arith.deduce_bound(a, e1, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res1.max_value, ans1) e2 = (tvm.max(5, a * 4) < 0) - res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) + res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" # expression containing variable a is on rhs e2 = (zero < tvm.max(5, a * 4)) - res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) + res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" e3 = (-b)+a*c-d - res3 = tvm.arith.DeduceBound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) + res3 = tvm.arith.deduce_bound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) ans3 = fdiv(2,c)+1 assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) - res3 = tvm.arith.DeduceBound(a, zero <= e3, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) + res3 = tvm.arith.deduce_bound(a, zero <= e3, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) # tests for `EQ` op - res4 = tvm.arith.DeduceBound(a, a == b, {}, {}) + res4 = tvm.arith.deduce_bound(a, a == b, {}, {}) assert_expr_equal(res4.max_value, b) assert_expr_equal(res4.min_value, b) # Unsatisfiable `EQ`, variable as one of the Operand - res5 = tvm.arith.DeduceBound(a, (a == b), {b: b_s}, {b: b_s}) + res5 = tvm.arith.deduce_bound(a, (a == b), {b: b_s}, {b: b_s}) assert str(res5.max_value) == "neg_inf" assert str(res5.min_value) == "pos_inf" # variable `a` on the RHS side - res6 = tvm.arith.DeduceBound(a, 10 == a, {}, {}) + res6 = tvm.arith.deduce_bound(a, 10 == a, {}, {}) assert_expr_equal(res6.max_value, 10) assert_expr_equal(res6.min_value, 10) # Add, Sub in `EQ` e4 = ((a - c) == (b + d)) ans4 = (b + d + c) - res7 = tvm.arith.DeduceBound(a, e4, {b: b_s, c: c_s, d: d_s}, {}) + res7 = tvm.arith.deduce_bound(a, e4, {b: b_s, c: c_s, d: d_s}, {}) assert_expr_equal(res7.max_value, ans4) assert_expr_equal(res7.min_value, ans4) # Satisfiable Mul in `EQ` with negative sign - res8 = tvm.arith.DeduceBound(a, (5 * a == -10), {}, {}) + res8 = tvm.arith.deduce_bound(a, (5 * a == -10), {}, {}) assert_expr_equal(res8.max_value, -2) assert_expr_equal(res8.min_value, -2) # Unsatisfiable Mul in `EQ` e5 = (4 * a == b) - res9 = tvm.arith.DeduceBound(a, e5, {b: b_s}, {}) + res9 = tvm.arith.deduce_bound(a, e5, {b: b_s}, {}) assert str(res9.max_value) == "neg_inf" assert str(res9.min_value) == "pos_inf" # Unsatisfiable Mul in `EQ` - res10 = tvm.arith.DeduceBound(a, (b * a == b), {b: b_s}, {}) # simplifier is not able to prove that (b % b == 0) + res10 = tvm.arith.deduce_bound(a, (b * a == b), {b: b_s}, {}) # simplifier is not able to prove that (b % b == 0) assert str(res10.max_value) == "neg_inf" assert str(res10.min_value) == "pos_inf" @@ -137,15 +137,15 @@ def test_check(): d_s = tvm.arith.IntervalSet(-3, -1) # no compare operator - res1 = tvm.arith.DeduceBound(a, a+b, {b: b_s}, {}) + res1 = 
tvm.arith.deduce_bound(a, a+b, {b: b_s}, {}) assert res1.is_nothing() # multiple compare operators - res2 = tvm.arith.DeduceBound(a, (a+b>3).astype(c.dtype)>c , {b: b_s, c: c_s}, {}) + res2 = tvm.arith.deduce_bound(a, (a+b>3).astype(c.dtype)>c , {b: b_s, c: c_s}, {}) assert res2.is_nothing() # multiple target variable - res2 = tvm.arith.DeduceBound(a, a*2-a>b, {b: b_s}, {}) + res2 = tvm.arith.deduce_bound(a, a*2-a>b, {b: b_s}, {}) assert res2.is_nothing() def test_deduce_basic(): @@ -155,21 +155,21 @@ def test_basic(a1, a2, coff): b_s = tvm.arith.IntervalSet(a1, a2) e0 = b + a*coff + 3 - res1 = tvm.arith.DeduceBound(a, e0<17, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, e0<17, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) < 17)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.DeduceBound(a, tvm.const(17, "int32") < e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.const(17, "int32") < e0, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) > 17)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.DeduceBound(a, tvm.const(17, "int32")>= e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.const(17, "int32")>= e0, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value == 1 - res1 = tvm.arith.DeduceBound(a, e0>=17, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, e0>=17, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify((x * coff + 3 + y) >= 17)).value == 1 @@ -187,21 +187,21 @@ def test_complex(a1, a2, coff): b_s = tvm.arith.IntervalSet(a1, a2) e0 = (b*3 + a* coff) * 4 - res1 = tvm.arith.DeduceBound(a, e0<63, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, e0<63, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) < 63)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.DeduceBound(a, tvm.const(63, "int32")>= e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.const(63, "int32")>= e0, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) <= 63)).value == 1 - res1 = tvm.arith.DeduceBound(a, e0>63, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, e0>63, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) > 63)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.DeduceBound(a, tvm.const(63, "int32") <= e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.const(63, "int32") <= e0, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) >= 63)).value == 1 diff --git a/tests/python/unittest/test_arith_detect_clip_bound.py b/tests/python/unittest/test_arith_detect_clip_bound.py index 3301c24049ae..44ae24cb6815 100644 --- a/tests/python/unittest/test_arith_detect_clip_bound.py +++ 
b/tests/python/unittest/test_arith_detect_clip_bound.py @@ -20,14 +20,14 @@ def test_basic(): a = tvm.var("a") b = tvm.var("b") c = tvm.var("c") - m = tvm.arith.DetectClipBound(tvm.all(a * 1 < b * 6, + m = tvm.arith.detect_clip_bound(tvm.all(a * 1 < b * 6, a - 1 > 0), [a]) assert tvm.ir_pass.Simplify(m[1] - (b * 6 - 1)).value == 0 assert m[0].value == 2 - m = tvm.arith.DetectClipBound(tvm.all(a * 1 < b * 6, + m = tvm.arith.detect_clip_bound(tvm.all(a * 1 < b * 6, a - 1 > 0), [a, b]) assert len(m) == 0 - m = tvm.arith.DetectClipBound(tvm.all(a + 10 * c <= 20, + m = tvm.arith.detect_clip_bound(tvm.all(a + 10 * c <= 20, b - 1 > 0), [a, b]) assert tvm.ir_pass.Simplify(m[1] - (20 - 10 * c)).value == 0 assert tvm.ir_pass.Simplify(m[2] - 2).value == 0 diff --git a/tests/python/unittest/test_arith_detect_linear_equation.py b/tests/python/unittest/test_arith_detect_linear_equation.py index cacb62456b79..3b103026aec3 100644 --- a/tests/python/unittest/test_arith_detect_linear_equation.py +++ b/tests/python/unittest/test_arith_detect_linear_equation.py @@ -19,50 +19,50 @@ def test_basic(): a = tvm.var("a") b = tvm.var("b") - m = tvm.arith.DetectLinearEquation(a * 4 + b * 6 + 7, [a]) + m = tvm.arith.detect_linear_equation(a * 4 + b * 6 + 7, [a]) assert m[0].value == 4 assert tvm.ir_pass.Simplify(m[1] - (b * 6 + 7)).value == 0 - m = tvm.arith.DetectLinearEquation(a * 4 * (a+1) + b * 6 + 7, [a]) + m = tvm.arith.detect_linear_equation(a * 4 * (a+1) + b * 6 + 7, [a]) assert len(m) == 0 - m = tvm.arith.DetectLinearEquation(a * 4 + (a+1) + b * 6 + 7, [a]) + m = tvm.arith.detect_linear_equation(a * 4 + (a+1) + b * 6 + 7, [a]) assert m[0].value == 5 assert tvm.ir_pass.Simplify(m[1] - (b * 6 + 7 + 1)).value == 0 - m = tvm.arith.DetectLinearEquation(a * b + 7, [a]) + m = tvm.arith.detect_linear_equation(a * b + 7, [a]) assert m[0] == b - m = tvm.arith.DetectLinearEquation(b * 7, [a]) + m = tvm.arith.detect_linear_equation(b * 7, [a]) assert m[0].value == 0 - m = tvm.arith.DetectLinearEquation(b * 7, []) + m = tvm.arith.detect_linear_equation(b * 7, []) assert len(m) == 1 assert tvm.ir_pass.Simplify(m[0] - b * 7).value == 0 def test_multivariate(): v = [tvm.var("v%d" % i) for i in range(4)] b = tvm.var("b") - m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8, v) + m = tvm.arith.detect_linear_equation(v[0] * (b + 4) + v[0] + v[1] * 8, v) assert(tvm.ir_pass.Equal(tvm.ir_pass.Simplify(m[0]), b + 5)) assert(m[1].value == 8) - m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[2], v) + m = tvm.arith.detect_linear_equation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[2], v) assert(len(m) == 0) - m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[1] + v[3], v) + m = tvm.arith.detect_linear_equation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[1] + v[3], v) assert(len(m) == 0) - m = tvm.arith.DetectLinearEquation(((v[0] * b + v[1]) * 8 + v[2] + 1) * 2, v) + m = tvm.arith.detect_linear_equation(((v[0] * b + v[1]) * 8 + v[2] + 1) * 2, v) assert(m[1].value == 16) assert(m[2].value == 2) assert(m[len(m)-1].value == 2) - m = tvm.arith.DetectLinearEquation((v[0] - v[1]), [v[2]]) + m = tvm.arith.detect_linear_equation((v[0] - v[1]), [v[2]]) assert(m[0].value == 0) assert(tvm.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0) - m = tvm.arith.DetectLinearEquation((v[0] - v[1]), []) + m = tvm.arith.detect_linear_equation((v[0] - v[1]), []) assert(len(m) == 1) assert(tvm.ir_pass.Simplify(m[0] - (v[0] - v[1])).value == 0) diff --git a/tests/python/unittest/test_arith_domain_touched.py 
b/tests/python/unittest/test_arith_domain_touched.py index 3e45d4e5fd93..7876fb6c4d37 100644 --- a/tests/python/unittest/test_arith_domain_touched.py +++ b/tests/python/unittest/test_arith_domain_touched.py @@ -35,19 +35,19 @@ def test_domain_touched(): ) ) ) - a_domain_r = tvm.arith.DomainTouched(ir, a, True, False) + a_domain_r = tvm.arith._ffi_api.DomainTouched(ir, a, True, False) assert a_domain_r[0].min.value == -1 assert a_domain_r[0].extent.value == 100 assert a_domain_r[1].min.value == -1 assert a_domain_r[1].extent.name == 'm' - a_domain_w = tvm.arith.DomainTouched(ir, a, False, True) + a_domain_w = tvm.arith._ffi_api.DomainTouched(ir, a, False, True) assert a_domain_w[0].min.value == 0 assert a_domain_w[0].extent.value == 100 assert a_domain_w[1].min.value == 0 assert a_domain_w[1].extent.name == 'm' - a_domain_rw= tvm.arith.DomainTouched(ir, a, True, True) + a_domain_rw= tvm.arith._ffi_api.DomainTouched(ir, a, True, True) assert a_domain_rw[0].min.value == -1 assert a_domain_rw[0].extent.value == 101 assert a_domain_rw[1].min.value == -1 @@ -55,17 +55,16 @@ def test_domain_touched(): assert a_domain_rw[1].extent.a.name == 'm' assert a_domain_rw[1].extent.b.value == 1 - b_domain_r = tvm.arith.DomainTouched(ir, b, True, False) + b_domain_r = tvm.arith._ffi_api.DomainTouched(ir, b, True, False) assert b_domain_r assert b_domain_r[0].min.value == -1 assert b_domain_r[0].extent.value == 100 assert b_domain_r[1].min.value == 1 assert b_domain_r[1].extent.name == 'm' - b_domain_w = tvm.arith.DomainTouched(ir, b, False, True) + b_domain_w = tvm.arith._ffi_api.DomainTouched(ir, b, False, True) assert isinstance(b_domain_w, tvm.container.Array) assert len(b_domain_w) == 0 if __name__ == "__main__": test_domain_touched() - diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py index d83d33db5c1b..dad2fa705b0f 100644 --- a/tests/python/unittest/test_arith_intset.py +++ b/tests/python/unittest/test_arith_intset.py @@ -36,12 +36,16 @@ def test_basic(): assert s.min_value.value == 2 assert s.max_value.value == 3 + s = tvm.arith.IntSet.single_point(2) + assert s.min_value.value == 2 + assert s.max_value.value == 2 + def test_vector(): base = 10 stride = 3 lanes = 2 - s = tvm.arith.intset_vector(tvm.tir.Ramp(base, stride, lanes)) + s = tvm.arith.IntSet.vector(tvm.tir.Ramp(base, stride, lanes)) assert s.min_value.value == base assert s.max_value.value == base + stride * lanes - 1 diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 8b8a2f06b498..36d8e4198a40 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -76,7 +76,7 @@ def _post_order(op): args = [] args += op.args[:base_args] for i in range(3): - m = tvm.arith.DetectLinearEquation( + m = tvm.arith.detect_linear_equation( op.args[i + base_args], [loop_var]) if not m: fail[0] = True @@ -867,25 +867,25 @@ def _flatten_loop(src_coeff, dst_coeff, extents): type(loop_body.value), str(loop_body.value), str(stmt))) # Derive array index coefficients - dst_coeff = tvm.arith.DetectLinearEquation(dst_idx, indices) + dst_coeff = tvm.arith.detect_linear_equation(dst_idx, indices) # Check if lhs/rhs is immediate use_imm = False imm_val = None if isinstance(rhs, tvm.tir.IntImm): assert lhs.buffer_var.same_as(dst_var) - src_coeff = tvm.arith.DetectLinearEquation(lhs.index, indices) + src_coeff = tvm.arith.detect_linear_equation(lhs.index, indices) use_imm = True imm_val = rhs if isinstance(lhs, tvm.tir.IntImm): assert rhs.buffer_var.same_as(dst_var) - src_coeff = 
tvm.arith.DetectLinearEquation(rhs.index, indices) + src_coeff = tvm.arith.detect_linear_equation(rhs.index, indices) use_imm = True imm_val = lhs if imm_val is None: imm_val = 0 assert lhs.buffer_var.same_as(dst_var) and rhs.buffer_var.same_as(dst_var) - src_lhs_coeff = tvm.arith.DetectLinearEquation(lhs.index, indices) - src_rhs_coeff = tvm.arith.DetectLinearEquation(rhs.index, indices) + src_lhs_coeff = tvm.arith.detect_linear_equation(lhs.index, indices) + src_rhs_coeff = tvm.arith.detect_linear_equation(rhs.index, indices) # Determine which side has the same coefficients lhs_equal = True rhs_equal = True From 9d646543098580490b85f5865d10d087f75ea22e Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Tue, 18 Feb 2020 10:24:22 -0800 Subject: [PATCH 17/73] [Relay][Frontend][Keras] NHWC import support. (#4899) * Basic test working * Almost all tests working. * all tests passing. * Fixed lint. * Improved Style. --- python/tvm/relay/frontend/keras.py | 148 +++++++++++++++----- tests/python/frontend/keras/test_forward.py | 47 ++++--- 2 files changed, 143 insertions(+), 52 deletions(-) diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index d21f1af124ca..caf41768ada4 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -186,7 +186,7 @@ def _convert_merge(inexpr, keras_layer, _): assert len(inexpr) == 2, "Subtract merge takes 2 inputs." ret = _op.subtract(ret, inexpr[1]) elif merge_type in ['Add', 'Multiply', 'Maximum']: - op_map = {'Add':_op.add, 'Multiply':_op.multiply, 'Maximum':_op.maximum} + op_map = {'Add': _op.add, 'Multiply': _op.multiply, 'Maximum': _op.maximum} for i in range(1, len(inexpr)): ret = op_map[merge_type](ret, inexpr[i]) elif merge_type == 'Average': @@ -206,7 +206,7 @@ def _convert_permute(inexpr, keras_layer, _): def _convert_dense(inexpr, keras_layer, etab): weightList = keras_layer.get_weights() weight = etab.new_const(weightList[0].transpose([1, 0])) - params = {'weight':weight, 'units':weightList[0].shape[1]} + params = {'weight': weight, 'units': weightList[0].shape[1]} input_shape = keras_layer.input_shape input_dim = len(input_shape) # In case of RNN dense, input shape will be (1, 1, n) @@ -237,15 +237,28 @@ def _convert_convolution(inexpr, keras_layer, etab): is_deconv = type(keras_layer).__name__ == 'Conv2DTranspose' is_depthconv = type(keras_layer).__name__ == 'DepthwiseConv2D' weightList = keras_layer.get_weights() + weight = weightList[0] + if etab.data_layout == 'NHWC': + if is_depthconv: + kernel_layout = 'HWOI' + else: + kernel_layout = 'HWIO' + else: + kernel_layout = 'OIHW' + if is_deconv: - kernel_h, kernel_w, n_filters, in_channels = weightList[0].shape - weight = weightList[0].transpose([3, 2, 0, 1]) + kernel_h, kernel_w, n_filters, in_channels = weight.shape + if kernel_layout == 'OIHW': + weight = weight.transpose([3, 2, 0, 1]) elif is_depthconv: - kernel_h, kernel_w, in_channels, depth_mult = weightList[0].shape - weight = weightList[0].transpose([2, 3, 0, 1]) + kernel_h, kernel_w, in_channels, depth_mult = weight.shape + if kernel_layout == 'OIHW': + weight = weight.transpose([2, 3, 0, 1]) + elif etab.data_layout == 'NCHW': + kernel_h, kernel_w, in_channels, n_filters = weight.shape + weight = weight.transpose([3, 2, 0, 1]) else: - kernel_h, kernel_w, in_channels, n_filters = weightList[0].shape - weight = weightList[0].transpose([3, 2, 0, 1]) + kernel_h, kernel_w, in_channels, n_filters = weight.shape if isinstance(keras_layer.dilation_rate, (list, tuple)): dilation = 
[keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]] else: @@ -257,7 +270,9 @@ def _convert_convolution(inexpr, keras_layer, etab): 'kernel_size': [kernel_h, kernel_w], 'strides': [stride_h, stride_w], 'dilation': dilation, - 'padding': [0, 0]} + 'padding': [0, 0], + 'data_layout': etab.data_layout, + 'kernel_layout': kernel_layout} if is_depthconv: params['channels'] = in_channels * depth_mult params['groups'] = in_channels @@ -273,9 +288,13 @@ def _convert_convolution(inexpr, keras_layer, etab): pad_l, pad_r = _get_pad_pair(in_w, dilated_kernel_w, stride_w) if pad_t == pad_b and pad_l == pad_r: params['padding'] = (pad_t, pad_l) - else: + elif etab.data_layout == 'NCHW': inexpr = _op.nn.pad(data=inexpr, pad_width=( (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) + else: + inexpr = _op.nn.pad(data=inexpr, pad_width=( + (0, 0), (pad_t, pad_b), (pad_l, pad_r), (0, 0))) + else: msg = 'Padding with {} is not supported for operator Convolution ' \ 'in frontend Keras.' @@ -284,9 +303,13 @@ def _convert_convolution(inexpr, keras_layer, etab): out = _op.nn.conv2d_transpose(data=inexpr, **params) else: out = _op.nn.conv2d(data=inexpr, **params) + if keras_layer.use_bias: bias = etab.new_const(weightList[1]) - out = _op.nn.bias_add(out, bias) + if etab.data_layout == 'NCHW': + out = _op.nn.bias_add(out, bias) + else: + out = _op.nn.bias_add(out, bias, axis=-1) # defuse activation if sys.version_info.major < 3: act_type = keras_layer.activation.func_name @@ -299,18 +322,27 @@ def _convert_convolution(inexpr, keras_layer, etab): def _convert_separable_convolution(inexpr, keras_layer, etab): _check_data_format(keras_layer) + if etab.data_layout == 'NHWC': + kernel_layout = 'HWOI' + else: + kernel_layout = 'OIHW' weightList = keras_layer.get_weights() # depthwise conv kernel_h, kernel_w, in_channels, depth_mult = weightList[0].shape stride_h, stride_w = keras_layer.strides - weight0 = weightList[0].transpose([2, 3, 0, 1]) + if kernel_layout == 'OIHW': + weight0 = weightList[0].transpose([2, 3, 0, 1]) + else: + weight0 = weightList[0] params0 = {'weight': etab.new_const(weight0), 'channels': in_channels * depth_mult, 'groups': in_channels, 'kernel_size': [kernel_h, kernel_w], 'strides': [stride_h, stride_w], 'dilation': [1, 1], - 'padding': [0, 0]} + 'padding': [0, 0], + 'data_layout': etab.data_layout, + 'kernel_layout': kernel_layout} if keras_layer.padding == 'valid': pass # we insert a separate pad operator @@ -321,27 +353,39 @@ def _convert_separable_convolution(inexpr, keras_layer, etab): pad_l, pad_r = _get_pad_pair(in_w, kernel_w, stride_w) if pad_t == pad_b and pad_l == pad_r: params0['padding'] = (pad_t, pad_l) - else: + elif etab.data_layout == 'NCHW': inexpr = _op.nn.pad(data=inexpr, pad_width=( (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r))) + else: + inexpr = _op.nn.pad(data=inexpr, pad_width=( + (0, 0), (pad_t, pad_b), (pad_l, pad_r), (0, 0))) + else: msg = 'Padding with {} is not supported for operator Separable ' \ 'Convolution in frontend Keras.' 
raise tvm.error.OpAttributeUnImplemented(msg.format(keras_layer.padding)) - depthconv = _op.nn.conv2d(data=inexpr, **params0) # pointwise conv - weight1 = weightList[1].transpose([3, 2, 0, 1]) + if kernel_layout == 'OIHW': + weight1 = weightList[1].transpose([3, 2, 0, 1]) + else: + weight1 = weightList[1] + kernel_layout = "HWIO" params1 = {'weight': etab.new_const(weight1), - 'channels': weight1.shape[0], + 'channels': weightList[1].shape[3], 'groups': 1, 'kernel_size': [1, 1], 'strides': [1, 1], - 'dilation': [1, 1]} + 'dilation': [1, 1], + 'data_layout': etab.data_layout, + 'kernel_layout': kernel_layout} out = _op.nn.conv2d(data=depthconv, **params1) if keras_layer.use_bias: bias = etab.new_const(weightList[2]) - out = _op.nn.bias_add(out, bias) + if etab.data_layout == 'NCHW': + out = _op.nn.bias_add(out, bias) + else: + out = _op.nn.bias_add(out, bias, axis=-1) # defuse activation if sys.version_info.major < 3: act_type = keras_layer.activation.func_name @@ -352,10 +396,11 @@ def _convert_separable_convolution(inexpr, keras_layer, etab): return out -def _convert_flatten(inexpr, keras_layer, _): +def _convert_flatten(inexpr, keras_layer, etab): _check_data_format(keras_layer) # NCHW -> NHWC so that dense can be correctly converted - inexpr = _op.transpose(inexpr, axes=[0, 2, 3, 1]) + if etab.data_layout == 'NCHW': + inexpr = _op.transpose(inexpr, axes=[0, 2, 3, 1]) return _op.nn.batch_flatten(inexpr) @@ -363,15 +408,19 @@ def _convert_pooling(inexpr, keras_layer, etab): _check_data_format(keras_layer) pool_type = type(keras_layer).__name__ # global pool in keras = global pool + flatten in relay + global_pool_params = {'layout': etab.data_layout} if pool_type == 'GlobalMaxPooling2D': - return _convert_flatten(_op.nn.global_max_pool2d(inexpr), keras_layer, etab) + return _convert_flatten( + _op.nn.global_max_pool2d(inexpr, **global_pool_params), keras_layer, etab) if pool_type == 'GlobalAveragePooling2D': - return _convert_flatten(_op.nn.global_avg_pool2d(inexpr), keras_layer, etab) + return _convert_flatten( + _op.nn.global_avg_pool2d(inexpr, **global_pool_params), keras_layer, etab) pool_h, pool_w = keras_layer.pool_size stride_h, stride_w = keras_layer.strides params = {'pool_size': [pool_h, pool_w], 'strides': [stride_h, stride_w], - 'padding': [0, 0]} + 'padding': [0, 0], + 'layout': etab.data_layout} if keras_layer.padding == 'valid': pass elif keras_layer.padding == 'same': @@ -392,7 +441,7 @@ def _convert_pooling(inexpr, keras_layer, etab): 'Operator {} is not supported for frontend Keras.'.format(keras_layer)) -def _convert_upsample(inexpr, keras_layer, _): +def _convert_upsample(inexpr, keras_layer, etab): _check_data_format(keras_layer) upsample_type = type(keras_layer).__name__ params = {} @@ -424,7 +473,9 @@ def _convert_upsample(inexpr, keras_layer, _): else: raise tvm.error.OpNotImplemented( 'Operator {} is not supported for frontend Keras.'.format(upsample_type)) - return _op.nn.upsampling(inexpr, **params) + params['layout'] = etab.data_layout + out = _op.nn.upsampling(inexpr, **params) + return out def _convert_cropping(inexpr, keras_layer, _): @@ -442,9 +493,15 @@ def _convert_cropping(inexpr, keras_layer, _): def _convert_batchnorm(inexpr, keras_layer, etab): + if etab.data_layout == 'NCHW' or len(keras_layer.input_shape) < 4: + axis = 1 + else: + axis = 3 + params = {'scale': False, 'center': False, - 'epsilon': keras_layer.epsilon} + 'epsilon': keras_layer.epsilon, + 'axis': axis} idx = 0 if keras_layer.scale: params['scale'] = True @@ -469,7 +526,7 @@ def 
_convert_batchnorm(inexpr, keras_layer, etab):
     return result
 
 
-def _convert_padding(inexpr, keras_layer, _):
+def _convert_padding(inexpr, keras_layer, etab):
     _check_data_format(keras_layer)
     padding_type = type(keras_layer).__name__
     padding = keras_layer.padding
@@ -495,16 +552,21 @@
     else:
         msg = 'Operator {} is not supported in frontend Keras.'
         raise tvm.error.OpNotImplemented(msg.format(padding_type))
-    return _op.nn.pad(data=inexpr,
-                      pad_width=((0, 0), (0, 0), (top, bottom), (left, right)))
+    if etab.data_layout == 'NCHW':
+        return _op.nn.pad(data=inexpr, pad_width=((0, 0), (0, 0), (top, bottom), (left, right)))
+    return _op.nn.pad(data=inexpr, pad_width=((0, 0), (top, bottom), (left, right), (0, 0)))
 
 
-def _convert_concat(inexpr, keras_layer, _):
+def _convert_concat(inexpr, keras_layer, etab):
     _check_data_format(keras_layer)
-    return _op.concatenate(_as_list(inexpr), axis=1)
+    if etab.data_layout == 'NHWC' or len(keras_layer.input_shape[0]) < 4:
+        axis = -1
+    else:
+        axis = 1
+    return _op.concatenate(_as_list(inexpr), axis=axis)
 
 
-def _convert_reshape(inexpr, keras_layer, _):
+def _convert_reshape(inexpr, keras_layer, etab):
     _check_data_format(keras_layer)
     inshape = keras_layer.input_shape # includes batch
     tshape = keras_layer.target_shape # no batch
@@ -525,7 +587,10 @@ def _convert_reshape(inexpr, keras_layer, _):
         assert ch == tshape[-1], \
             "Only supports last dimension in target shape being equal to " \
             "the channel number of input tensor."
-        shape = (-1, ch) + tshape[:-1]
+        if etab.data_layout == 'NCHW':
+            shape = (-1, ch) + tshape[:-1]
+        else:
+            shape = (-1,) + tshape[:-1] + (ch,)
     return _op.reshape(inexpr, newshape=shape)
@@ -740,7 +805,7 @@ def keras_op_to_relay(inexpr, keras_layer, outname, etab):
     etab.set_expr(name, out)
 
 
-def from_keras(model, shape=None):
+def from_keras(model, shape=None, layout='NCHW'):
     """Convert keras model to relay Function.
 
     Parameters
     ----------
@@ -751,6 +816,11 @@
     shape: dict of str to int list/tuple
         Input shapes of the model, optional
 
+    layout: str
+        One of 'NCHW' or 'NHWC', indicates how data should be arranged in
+        the output model. Default layout is 'NCHW' as it generally
+        performs better across TVM.
+
     Returns
     -------
     mod : tvm.IRModule
@@ -793,6 +863,9 @@ def _convert_input_layer(keras_layer):
     assert isinstance(model, expected_model_class)
     etab = ExprTable()
+    # Set global data format.
+    assert layout in ['NCHW', 'NHWC'], "Layout must be one of 'NCHW' or 'NHWC'"
+    etab.data_layout = layout
     for keras_layer in model.layers:
         if isinstance(keras_layer, input_layer_class):
             _convert_input_layer(keras_layer)
@@ -818,7 +891,10 @@ def _convert_input_layer(keras_layer):
     # The one exception is InputLayer. Changing input variable names after conversion
     # would confuse users, so we should keep them as far as possible. Fortunately,
     # they are named uniquely to input_1, input_2, input_3... by default.
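# For reference, the `layout` argument added above is exercised from user code
# roughly as follows (a sketch; the shape and model are illustrative, with
# `input_1` being the default Keras input name noted in the comment above):
#
#     shape_dict = {'input_1': (1, 224, 224, 3)}
#     mod, params = relay.frontend.from_keras(model, shape_dict, layout='NHWC')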
- zip_node = zip(node.node_indices, node.tensor_indices, node.inbound_layers) + zip_node = zip( + _as_list(node.node_indices), + _as_list(node.tensor_indices), + _as_list(node.inbound_layers)) for n_idx, t_idx, inbound_layer in zip_node: if isinstance(inbound_layer, input_layer_class): expr_name = inbound_layer.name diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index e4df4da4e989..f7dcb29b37aa 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -21,13 +21,18 @@ from tvm.relay.testing.config import ctx_list import keras -# prevent Keras from using up all gpu memory import tensorflow as tf from tensorflow import keras as tf_keras -from keras.backend.tensorflow_backend import set_session -config = tf.ConfigProto() -config.gpu_options.per_process_gpu_memory_fraction = 0.5 -set_session(tf.Session(config=config)) +# prevent Keras from using up all gpu memory +if tf.executing_eagerly(): + gpus = tf.config.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) +else: + from keras.backend.tensorflow_backend import set_session + config = tf.ConfigProto() + config.gpu_options.per_process_gpu_memory_fraction = 0.5 + set_session(tf.Session(config=config)) def pytest_generate_tests(metafunc): @@ -52,20 +57,27 @@ def pytest_generate_tests(metafunc): using_tensorflow_keras = ("tf_keras", {"keras": tf_keras}) -def verify_keras_frontend(keras_model, need_transpose=True): +def verify_keras_frontend(keras_model, need_transpose=True, layout='NCHW'): # Keras frontend currently supports tensorflow backend only. assert(keras.backend.backend() == 'tensorflow') + if layout != 'NCHW': + need_transpose = False + in_shapes = [] for layer in keras_model._input_layers: - in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape)) + if tf.executing_eagerly(): + in_shapes.append(tuple(dim if dim is not None else 1 for dim in layer.input.shape)) + else: + in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape)) + def get_keras_output(xs, dtype='float32'): return keras_model.predict(xs) def get_tvm_output(xs, target, ctx, dtype='float32'): shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)} - mod, params = relay.frontend.from_keras(keras_model, shape_dict) + mod, params = relay.frontend.from_keras(keras_model, shape_dict, layout=layout) with relay.transform.build_config(opt_level=2): graph, lib, params = relay.build(mod, target, @@ -357,28 +369,28 @@ def test_forward_rnn(self,keras): verify_keras_frontend(keras_model, need_transpose=False) - def test_forward_vgg16(self, keras): + def test_forward_vgg16(self, keras, layout='NCHW'): keras_model = keras.applications.VGG16(include_top=True, weights='imagenet', input_shape=(224, 224, 3), classes=1000) - verify_keras_frontend(keras_model) + verify_keras_frontend(keras_model, layout=layout) - def test_forward_xception(self, keras): + def test_forward_xception(self, keras, layout='NCHW'): keras_model = keras.applications.Xception(include_top=True, weights='imagenet', input_shape=(299, 299, 3), classes=1000) - verify_keras_frontend(keras_model) + verify_keras_frontend(keras_model, layout=layout) - def test_forward_resnet50(self, keras): + def test_forward_resnet50(self, keras, layout='NCHW'): keras_model = keras.applications.ResNet50(include_top=True, weights='imagenet', input_shape=(224, 224, 3), classes=1000) - 
verify_keras_frontend(keras_model) + verify_keras_frontend(keras_model, layout=layout) - def test_forward_mobilenet(self, keras): + def test_forward_mobilenet(self, keras, layout='NCHW'): keras_model = keras.applications.MobileNet(include_top=True, weights='imagenet', input_shape=(224, 224, 3), classes=1000) - verify_keras_frontend(keras_model) + verify_keras_frontend(keras_model, layout=layout) if __name__ == '__main__': @@ -402,6 +414,9 @@ def test_forward_mobilenet(self, keras): sut.test_forward_reuse_layers(keras=k) sut.test_forward_rnn(keras=k) sut.test_forward_vgg16(keras=k) + sut.test_forward_vgg16(keras=k, layout='NHWC') sut.test_forward_xception(keras=k) sut.test_forward_resnet50(keras=k) + sut.test_forward_resnet50(keras=k, layout='NHWC') sut.test_forward_mobilenet(keras=k) + sut.test_forward_mobilenet(keras=k, layout='NHWC') From 41835d176d31bc2f3ba1f0ed9e35bdbfd453dc39 Mon Sep 17 00:00:00 2001 From: Jon Soifer Date: Tue, 18 Feb 2020 13:44:59 -0800 Subject: [PATCH 18/73] [Relay] Expose FunctionGetAttr to Python (#4905) * [Relay] Expose FunctionGetAttr to Python * add test Co-authored-by: Jon Soifer --- python/tvm/relay/expr.py | 3 +++ src/relay/ir/expr.cc | 6 ++++++ tests/python/relay/test_ir_nodes.py | 2 ++ 3 files changed, 11 insertions(+) diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index e5259fbc0da8..39e68b8333ff 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -280,6 +280,9 @@ def set_params(self, params): def set_attribute(self, name, ref): return _expr.FunctionSetAttr(self, name, ref) + def get_attribute(self, name): + return _expr.FunctionGetAttr(self, name) + @register_relay_node class Call(ExprWithOp): diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 89395bb742c1..0292a6c2bb05 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -360,6 +360,12 @@ TVM_REGISTER_GLOBAL("relay._expr.FunctionSetAttr") return FunctionSetAttr(func, name, ref); }); +TVM_REGISTER_GLOBAL("relay._expr.FunctionGetAttr") +.set_body_typed( + [](Function func, std::string name) { + return FunctionGetAttr(func, name); +}); + TVM_REGISTER_GLOBAL("relay._make.Any") .set_body_typed([]() { return Any::make(); }); diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py index bdda72ca8702..b7d7eb9f389c 100644 --- a/tests/python/relay/test_ir_nodes.py +++ b/tests/python/relay/test_ir_nodes.py @@ -168,10 +168,12 @@ def test_function(): body = relay.Tuple(tvm.convert([])) type_params = tvm.convert([]) fn = relay.Function(params, body, ret_type, type_params) + fn = fn.set_attribute("test_attribute", tvm.tir.StringImm("value")) assert fn.params == params assert fn.body == body assert fn.type_params == type_params assert fn.span == None + assert fn.get_attribute("test_attribute") == "value" str(fn) check_json_roundtrip(fn) From d2ae8c95d56d8788b1bf77ef28701eb50bbfb495 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 18 Feb 2020 15:39:59 -0800 Subject: [PATCH 19/73] [DOCS] Update API docs to reflect the status after the refactor. 
(#4907) --- docs/_static/css/tvm_theme.css | 3 + docs/api/python/dev.rst | 80 ------------------- docs/api/python/{build.rst => driver.rst} | 8 +- docs/api/python/error.rst | 1 + docs/api/python/hybrid.rst | 11 +-- docs/api/python/index.rst | 14 ++-- docs/api/python/intrin.rst | 58 -------------- docs/api/python/ir.rst | 1 + docs/api/python/ndarray.rst | 9 --- docs/api/python/relay/index.rst | 4 +- docs/api/python/relay/ty.rst | 49 ------------ docs/api/python/rpc.rst | 22 +---- docs/api/python/runtime.rst | 5 ++ docs/api/python/target.rst | 1 + docs/api/python/{schedule.rst => te.rst} | 32 +++----- docs/api/python/{relay/module.rst => tir.rst} | 14 ++-- docs/api/python/topi.rst | 2 +- docs/api/python/vta/index.rst | 4 +- docs/conf.py | 24 +++--- docs/contribute/committer_guide.rst | 2 +- docs/contribute/error_handling.rst | 35 +++----- docs/dev/runtime.rst | 2 +- python/tvm/autotvm/tuner/xgboost_tuner.py | 20 +++-- python/tvm/contrib/graph_runtime.py | 15 +++- python/tvm/error.py | 4 + python/tvm/relay/op/nn/nn.py | 8 +- python/tvm/relay/op/tensor.py | 1 + python/tvm/target/__init__.py | 2 +- python/tvm/target/target.py | 14 ++-- python/tvm/te/__init__.py | 10 ++- 30 files changed, 122 insertions(+), 333 deletions(-) delete mode 100644 docs/api/python/dev.rst rename docs/api/python/{build.rst => driver.rst} (94%) delete mode 100644 docs/api/python/intrin.rst delete mode 100644 docs/api/python/relay/ty.rst rename docs/api/python/{schedule.rst => te.rst} (66%) rename docs/api/python/{relay/module.rst => tir.rst} (86%) diff --git a/docs/_static/css/tvm_theme.css b/docs/_static/css/tvm_theme.css index 196aa05c2375..93f4ea4d38c1 100644 --- a/docs/_static/css/tvm_theme.css +++ b/docs/_static/css/tvm_theme.css @@ -38,3 +38,6 @@ nav .hidden-section { color: #404040 !important; } +.wy-nav-content { + max-width: 950px !important; +} diff --git a/docs/api/python/dev.rst b/docs/api/python/dev.rst deleted file mode 100644 index f9d9410946c9..000000000000 --- a/docs/api/python/dev.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Developer API -------------- -This page contains modules that are used by developers of TVM. -Many of these APIs are PackedFunc registered in C++ backend. - - - -tvm.expr -~~~~~~~~ -.. automodule:: tvm.expr - :members: - :undoc-members: - -tvm.codegen -~~~~~~~~~~~ -.. automodule:: tvm.codegen - :members: - :undoc-members: - -tvm.stmt -~~~~~~~~ -.. automodule:: tvm.stmt - :members: - :undoc-members: - -tvm.ir_pass -~~~~~~~~~~~ -.. 
autosummary:: - - tvm.ir_pass.Inline - tvm.ir_pass.Simplify - tvm.ir_pass.ConvertSSA - tvm.ir_pass.VerifySSA - tvm.ir_pass.CanonicalSimplify - tvm.ir_pass.StorageFlatten - tvm.ir_pass.VectorizeLoop - tvm.ir_pass.SkipVectorize - tvm.ir_pass.UnrollLoop - tvm.ir_pass.ThreadSync - tvm.ir_pass.StorageRewrite - tvm.ir_pass.MakeAPI - tvm.ir_pass.SplitHostDevice - tvm.ir_pass.InjectVirtualThread - tvm.ir_pass.LoopPartition - tvm.ir_pass.RemoveNoOp - tvm.ir_pass.SplitPipeline - tvm.ir_pass.LowerThreadAllreduce - tvm.ir_pass.LowerIntrin - tvm.ir_pass.LowerTVMBuiltin - tvm.ir_pass.NarrowChannelAccess - -.. automodule:: tvm.ir_pass - :members: - -tvm.ir_builder -~~~~~~~~~~~~~~ -.. automodule:: tvm.ir_builder - :members: - -tvm.make -~~~~~~~~ -.. automodule:: tvm.make - :members: diff --git a/docs/api/python/build.rst b/docs/api/python/driver.rst similarity index 94% rename from docs/api/python/build.rst rename to docs/api/python/driver.rst index 5eee7a5b23c4..1f1bc8c7cf7b 100644 --- a/docs/api/python/build.rst +++ b/docs/api/python/driver.rst @@ -15,10 +15,10 @@ specific language governing permissions and limitations under the License. -tvm.build ---------- +tvm.driver +---------- +.. automodule:: tvm.driver + .. autofunction:: tvm.lower .. autofunction:: tvm.build - -.. autofunction:: tvm.build_config diff --git a/docs/api/python/error.rst b/docs/api/python/error.rst index c32d82599af7..a228f7b8bd85 100644 --- a/docs/api/python/error.rst +++ b/docs/api/python/error.rst @@ -20,3 +20,4 @@ tvm.error .. automodule:: tvm.error :members: :imported-members: + :autosummary: diff --git a/docs/api/python/hybrid.rst b/docs/api/python/hybrid.rst index 5acae640d2de..1184c837d2de 100644 --- a/docs/api/python/hybrid.rst +++ b/docs/api/python/hybrid.rst @@ -18,11 +18,6 @@ tvm.hybrid ---------- .. automodule:: tvm.hybrid - -.. autosummary:: - - tvm.hybrid.parse - tvm.hybrid.script - -.. autofunction:: tvm.hybrid.parse -.. autofunction:: tvm.hybrid.script + :members: + :imported-members: + :autosummary: diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index b37d44eda7b3..f62a4b848650 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -21,23 +21,19 @@ Python API .. toctree:: :maxdepth: 2 - tvm runtime ndarray error ir target - intrin - tensor - schedule - build - function + tir + te + driver autotvm rpc contrib graph_runtime - dev - topi - vta/index hybrid relay/index + vta/index + topi diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst deleted file mode 100644 index 60141d020c9e..000000000000 --- a/docs/api/python/intrin.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -tvm.intrin ----------- -.. automodule:: tvm.intrin - -.. 
autosummary:: - - tvm.call_packed - tvm.call_pure_intrin - tvm.call_intrin - tvm.call_pure_extern - tvm.call_extern - tvm.call_llvm_intrin - tvm.register_intrin_rule - tvm.if_then_else - tvm.exp - tvm.log - tvm.floor - tvm.ceil - tvm.trunc - tvm.round - tvm.nearbyint - tvm.abs - tvm.isnan - -.. autofunction:: tvm.call_packed -.. autofunction:: tvm.call_pure_intrin -.. autofunction:: tvm.call_intrin -.. autofunction:: tvm.call_pure_extern -.. autofunction:: tvm.call_extern -.. autofunction:: tvm.call_llvm_intrin -.. autofunction:: tvm.register_intrin_rule -.. autofunction:: tvm.if_then_else -.. autofunction:: tvm.exp -.. autofunction:: tvm.log -.. autofunction:: tvm.floor -.. autofunction:: tvm.ceil -.. autofunction:: tvm.trunc -.. autofunction:: tvm.round -.. autofunction:: tvm.nearbyint -.. autofunction:: tvm.abs -.. autofunction:: tvm.isnan diff --git a/docs/api/python/ir.rst b/docs/api/python/ir.rst index 9e7df8ffce69..1f0dc0c5e23c 100644 --- a/docs/api/python/ir.rst +++ b/docs/api/python/ir.rst @@ -20,3 +20,4 @@ tvm.ir .. automodule:: tvm.ir :members: :imported-members: + :autosummary: diff --git a/docs/api/python/ndarray.rst b/docs/api/python/ndarray.rst index 6f00eedac3e9..aa828905ca21 100644 --- a/docs/api/python/ndarray.rst +++ b/docs/api/python/ndarray.rst @@ -23,14 +23,5 @@ tvm.runtime.ndarray :members: :inherited-members: - -.. autoclass:: tvm.runtime.TVMContext - :members: - -.. autofunction:: tvm.context -.. autofunction:: tvm.cpu -.. autofunction:: tvm.gpu -.. autofunction:: tvm.opencl -.. autofunction:: tvm.metal .. autofunction:: tvm.nd.array .. autofunction:: tvm.nd.empty diff --git a/docs/api/python/relay/index.rst b/docs/api/python/relay/index.rst index 90746b8e5d4e..b286386b1230 100644 --- a/docs/api/python/relay/index.rst +++ b/docs/api/python/relay/index.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -Relay API +tvm.relay ========= This document contains the Python API for the Relay frontend, optimizer, and @@ -35,9 +35,7 @@ compiler stack. image analysis transform - module nn op scope_builder - ty vision diff --git a/docs/api/python/relay/ty.rst b/docs/api/python/relay/ty.rst deleted file mode 100644 index 5bd5321887a5..000000000000 --- a/docs/api/python/relay/ty.rst +++ /dev/null @@ -1,49 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -tvm.relay.ty ------------- - -.. automodule:: tvm.relay.ty - :members: - -.. autoclass:: tvm.relay.ty.Type - :members: - -.. autoclass:: tvm.relay.ty.TensorType - :members: - -.. autoclass:: tvm.relay.ty.Kind - :members: - -.. autoclass:: tvm.relay.ty.TypeVar - :members: - -.. autoclass:: tvm.relay.ty.TypeConstraint - :members: - -.. autoclass:: tvm.relay.ty.TupleType - :members: - -.. autoclass:: tvm.relay.ty.FuncType - :members: - -.. 
autoclass:: tvm.relay.ty.IncompleteType - :members: - -.. autoclass:: tvm.relay.ty.TypeRelation - :members: diff --git a/docs/api/python/rpc.rst b/docs/api/python/rpc.rst index da7dc83b1889..d5d119201854 100644 --- a/docs/api/python/rpc.rst +++ b/docs/api/python/rpc.rst @@ -18,22 +18,6 @@ tvm.rpc ------- .. automodule:: tvm.rpc - -.. autofunction:: tvm.rpc.connect -.. autofunction:: tvm.rpc.connect_tracker - -.. autoclass:: tvm.rpc.TrackerSession - :members: - :inherited-members: - -.. autoclass:: tvm.rpc.RPCSession - :members: - :inherited-members: - -.. autoclass:: tvm.rpc.LocalSession - :members: - :inherited-members: - -.. autoclass:: tvm.rpc.Server - :members: - :inherited-members: + :members: + :imported-members: + :autosummary: diff --git a/docs/api/python/runtime.rst b/docs/api/python/runtime.rst index 75523cf7b81d..9e395712aa6d 100644 --- a/docs/api/python/runtime.rst +++ b/docs/api/python/runtime.rst @@ -19,6 +19,11 @@ tvm.runtime ----------- .. automodule:: tvm.runtime + :members: + :imported-members: + :exclude-members: NDArray + :autosummary: + .. autoclass:: tvm.runtime.PackedFunc :members: diff --git a/docs/api/python/target.rst b/docs/api/python/target.rst index 625b98e9de43..b3e763379ab6 100644 --- a/docs/api/python/target.rst +++ b/docs/api/python/target.rst @@ -20,3 +20,4 @@ tvm.target .. automodule:: tvm.target :members: :imported-members: + :autosummary: diff --git a/docs/api/python/schedule.rst b/docs/api/python/te.rst similarity index 66% rename from docs/api/python/schedule.rst rename to docs/api/python/te.rst index c5e902980692..dc3d3dacd2ca 100644 --- a/docs/api/python/schedule.rst +++ b/docs/api/python/te.rst @@ -15,22 +15,16 @@ specific language governing permissions and limitations under the License. -tvm.schedule ------------- -.. automodule:: tvm.schedule - -.. autoclass:: tvm.schedule.IterVar - :members: - -.. autoclass:: tvm.schedule.Buffer - :members: - -.. autofunction:: tvm.create_schedule - -.. autoclass:: tvm.schedule.Schedule - :members: - :inherited-members: - -.. autoclass:: tvm.schedule.Stage - :members: - :inherited-members: +tvm.te +------ +.. Exclude the ops imported from tir. + +.. automodule:: tvm.te + :members: + :imported-members: + :exclude-members: + exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil, + trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else, + div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod, + comm_reducer, min, max, sum + :autosummary: diff --git a/docs/api/python/relay/module.rst b/docs/api/python/tir.rst similarity index 86% rename from docs/api/python/relay/module.rst rename to docs/api/python/tir.rst index ae736db0532f..d1017cdb46ef 100644 --- a/docs/api/python/relay/module.rst +++ b/docs/api/python/tir.rst @@ -15,10 +15,10 @@ specific language governing permissions and limitations under the License. -tvm.relay.module ----------------- - -.. automodule:: tvm.relay.module - -.. autoclass:: tvm.relay.module.Module - :members: +tvm.tir +------- +.. automodule:: tvm.tir + :members: + :imported-members: + :exclude-members: PrimExpr + :autosummary: diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index 75a4271291bf..269d42dc3621 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -TOPI +topi ---- .. 
automodule:: topi diff --git a/docs/api/python/vta/index.rst b/docs/api/python/vta/index.rst index 4975032179e4..479b8394f0cb 100644 --- a/docs/api/python/vta/index.rst +++ b/docs/api/python/vta/index.rst @@ -15,8 +15,8 @@ specific language governing permissions and limitations under the License. -VTA API -======= +vta +=== This document contains the python API to VTA compiler toolchain. diff --git a/docs/conf.py b/docs/conf.py index 0b29f2816c8b..3ca622d6ff18 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -49,9 +49,9 @@ # General information about the project. project = u'tvm' -author = u'%s developers' % project -copyright = u'2018, %s' % author -github_doc_root = 'https://github.com/tqchen/tvm/tree/master/docs/' +author = u'Apache Software Foundation' +copyright = u'2019, %s' % author +github_doc_root = 'https://github.com/apache/incubator-tvm/tree/master/docs/' # add markdown parser CommonMarkParser.github_doc_root = github_doc_root @@ -73,6 +73,7 @@ 'sphinx.ext.napoleon', 'sphinx.ext.mathjax', 'sphinx_gallery.gen_gallery', + 'autodocsumm' ] breathe_projects = {'tvm' : 'doxygen/xml/'} @@ -139,9 +140,6 @@ # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False -# sort autodoc order by the source file. -autodoc_member_order = 'bysource' - # -- Options for HTML output ---------------------------------------------- # The theme is set by the make target @@ -199,9 +197,9 @@ def run_doxygen(folder): intersphinx_mapping = { 'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None), - 'numpy': ('http://docs.scipy.org/doc/numpy/', None), - 'scipy': ('http://docs.scipy.org/doc/scipy/reference', None), - 'matplotlib': ('http://matplotlib.org/', None), + 'numpy': ('https://docs.scipy.org/doc/numpy/', None), + 'scipy': ('https://docs.scipy.org/doc/scipy/reference', None), + 'matplotlib': ('https://matplotlib.org/', None), } from sphinx_gallery.sorting import ExplicitOrder @@ -242,8 +240,8 @@ def setup(app): 'doc_module': ('tvm', 'numpy'), 'reference_url': { 'tvm': None, - 'matplotlib': 'http://matplotlib.org', - 'numpy': 'http://docs.scipy.org/doc/numpy-1.9.1'}, + 'matplotlib': 'https://matplotlib.org/', + 'numpy': 'https://docs.scipy.org/doc/numpy/'}, 'examples_dirs': examples_dirs, 'gallery_dirs': gallery_dirs, 'subsection_order': subsection_order, @@ -251,3 +249,7 @@ def setup(app): 'find_mayavi_figures': False, 'expected_failing_examples': [] } + +autodoc_default_options = { + 'member-order': 'bysource', +} diff --git a/docs/contribute/committer_guide.rst b/docs/contribute/committer_guide.rst index cc89a43aabdb..6e553bd42a39 100644 --- a/docs/contribute/committer_guide.rst +++ b/docs/contribute/committer_guide.rst @@ -59,7 +59,7 @@ Here are some example applications of this principle: Shepherd a Pull Request ----------------------- +----------------------- Here are some tips to shepherd a pull request. You can also take a look at the :ref:`code_review_guide`. diff --git a/docs/contribute/error_handling.rst b/docs/contribute/error_handling.rst index 4d5e5c54f03c..8f71ee61aeb6 100644 --- a/docs/contribute/error_handling.rst +++ b/docs/contribute/error_handling.rst @@ -15,25 +15,27 @@ specific language governing permissions and limitations under the License. -.. _error_guide: +.. _error-handling-guide: Error Handling Guide ==================== TVM contains structured error classes to indicate specific types of error. 
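For example, user code that knows how to recover from one category can catch
just that error class and let everything else propagate (a sketch;
``convert_model`` is a hypothetical stand-in for any conversion entry point):

.. code:: python

    import tvm

    try:
        mod, params = convert_model(model)   # hypothetical frontend call
    except tvm.error.OpNotImplemented as err:
        print('unsupported operator:', err)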
Please raise a specific error type when possible, so that users can write code to handle a specific error category if necessary. - -All the error types are defined in :any:`tvm.error` namespace. You can directly raise the specific error object in python. In other languages like c++, you simply add ``:`` prefix to the error message(see below). +.. note:: + + Please refer to :py:mod:`tvm.error` for the list of errors. + Raise a Specific Error in C++ ----------------------------- You can add ``:`` prefix to your error message to raise an error of the corresponding type. Note that you do not have to add a new type -:any:`tvm.error.TVMError` will be raised by default when +:py:class:`tvm.error.TVMError` will be raised by default when there is no error type prefix in the message. This mechanism works for both ``LOG(FATAL)`` and ``CHECK`` macros. The following code gives an example on how to do so. @@ -55,7 +57,7 @@ Here is what will happen if we call the registered function: .. code:: >>> import tvm - >>> tvm._api_internal._ErrorTest(0, 1) + >>> tvm.testing.ErrorTest(0, 1) Traceback (most recent call last): File "", line 1, in File "/path/to/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__ @@ -68,7 +70,7 @@ Here is what will happen if we call the registered function: File "/path/to/tvm/src/api/api_test.cc", line 80 ValueError: Check failed: x == y (0 vs. 1) : expect x and y to be equal. >>> - >>> tvm._api_internal._ErrorTest(1, 1) + >>> tvm.testing.ErrorTest(1, 1) Traceback (most recent call last): File "", line 1, in File "/path/to/tvm/python/tvm/_ffi/_ctypes/function.py", line 190, in __call__ @@ -95,7 +97,7 @@ We try to keep a reasonable amount of error types. If you feel there is a need to add a new error type, do the following steps: - Send a RFC proposal with a description and usage examples in the current codebase. -- Add the new error type to :any:`tvm.error` with clear documents. +- Add the new error type to :py:mod:`tvm.error` with clear documents. - Update the list in this file to include the new error type. - Change the code to use the new error type. @@ -118,22 +120,3 @@ error messages when necessary. If we need to introduce a wrapper function that constructs multi-line error messages, please put wrapper in the same file so other developers can look up the implementation easily. - - -System-wide Errors ------------------- - -.. autoclass:: tvm.error.TVMError - -.. autoclass:: tvm.error.InternalError - - -Frontend Errors ---------------- -.. autoclass:: tvm.error.OpNotImplemented - -.. autoclass:: tvm.error.OpAttributeInvalid - -.. autoclass:: tvm.error.OpAttributeRequired - -.. autoclass:: tvm.error.OpAttributeNotImplemented diff --git a/docs/dev/runtime.rst b/docs/dev/runtime.rst index bb129b038aa9..353b3392d181 100644 --- a/docs/dev/runtime.rst +++ b/docs/dev/runtime.rst @@ -183,7 +183,7 @@ RPC server on iPhone/android/raspberry pi or even the browser. The cross compila This instant feedback gives us a lot of advantages. For example, to test the correctness of generated code on iPhone, we no longer have to write test-cases in swift/objective-c from scratch -- We can use RPC to execute on iPhone, copy the result back and do verification on the host via numpy. We can also do the profiling using the same script. TVM Object and Compiler Stack ---------------------------- +----------------------------- As we mentioned earlier, we build compiler stack API on top of the PackedFunc runtime system. We faced a constant changing of the compiler API for the need of research. 
We need a new language object or IR node whenever we want to test out new primitives. diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py index 2ebea86d8e3e..a7ddf851db37 100644 --- a/python/tvm/autotvm/tuner/xgboost_tuner.py +++ b/python/tvm/autotvm/tuner/xgboost_tuner.py @@ -37,28 +37,32 @@ class XGBTuner(ModelBasedTuner): Note on choosing feature type: For single task tuning, 'itervar' and 'knob' are good. - 'itervar' is more accurate but 'knob' is much faster. - There are some constraints on 'itervar', if you meet - problems with feature extraction when using 'itervar', - you can switch to 'knob'. + 'itervar' is more accurate but 'knob' is much faster. + There are some constraints on 'itervar', if you meet + problems with feature extraction when using 'itervar', + you can switch to 'knob'. For cross-shape tuning (e.g. many convolutions with different shapes), - 'itervar' and 'curve' has better transferability, - 'knob' is faster. + 'itervar' and 'curve' has better transferability, + 'knob' is faster. + For cross-device or cross-operator tuning, you can use 'curve' only. loss_type: str If is 'reg', use regression loss to train cost model. - The cost model predicts the normalized flops. + The cost model predicts the normalized flops. If is 'rank', use pairwise rank loss to train cost model. - The cost model predicts relative rank score. + The cost model predicts relative rank score. + num_threads: int, optional The number of threads. optimizer: str or ModelOptimizer, optional If is 'sa', use a default simulated annealing optimizer. Otherwise it should be a ModelOptimizer object. + diversity_filter_ratio: int or float, optional If is not None, the tuner will first select top-(plan_size * diversity_filter_ratio) candidates according to the cost model and then pick batch_size of them according to the diversity metric. + log_interval: int, optional The verbose level. If is 0, output nothing. diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 6b7c099ff705..73235f71c77b 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -25,19 +25,23 @@ def create(graph_json_str, libmod, ctx): """Create a runtime executor module given a graph and module. + Parameters ---------- graph_json_str : str or graph class The graph to be deployed in json format output by json graph. The graph can only contain one operator(tvm_op) that points to the name of PackedFunc in the libmod. - libmod : tvm.Module + + libmod : tvm.runtime.Module The module of the corresponding function + ctx : TVMContext or list of TVMContext The context to deploy the module. It can be local or remote when there is only one TVMContext. Otherwise, the first context in the list will be used as this purpose. All context should be given for heterogeneous execution. + Returns ------- graph_module : GraphModule @@ -61,11 +65,14 @@ def create(graph_json_str, libmod, ctx): def get_device_ctx(libmod, ctx): """Parse and validate all the device context(s). + Parameters ---------- - libmod : tvm.Module + libmod : tvm.runtime.Module The module of the corresponding function + ctx : TVMContext or list of TVMContext + Returns ------- ctx : list of TVMContext @@ -113,12 +120,12 @@ class GraphModule(object): Parameters ---------- - module : Module + module : tvm.runtime.Module The internal tvm module that holds the actual graph functions. 
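A typical call sequence against the interface documented here, for reference
(a sketch; ``graph_json``, ``lib`` and ``params`` stand for the artifacts
produced by ``relay.build``):

.. code:: python

    import tvm
    from tvm.contrib import graph_runtime

    m = graph_runtime.create(graph_json, lib, tvm.cpu(0))
    m.set_input(**params)
    m.run()
    out = m.get_output(0)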
Attributes ---------- - module : Module + module : tvm.runtime.Module The internal tvm module that holds the actual graph functions. """ diff --git a/python/tvm/error.py b/python/tvm/error.py index 083c7e825255..02bc90b7cf8f 100644 --- a/python/tvm/error.py +++ b/python/tvm/error.py @@ -20,6 +20,10 @@ See the example sections for for suggested message conventions. To make the code more readable, we recommended developers to copy the examples and raise errors with the same message convention. + +.. note:: + + Please also refer to :ref:`error-handling-guide`. """ from tvm._ffi.base import register_error, TVMError diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index c76d5ed37135..9ee43438f83d 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1424,10 +1424,12 @@ def batch_norm(data, Besides the inputs and the outputs, this operator accepts two auxiliary states, ``moving_mean`` and ``moving_var``, which are *k*-length - vectors. They are global statistics for the whole dataset, which are updated by:: + vectors. They are global statistics for the whole dataset, which are updated by - moving_mean = moving_mean * momentum + data_mean * (1 - momentum) - moving_var = moving_var * momentum + data_var * (1 - momentum) + .. code:: python + + moving_mean = moving_mean * momentum + data_mean * (1 - momentum) + moving_var = moving_var * momentum + data_var * (1 - momentum) The parameter ``axis`` specifies which axis of the input shape denotes the 'channel' (separately normalized groups). The default is 1. diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index f1f8dd5a8c90..898038dea004 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -847,6 +847,7 @@ def clip(a, a_min, a_max): Examples -------- .. code:: python + x = relay.Constant(tvm.nd.array([0, 1, 5, 3, 4, 2])) relay.clip(x, 1., 4.) # [1, 1, 4, 3, 4, 2] diff --git a/python/tvm/target/__init__.py b/python/tvm/target/__init__.py index 3975f30e644a..3460be47aa95 100644 --- a/python/tvm/target/__init__.py +++ b/python/tvm/target/__init__.py @@ -51,7 +51,7 @@ It is useful in environments where dynamic loading api like dlopen is banned. The system lib will be available as long as the result code is linked by the program. -We can use :py:func:`~tvm.target.create` to create a tvm.target.Target from the target string. +We can use :py:func:`tvm.target.create` to create a tvm.target.Target from the target string. We can also use other specific function in this module to create specific targets. 
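For example, the following are two equivalent ways to obtain a CUDA target
(a short sketch; any of the helper functions listed above can be used the
same way):

.. code:: python

    import tvm

    target = tvm.target.create("cuda")
    target = tvm.target.cuda()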
""" from .target import Target, create diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 8405bb10720f..e6046cef1839 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -30,12 +30,12 @@ class Target(Object): ---- Do not use class constructor, you can create target using the following functions - - :py:func:`~tvm.target.create` create target from string - - :py:func:`~tvm.target.arm_cpu` create arm_cpu target - - :py:func:`~tvm.target.cuda` create CUDA target - - :py:func:`~tvm.target.rocm` create ROCM target - - :py:func:`~tvm.target.mali` create Mali target - - :py:func:`~tvm.target.intel_graphics` create Intel Graphics target + - :py:func:`tvm.target.create` create target from string + - :py:func:`tvm.target.arm_cpu` create arm_cpu target + - :py:func:`tvm.target.cuda` create CUDA target + - :py:func:`tvm.target.rocm` create ROCM target + - :py:func:`tvm.target.mali` create Mali target + - :py:func:`tvm.target.intel_graphics` create Intel Graphics target """ def __new__(cls): # Always override new to enable class @@ -262,7 +262,7 @@ def create(target_str): Note ---- - See the note on :py:mod:`~tvm.target` on target string format. + See the note on :py:mod:`tvm.target` on target string format. """ if isinstance(target_str, Target): return target_str diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py index 0564fff0e5cb..1580da369c33 100644 --- a/python/tvm/te/__init__.py +++ b/python/tvm/te/__init__.py @@ -15,12 +15,16 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=unused-import, redefined-builtin, wildcard-import -"""Namespace for Tensor-level IR""" +"""Namespace for Tensor Expression Language +""" # expose all operators in tvm tir.op -from tvm.tir.op import * +from tvm.tir import exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil +from tvm.tir import trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else +from tvm.tir import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod +from tvm.tir import comm_reducer, min, max, sum from .schedule import Schedule, create_schedule -from .tensor import TensorSlice, Tensor +from .tensor import Tensor from .tensor_intrin import decl_tensor_intrin from .tag import tag_scope from .operation import placeholder, compute, scan, extern, var, size_var From 406b5f764d0454e9641880310249a69b2fc59e9b Mon Sep 17 00:00:00 2001 From: Andrew Date: Tue, 18 Feb 2020 18:03:03 -0800 Subject: [PATCH 20/73] Fix tvm.target.generic_func runtime detection (#4910) --- python/tvm/target/generic_func.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/target/generic_func.py b/python/tvm/target/generic_func.py index 862fbedee0a4..13f280a5ab1a 100644 --- a/python/tvm/target/generic_func.py +++ b/python/tvm/target/generic_func.py @@ -20,10 +20,10 @@ try: from decorator import decorate -except ImportError as err_msg: +except ImportError: # Allow decorator to be missing in runtime - if _LIB_NAME != "libtvm_runtime.so": - raise err_msg + if not tvm._ffi.base._RUNTIME_ONLY: + raise from tvm.runtime import Object from . 
target import Target From fccf2268c857127739fadb0349ef4bafaf66c282 Mon Sep 17 00:00:00 2001 From: hcyang Date: Wed, 19 Feb 2020 14:33:16 +0800 Subject: [PATCH 21/73] [RELAY][FRONTEND][TF] Fix FuseBatchNorm output cast error if need_cast is True (#4894) --- python/tvm/relay/frontend/tensorflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index f920682de2c7..587b07673fbe 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -897,6 +897,7 @@ def _impl(inputs, attr, params): disables=['momentum'])(inputs, attr) if need_cast: + out = _expr.TupleGetItem(out.astuple(), 0) out = _op.cast(out, dtype=attr['T'].name) return out return _impl From 18295b278678017cfd7cb0bdbd3a9254dd2f7936 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 19 Feb 2020 13:14:42 -0800 Subject: [PATCH 22/73] [REFACTOR] Polish ffi convention. (#4912) * [REFACTOR] Polish ffi convention. - Remove the src/api, keep registration local to the c++ function. - Remove the api_internal as it is no longer needed. * Update the codebase walk through --- CMakeLists.txt | 2 +- docs/dev/codebase_walkthrough.rst | 32 +-- python/tvm/_api_internal.py | 25 -- python/tvm/_ffi/registry.py | 17 +- src/api/api_arith.cc | 153 ----------- src/api/api_ir.cc | 237 ------------------ src/api/api_lang.cc | 223 ---------------- src/api/api_schedule.cc | 63 ----- src/arith/analyzer.cc | 60 +++++ src/arith/bound_deducer.cc | 13 +- src/arith/const_int_bound.cc | 8 + src/arith/detect_linear_equation.cc | 7 + src/arith/domain_touched.cc | 3 + src/arith/int_set.cc | 23 ++ src/arith/modular_set.cc | 7 + src/ir/expr.cc | 8 + .../api_test.cc => support/ffi_testing.cc} | 6 +- src/te/operation/compute_op.cc | 5 + src/te/operation/extern_op.cc | 5 + src/te/operation/hybrid_op.cc | 5 + src/te/operation/placeholder_op.cc | 6 + src/te/operation/scan_op.cc | 5 + src/te/operation/tensor_compute_op.cc | 5 + src/te/schedule/auto_inline_elem_wise.cc | 8 + src/te/schedule/bound.cc | 4 + src/te/schedule/graph.cc | 20 ++ src/te/schedule/schedule_lang.cc | 114 +++++++++ src/te/schedule/schedule_ops.cc | 9 + src/te/tensor.cc | 29 +++ src/tir/ir/expr.cc | 134 ++++++++++ src/tir/ir/op.cc | 86 +++++++ src/tir/ir/stmt.cc | 90 ++++++- src/{api/api_pass.cc => tir/pass/ffi_api.cc} | 6 +- tests/python/unittest/test_runtime_error.py | 6 +- 34 files changed, 672 insertions(+), 752 deletions(-) delete mode 100644 python/tvm/_api_internal.py delete mode 100644 src/api/api_arith.cc delete mode 100644 src/api/api_ir.cc delete mode 100644 src/api/api_lang.cc delete mode 100644 src/api/api_schedule.cc rename src/{api/api_test.cc => support/ffi_testing.cc} (97%) rename src/{api/api_pass.cc => tir/pass/ffi_api.cc} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8540a661f99d..9d25e4a9ba58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,7 +133,7 @@ file(GLOB_RECURSE COMPILER_SRCS src/tir/*.cc src/driver/*.cc src/printer/*.cc - src/api/*.cc + src/support/*.cc ) file(GLOB CODEGEN_SRCS diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst index 0732c26f0c58..8513ce5bd89d 100644 --- a/docs/dev/codebase_walkthrough.rst +++ b/docs/dev/codebase_walkthrough.rst @@ -55,7 +55,7 @@ We use a simple example that uses the low level TVM API directly. 
The example is B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") -Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``python/tvm/tensor.py``. The Python ``Tensor`` is backed by C++ ``Tensor``, implemented in ``include/tvm/tensor.h`` and ``src/lang/tensor.cc``. All Python types in TVM can be thought of as a handle to the underlying C++ type with the same name. If you look at the definition of Python ``Tensor`` type below, you can see it is a subclass of ``Object``. +Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``python/tvm/te/tensor.py``. The Python ``Tensor`` is backed by C++ ``Tensor``, implemented in ``include/tvm/te/tensor.h`` and ``src/te/tensor.cc``. All Python types in TVM can be thought of as a handle to the underlying C++ type with the same name. If you look at the definition of Python ``Tensor`` type below, you can see it is a subclass of ``Object``. :: @@ -68,24 +68,12 @@ Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``pytho The object protocol is the basis of exposing C++ types to frontend languages, including Python. The way TVM implements Python wrapping is not straightforward. It is briefly covered in `this document `_, and details are in ``python/tvm/_ffi/`` if you are interested. -``Tensor`` is created by functions in ``python/tvm/api.py``, which in turn calls into C++ functions exposed in ``src/api/api_lang.cc``. All C++ functions that are callable from Python are exposed in the ``src/api`` subdirectory. For example, the ``tvm.compute()`` function above calls into ``_ComputeOp`` API exposed in ``src/api/api_lang.cc``: - -:: - - TVM_REGISTER_GLOBAL("_ComputeOp") - .set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ComputeOpNode::make(args[0], - args[1], - args[2], - args[3], - args[4]); - }); - We use the ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of a `PackedFunc `_. A ``PackedFunc`` is another mechanism by which TVM implements interoperability between C++ and Python. In particular, this is what makes calling Python functions from the C++ codebase very easy. +You can also checkout `FFI Navigator `_ which allows you to navigate between python and c++ FFI calls. -A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/tensor.py``, ``include/tvm/operation.h``, and ``src/tvm/op`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``. +A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/te/tensor.py``, ``include/tvm/te/operation.h``, and ``src/tvm/te/operation`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``. -We pass the operation corresponding to the output tensor ``C`` to ``tvm.create_schedule()`` function in ``python/tvm/schedule.py``. +We pass the operation corresponding to the output tensor ``C`` to ``tvm.create_schedule()`` function in ``python/tvm/te/schedule.py``. :: @@ -103,7 +91,7 @@ This function is mapped to the C++ function in ``include/tvm/schedule.h``. ``Stage`` corresponds to one ``Operation``. 
In the vector add example above, there are two placeholder ops and one compute op, so the schedule ``s`` contains three stages. Each ``Stage`` holds information about a loop nest structure, types of each loop (``Parallel``, ``Vectorized``, ``Unrolled``), and where to execute its computation in the loop nest of the next ``Stage``, if any. -``Schedule`` and ``Stage`` are defined in ``tvm/python/schedule.py``, ``include/tvm/schedule.h``, and ``src/schedule/schedule_ops.cc``. +``Schedule`` and ``Stage`` are defined in ``tvm/python/te/schedule.py``, ``include/tvm/te/schedule.h``, and ``src/te/schedule/schedule_ops.cc``. To keep it simple, we call ``tvm.build(...)`` on the default schedule created by ``create_schedule()`` function above. @@ -112,7 +100,7 @@ To keep it simple, we call ``tvm.build(...)`` on the default schedule created by target = "cuda" fadd = tvm.build(s, [A, B, C], target) -``tvm.build()``, defined in ``python/tvm/build_module.py``, takes a schedule, input and output ``Tensor``, and a target, and returns a ``tvm.Module`` object, defined in ``python/tvm/module.py``. A ``Module`` object contains a compiled function which can be invoked with function call syntax. +``tvm.build()``, defined in ``python/tvm/driver/build_module.py``, takes a schedule, input and output ``Tensor``, and a target, and returns a :py:class:`tvm.runtime.Module` object. A :py:class:`tvm.runtime.Module` object contains a compiled function which can be invoked with function call syntax. The process of ``tvm.build()`` can be divided into two steps: @@ -133,14 +121,14 @@ Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_modu stmt = schedule.ScheduleOps(sch, bounds) ... -Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``. For more information on how bound inference works, see `InferBound Pass`_. +Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/te/schedule/bound.cc``, ``src/te/schedule/graph.cc`` and ``src/te/schedule/message_passing.cc``. For more information on how bound inference works, see `InferBound Pass`_. .. _InferBound Pass: http://docs.tvm.ai/dev/inferbound.html -``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects those changes. ``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``. +``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects those changes. ``ScheduleOps()`` is defined in ``src/te/schedule/schedule_ops.cc``. -Next, we apply a number of lowering passes to ``stmt``. These passes are implemented in ``src/pass`` subdirectory. For example, if you have applied ``vectorize`` or ``unroll`` primitives to your schedule, they are applied in loop vectorization and unrolling passes below. +Next, we apply a number of lowering passes to ``stmt``. 
These passes are implemented in ``src/tir/pass`` subdirectory. For example, if you have applied ``vectorize`` or ``unroll`` primitives to your schedule, they are applied in loop vectorization and unrolling passes below. :: @@ -157,7 +145,7 @@ Next, we apply a number of lowering passes to ``stmt``. These passes are impleme After lowering is done, ``build()`` function generates target machine code from the lowered function. This code can contain SSE or AVX instructions if you target x86, or PTX instructions for CUDA target. In addition to target specific machine code, TVM also generates host side code that is responsible for memory management, kernel launch etc. -Code generation is done by ``build_module()`` function, defined in ``python/tvm/codegen.py``. On the C++ side, code generation is implemented in ``src/codegen`` subdirectory. ``build_module()`` Python function will reach ``Build()`` function below in ``src/codegen/codegen.cc``: +Code generation is done by ``build_module()`` function, defined in ``python/tvm/target/codegen.py``. On the C++ side, code generation is implemented in ``src/target/codegen`` subdirectory. ``build_module()`` Python function will reach ``Build()`` function below in ``src/target/codegen/codegen.cc``: :: diff --git a/python/tvm/_api_internal.py b/python/tvm/_api_internal.py deleted file mode 100644 index 571523757cac..000000000000 --- a/python/tvm/_api_internal.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Namespace of internal API - -The functions in this namespace are automatically exported from C++ side via PackedFunc -that is registered by "TVM_REGISTER_*" macro. This way makes calling Python functions from C++ -side very easily. - -Each string starts with "_" in the "TVM_REGISTER_*" macro is an internal API. You can find -all the functions in "api_lang.cc", "api_base.cc", "api_arith.cc" and "api_ir.cc" under "src/api". -""" diff --git a/python/tvm/_ffi/registry.py b/python/tvm/_ffi/registry.py index be1578550a3b..e4b8b18b4805 100644 --- a/python/tvm/_ffi/registry.py +++ b/python/tvm/_ffi/registry.py @@ -19,7 +19,6 @@ """FFI registry to register function and objects.""" import sys import ctypes -from .. 
import _api_internal from .base import _LIB, check_call, py_str, c_str, string_types, _FFI_MODE, _RUNTIME_ONLY @@ -288,17 +287,11 @@ def _init_api_prefix(module_name, prefix): module = sys.modules[module_name] for name in list_global_func_names(): - if prefix == "api": - fname = name - if name.startswith("_"): - target_module = sys.modules["tvm._api_internal"] - else: - target_module = module - else: - if not name.startswith(prefix): - continue - fname = name[len(prefix)+1:] - target_module = module + if not name.startswith(prefix): + continue + + fname = name[len(prefix)+1:] + target_module = module if fname.find(".") != -1: continue diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc deleted file mode 100644 index 3942f6ef0f20..000000000000 --- a/src/api/api_arith.cc +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Implementation of API functions related to arith - * \file api_arith.cc - */ -#include -#include -#include -#include - -#include -#include -#include - -#include - -namespace tvm { -namespace arith { - -TVM_REGISTER_GLOBAL("arith.intset_single_point") -.set_body_typed(IntSet::single_point); - -TVM_REGISTER_GLOBAL("arith.intset_vector") -.set_body_typed(IntSet::vector); - -TVM_REGISTER_GLOBAL("arith.intset_interval") -.set_body_typed(IntSet::interval); - - -TVM_REGISTER_GLOBAL("arith.DetectLinearEquation") -.set_body_typed(DetectLinearEquation); - -TVM_REGISTER_GLOBAL("arith.DetectClipBound") -.set_body_typed(DetectClipBound); - -TVM_REGISTER_GLOBAL("arith.DeduceBound") -.set_body_typed([]( - PrimExpr v, PrimExpr cond, - const Map hint_map, - const Map relax_map -) { - return DeduceBound(v, cond, hint_map, relax_map); -}); - - -TVM_REGISTER_GLOBAL("arith.DomainTouched") -.set_body_typed(DomainTouched); - -TVM_REGISTER_GLOBAL("arith.IntervalSetGetMin") -.set_body_method(&IntSet::min); - -TVM_REGISTER_GLOBAL("arith.IntervalSetGetMax") -.set_body_method(&IntSet::max); - -TVM_REGISTER_GLOBAL("arith.IntSetIsNothing") -.set_body_method(&IntSet::is_nothing); - -TVM_REGISTER_GLOBAL("arith.IntSetIsEverything") -.set_body_method(&IntSet::is_everything); - -ConstIntBound MakeConstIntBound(int64_t min_value, int64_t max_value) { - return ConstIntBound(min_value, max_value); -} - -TVM_REGISTER_GLOBAL("arith.ConstIntBound") -.set_body_typed(MakeConstIntBound); - -ModularSet MakeModularSet(int64_t coeff, int64_t base) { - return ModularSet(coeff, base); -} - -TVM_REGISTER_GLOBAL("arith.ModularSet") -.set_body_typed(MakeModularSet); - -TVM_REGISTER_GLOBAL("arith.CreateAnalyzer") -.set_body([](TVMArgs args, TVMRetValue* ret) { - using runtime::PackedFunc; - using runtime::TypedPackedFunc; - auto self = std::make_shared(); - auto f = [self](std::string name) -> PackedFunc { - if (name == 
"const_int_bound") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - *ret = self->const_int_bound(args[0]); - }); - } else if (name == "modular_set") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - *ret = self->modular_set(args[0]); - }); - } else if (name == "const_int_bound_update") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - self->const_int_bound.Update(args[0], args[1], args[2]); - }); - } else if (name == "Simplify") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - *ret = self->Simplify(args[0]); - }); - } else if (name == "rewrite_simplify") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - *ret = self->rewrite_simplify(args[0]); - }); - } else if (name == "canonical_simplify") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - *ret = self->canonical_simplify(args[0]); - }); - } else if (name == "int_set") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - *ret = self->int_set(args[0], args[1]); - }); - } else if (name == "bind") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - if (args[1].IsObjectRef()) { - self->Bind(args[0], args[1].operator Range()); - } else { - self->Bind(args[0], args[1].operator PrimExpr()); - } - }); - } else if (name == "enter_constraint_context") { - return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - // can't use make_shared due to noexcept(false) decl in destructor, - // see https://stackoverflow.com/a/43907314 - auto ctx = std::shared_ptr >( - new With(self.get(), args[0])); - auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { - ctx.reset(); - }; - *ret = PackedFunc(fexit); - }); - } - return PackedFunc(); - }; - *ret = TypedPackedFunc(f); -}); - -} // namespace arith -} // namespace tvm diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc deleted file mode 100644 index 1e71baf305d4..000000000000 --- a/src/api/api_ir.cc +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Implementation of API functions related to IR build - * \file api_ir.cc - */ -#include -#include -#include - -#include - -namespace tvm { -namespace tir { - -TVM_REGISTER_GLOBAL("tir.Var") -.set_body_typed([](std::string s, DataType t) { - return Var(s, t); - }); - -TVM_REGISTER_GLOBAL("tir.SizeVar") -.set_body_typed([](std::string s, DataType t) { - return SizeVar(s, t); - }); - -TVM_REGISTER_GLOBAL("tir.abs") -.set_body_typed(tvm::abs); - -TVM_REGISTER_GLOBAL("tir.isnan") -.set_body_typed(tvm::isnan); - -TVM_REGISTER_GLOBAL("tir.floor") -.set_body_typed(tvm::floor); - -TVM_REGISTER_GLOBAL("tir.ceil") -.set_body_typed(tvm::ceil); - -TVM_REGISTER_GLOBAL("tir.round") -.set_body_typed(tvm::round); - -TVM_REGISTER_GLOBAL("tir.nearbyint") -.set_body_typed(tvm::nearbyint); - -TVM_REGISTER_GLOBAL("tir.trunc") -.set_body_typed(tvm::trunc); - -TVM_REGISTER_GLOBAL("tir._cast") -.set_body_typed(tvm::cast); - -TVM_REGISTER_GLOBAL("ir.range_by_min_extent") -.set_body_typed(Range::make_by_min_extent); - - -TVM_REGISTER_GLOBAL("tir.SeqStmt") -.set_body_typed([](Array seq) { - return SeqStmt(std::move(seq)); -}); - -TVM_REGISTER_GLOBAL("tir.For") -.set_body_typed([]( - Var loop_var, PrimExpr min, PrimExpr extent, - int for_type, int device_api, Stmt body) { - return ForNode::make(loop_var, - min, - extent, - static_cast(for_type), - static_cast(device_api), - body); -}); - -TVM_REGISTER_GLOBAL("tir.Load") -.set_body([](TVMArgs args, TVMRetValue *ret) { - DataType t = args[0]; - if (args.size() == 3) { - *ret = LoadNode::make(t, args[1], args[2], const_true(t.lanes())); - } else { - *ret = LoadNode::make(t, args[1], args[2], args[3]); - } - }); - -TVM_REGISTER_GLOBAL("tir.Store") -.set_body([](TVMArgs args, TVMRetValue *ret) { - PrimExpr value = args[1]; - if (args.size() == 3) { - *ret = StoreNode::make(args[0], value, args[2], const_true(value.dtype().lanes())); - } else { - *ret = StoreNode::make(args[0], value, args[2], args[3]); - } - }); - -TVM_REGISTER_GLOBAL("tir.Realize") -.set_body_typed(RealizeNode::make); - -TVM_REGISTER_GLOBAL("tir.Call") -.set_body_typed([]( - DataType type, std::string name, - Array args, int call_type, - FunctionRef func, int value_index -) { - return CallNode::make(type, - name, - args, - static_cast(call_type), - func, - value_index); -}); - -TVM_REGISTER_GLOBAL("tir.CommReducer") -.set_body_typed(CommReducerNode::make); - -// make from two arguments -#define REGISTER_MAKE(NodeName) \ - TVM_REGISTER_GLOBAL("tir."#NodeName) \ - .set_body_typed(NodeName ## Node::make); \ - - -REGISTER_MAKE(Reduce); -REGISTER_MAKE(AttrStmt); - -REGISTER_MAKE(StringImm); - -REGISTER_MAKE(Add); -REGISTER_MAKE(Sub); -REGISTER_MAKE(Mul); -REGISTER_MAKE(Div); -REGISTER_MAKE(Mod); -REGISTER_MAKE(FloorDiv); -REGISTER_MAKE(FloorMod); -REGISTER_MAKE(Min); -REGISTER_MAKE(Max); -REGISTER_MAKE(EQ); -REGISTER_MAKE(NE); -REGISTER_MAKE(LT); -REGISTER_MAKE(LE); -REGISTER_MAKE(GT); -REGISTER_MAKE(GE); -REGISTER_MAKE(And); -REGISTER_MAKE(Or); - -REGISTER_MAKE(Not); -REGISTER_MAKE(Select); -REGISTER_MAKE(Ramp); -REGISTER_MAKE(Cast); -REGISTER_MAKE(Broadcast); -REGISTER_MAKE(Shuffle); -REGISTER_MAKE(Let); -REGISTER_MAKE(LetStmt); -REGISTER_MAKE(AssertStmt); -REGISTER_MAKE(ProducerConsumer); -REGISTER_MAKE(Provide); -REGISTER_MAKE(Prefetch); -REGISTER_MAKE(Free); -REGISTER_MAKE(IfThenElse); -REGISTER_MAKE(Evaluate); - -// overloaded, needs special handling -// has default args -TVM_REGISTER_GLOBAL("tir.Allocate") - .set_body_typed([]( - Var buffer_var, DataType type, Array extents, PrimExpr condition, Stmt 
body - ){ - return AllocateNode::make(buffer_var, type, extents, condition, body); - }); - -// operator overloading, smarter than make -#define REGISTER_MAKE_BINARY_OP(Node, Func) \ - TVM_REGISTER_GLOBAL("tir."#Node) \ - .set_body_typed([](PrimExpr a, PrimExpr b) { \ - return (Func(a, b)); \ - }) - -#define REGISTER_MAKE_BIT_OP(Node, Func) \ - TVM_REGISTER_GLOBAL("tir."#Node) \ - .set_body([](TVMArgs args, TVMRetValue *ret) { \ - bool lhs_is_int = args[0].type_code() == kDLInt; \ - bool rhs_is_int = args[1].type_code() == kDLInt; \ - if (lhs_is_int) { \ - *ret = (Func(args[0].operator int(), args[1].operator PrimExpr())); \ - } else if (rhs_is_int) { \ - *ret = (Func(args[0].operator PrimExpr(), args[1].operator int())); \ - } else { \ - *ret = (Func(args[0].operator PrimExpr(), args[1].operator PrimExpr())); \ - } \ - }) - - -REGISTER_MAKE_BINARY_OP(_OpAdd, operator+); -REGISTER_MAKE_BINARY_OP(_OpSub, operator-); -REGISTER_MAKE_BINARY_OP(_OpMul, operator*); -REGISTER_MAKE_BINARY_OP(_OpDiv, div); -REGISTER_MAKE_BINARY_OP(_OpMod, truncmod); -REGISTER_MAKE_BINARY_OP(_OpIndexDiv, indexdiv); -REGISTER_MAKE_BINARY_OP(_OpIndexMod, indexmod); -REGISTER_MAKE_BINARY_OP(_OpFloorDiv, floordiv); -REGISTER_MAKE_BINARY_OP(_OpFloorMod, floormod); -REGISTER_MAKE_BINARY_OP(_OpTruncDiv, truncdiv); -REGISTER_MAKE_BINARY_OP(_OpTruncMod, truncmod); -REGISTER_MAKE_BINARY_OP(_OpPow, pow); -REGISTER_MAKE_BINARY_OP(_OpMin, min); -REGISTER_MAKE_BINARY_OP(_OpMax, max); -REGISTER_MAKE_BINARY_OP(_OpEQ, operator==); -REGISTER_MAKE_BINARY_OP(_OpNE, operator!=); -REGISTER_MAKE_BINARY_OP(_OpLT, operator<); // NOLINT(*) -REGISTER_MAKE_BINARY_OP(_OpLE, operator<=); // NOLINT(*) -REGISTER_MAKE_BINARY_OP(_OpGT, operator>); // NOLINT(*) -REGISTER_MAKE_BINARY_OP(_OpGE, operator>=); -REGISTER_MAKE_BINARY_OP(_OpAnd, operator&&); -REGISTER_MAKE_BINARY_OP(_OpOr, operator||); -REGISTER_MAKE_BIT_OP(bitwise_and, operator&); -REGISTER_MAKE_BIT_OP(bitwise_or, operator|); -REGISTER_MAKE_BIT_OP(bitwise_xor, operator^); -REGISTER_MAKE_BIT_OP(left_shift, operator<<); // NOLINT(*) -REGISTER_MAKE_BIT_OP(right_shift, operator>>); -TVM_REGISTER_GLOBAL("tir._OpIfThenElse") -.set_body_typed([] (PrimExpr cond, PrimExpr true_value, PrimExpr false_value) { - return if_then_else(cond, true_value, false_value); -}); - -} // namespace tir -} // namespace tvm diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc deleted file mode 100644 index 613b82311aed..000000000000 --- a/src/api/api_lang.cc +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Implementation of API functions related to Higher DSL build. 
- * \file api_lang.cc - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace tvm { - -TVM_REGISTER_GLOBAL("tir.min_value") -.set_body_typed(min_value); - -TVM_REGISTER_GLOBAL("tir.max_value") -.set_body_typed(max_value); - -TVM_REGISTER_GLOBAL("ir.Range") -.set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = Range(args[0], args[1]); - }); - -namespace tir { -TVM_REGISTER_GLOBAL("tir.IterVar") -.set_body_typed([](Range dom, Var var, int iter_type, std::string thread_tag) { - return IterVarNode::make( - dom, var, - static_cast(iter_type), - thread_tag); -}); -} - -namespace te { -TVM_REGISTER_GLOBAL("te.Tensor") -.set_body_typed(TensorNode::make); - -TVM_REGISTER_GLOBAL("te.TensorIntrin") -.set_body_typed(TensorIntrinNode::make); - -TVM_REGISTER_GLOBAL("te.TensorIntrinCall") -.set_body_typed(TensorIntrinCallNode::make); - -TVM_REGISTER_GLOBAL("te.TensorEqual") -.set_body_method(&Tensor::operator==); - -TVM_REGISTER_GLOBAL("te.TensorHash") -.set_body_typed([](Tensor tensor) -> int64_t { - return static_cast(std::hash()(tensor)); - }); - -TVM_REGISTER_GLOBAL("te.Placeholder") -.set_body_typed([](Array shape, DataType dtype, std::string name) { - return placeholder(shape, dtype, name); -}); - -TVM_REGISTER_GLOBAL("te.ComputeOp") -.set_body_typed(ComputeOpNode::make); - -TVM_REGISTER_GLOBAL("te.ScanOp") -.set_body_typed(ScanOpNode::make); - -TVM_REGISTER_GLOBAL("te.TensorComputeOp") -.set_body_typed(TensorComputeOpNode::make); - -TVM_REGISTER_GLOBAL("te.ExternOp") -.set_body_typed(ExternOpNode::make); - -TVM_REGISTER_GLOBAL("te.HybridOp") -.set_body_typed(HybridOpNode::make); - -TVM_REGISTER_GLOBAL("te.OpGetOutput") -.set_body_typed([](Operation op, int64_t output) { - return op.output(static_cast(output)); -}); - -TVM_REGISTER_GLOBAL("te.OpNumOutputs") -.set_body_method(&OperationNode::num_outputs); - -TVM_REGISTER_GLOBAL("te.OpInputTensors") -.set_body_method(&OperationNode::InputTensors); - -TVM_REGISTER_GLOBAL("te.CreateSchedule") -.set_body_typed(create_schedule); - -TVM_REGISTER_GLOBAL("te.StageSetScope") -.set_body_method(&Stage::set_scope); - -TVM_REGISTER_GLOBAL("te.StageBind") -.set_body_method(&Stage::bind); - -TVM_REGISTER_GLOBAL("te.StageSplitByFactor") -.set_body_typed([](Stage stage, IterVar parent, PrimExpr factor) { - IterVar outer, inner; - stage.split(parent, factor, &outer, &inner); - return Array({outer, inner}); -}); - -TVM_REGISTER_GLOBAL("te.StageSplitByNParts") -.set_body_typed([](Stage stage, IterVar parent, PrimExpr nparts) { - IterVar outer, inner; - stage.split_by_nparts(parent, nparts, &outer, &inner); - return Array({outer, inner}); -}); - -TVM_REGISTER_GLOBAL("te.StageFuse") -.set_body_typed([](Stage stage, Array axes) { - IterVar fused; - stage.fuse(axes, &fused); - return fused; - }); - -TVM_REGISTER_GLOBAL("te.StageComputeAt") -.set_body_method(&Stage::compute_at); - -TVM_REGISTER_GLOBAL("te.StageComputeInline") -.set_body_method(&Stage::compute_inline); - -TVM_REGISTER_GLOBAL("te.StageComputeRoot") -.set_body_method(&Stage::compute_root); - -TVM_REGISTER_GLOBAL("te.StageReorder") -.set_body_method(&Stage::reorder); - -TVM_REGISTER_GLOBAL("te.StageTile") -.set_body_typed([]( - Stage stage, - IterVar x_parent, IterVar y_parent, - PrimExpr x_factor, PrimExpr y_factor -) { - IterVar x_outer, y_outer, x_inner, y_inner; - stage.tile(x_parent, y_parent, - x_factor, y_factor, - &x_outer, &y_outer, - &x_inner, &y_inner); - return Array({x_outer, y_outer, x_inner, y_inner}); - }); - 
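For orientation, the `te.Stage*` and `te.Schedule*` globals being moved out of this file are the FFI endpoints behind TVM's Python schedule primitives. The following is a minimal Python-side sketch of what these registrations back, assuming a standard TVM build of this vintage; the tensor names and sizes are made up for illustration:

    import tvm
    from tvm import te

    # A trivial elementwise computation to schedule.
    n = 1024
    A = te.placeholder((n,), name="A", dtype="float32")
    B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

    s = te.create_schedule(B.op)                  # backed by "te.CreateSchedule"
    xo, xi = s[B].split(B.op.axis[0], factor=32)  # backed by "te.StageSplitByFactor"
    fused = s[B].fuse(xo, xi)                     # backed by "te.StageFuse"
    s[B].parallel(fused)                          # backed by "te.StageParallel"
    print(tvm.lower(s, [A, B], simple_mode=True))

Note that the patch only moves the registrations to new homes; the Python wrappers used above are unchanged by it.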
-TVM_REGISTER_GLOBAL("te.StageEnvThreads") -.set_body_method(&Stage::env_threads); - -TVM_REGISTER_GLOBAL("te.StageSetStorePredicate") -.set_body_method(&Stage::set_store_predicate); - -TVM_REGISTER_GLOBAL("te.StageUnroll") -.set_body_method(&Stage::unroll); - -TVM_REGISTER_GLOBAL("te.StageVectorize") -.set_body_method(&Stage::vectorize); - -TVM_REGISTER_GLOBAL("te.StageTensorize") -.set_body_method(&Stage::tensorize); - -TVM_REGISTER_GLOBAL("te.StageParallel") -.set_body_method(&Stage::parallel); - -TVM_REGISTER_GLOBAL("te.StagePragma") -.set_body_method(&Stage::pragma); - -TVM_REGISTER_GLOBAL("te.StagePrefetch") -.set_body_method(&Stage::prefetch); - -TVM_REGISTER_GLOBAL("te.StageStorageAlign") -.set_body_method(&Stage::storage_align); - -TVM_REGISTER_GLOBAL("te.StageDoubleBuffer") -.set_body_method(&Stage::double_buffer); - -TVM_REGISTER_GLOBAL("te.StageOpenGL") -.set_body_method(&Stage::opengl); - -TVM_REGISTER_GLOBAL("te.ScheduleNormalize") -.set_body_method(&Schedule::normalize); - -TVM_REGISTER_GLOBAL("te.ScheduleCreateGroup") -.set_body_method(&Schedule::create_group); - -TVM_REGISTER_GLOBAL("te.ScheduleCacheRead") -.set_body_method(&Schedule::cache_read); - -TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite") -.set_body([](TVMArgs args, TVMRetValue* ret) { - if (args[1].IsObjectRef()) { - *ret = args[0].operator Schedule() - .cache_write(args[1].operator Tensor(), args[2]); - } else { - *ret = args[0].operator Schedule() - .cache_write(args[1].operator Array(), args[2]); - } - }); - -TVM_REGISTER_GLOBAL("te.ScheduleRFactor") -.set_body_method(&Schedule::rfactor); -} // namespace te - -TVM_REGISTER_GLOBAL("te.CommReducerCombine") -.set_body_method(&tir::CommReducerNode::operator()); - -} // namespace tvm diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc deleted file mode 100644 index a53c6e99a999..000000000000 --- a/src/api/api_schedule.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Implementation of API functions related to schedule pass. 
- * \file api_schedule.cc - */ -#include -#include -#include -#include -#include - -#include "../te/schedule/graph.h" - -namespace tvm { -namespace te { - -TVM_REGISTER_GLOBAL("schedule.AutoInlineElemWise") -.set_body_typed(AutoInlineElemWise); - - -TVM_REGISTER_GLOBAL("schedule.AutoInlineInjective") -.set_body_typed(AutoInlineInjective); - -TVM_REGISTER_GLOBAL("schedule.ScheduleOps") -.set_body([](TVMArgs args, TVMRetValue* ret) { - if (args.size() == 2) - *ret = ScheduleOps(args[0], args[1], false); - else - *ret = ScheduleOps(args[0], args[1], args[2]); -}); - -#define REGISTER_SCHEDULE_PASS(PassName) \ - TVM_REGISTER_GLOBAL("schedule."#PassName) \ - .set_body_typed(PassName); \ - - -REGISTER_SCHEDULE_PASS(InferBound); -REGISTER_SCHEDULE_PASS(CreateReadGraph); -REGISTER_SCHEDULE_PASS(PostDFSOrder); -REGISTER_SCHEDULE_PASS(CreateAttachPath); -REGISTER_SCHEDULE_PASS(ScanGetBody); -REGISTER_SCHEDULE_PASS(ScanFixPointAnalysis); - -} // namespace te -} // namespace tvm diff --git a/src/arith/analyzer.cc b/src/arith/analyzer.cc index b12e5f51f4fb..9df5aa2d246d 100644 --- a/src/arith/analyzer.cc +++ b/src/arith/analyzer.cc @@ -20,6 +20,7 @@ /*! * \file tvm/arith/analyzer.cc */ +#include #include #include #include @@ -109,5 +110,64 @@ PrimExpr Analyzer::Simplify(const PrimExpr& expr) { return res; } +TVM_REGISTER_GLOBAL("arith.CreateAnalyzer") +.set_body([](TVMArgs args, TVMRetValue* ret) { + using runtime::PackedFunc; + using runtime::TypedPackedFunc; + auto self = std::make_shared(); + auto f = [self](std::string name) -> PackedFunc { + if (name == "const_int_bound") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->const_int_bound(args[0]); + }); + } else if (name == "modular_set") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->modular_set(args[0]); + }); + } else if (name == "const_int_bound_update") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + self->const_int_bound.Update(args[0], args[1], args[2]); + }); + } else if (name == "Simplify") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->Simplify(args[0]); + }); + } else if (name == "rewrite_simplify") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->rewrite_simplify(args[0]); + }); + } else if (name == "canonical_simplify") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->canonical_simplify(args[0]); + }); + } else if (name == "int_set") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->int_set(args[0], args[1]); + }); + } else if (name == "bind") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + if (args[1].IsObjectRef()) { + self->Bind(args[0], args[1].operator Range()); + } else { + self->Bind(args[0], args[1].operator PrimExpr()); + } + }); + } else if (name == "enter_constraint_context") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + // can't use make_shared due to noexcept(false) decl in destructor, + // see https://stackoverflow.com/a/43907314 + auto ctx = std::shared_ptr >( + new With(self.get(), args[0])); + auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { + ctx.reset(); + }; + *ret = PackedFunc(fexit); + }); + } + return PackedFunc(); + }; + *ret = TypedPackedFunc(f); +}); + } // namespace arith } // namespace tvm diff --git a/src/arith/bound_deducer.cc b/src/arith/bound_deducer.cc index df8f40230e04..26be5d51115f 100644 --- a/src/arith/bound_deducer.cc +++ b/src/arith/bound_deducer.cc @@ -21,11 +21,11 @@ 
* \file bound_deducer.cc * \brief Utility to deduce bound of expression */ +#include #include #include #include #include -#include #include #include @@ -362,5 +362,16 @@ IntSet DeduceBound(PrimExpr v, PrimExpr e, return DeduceBound(v, e, hmap, rmap); } + +TVM_REGISTER_GLOBAL("arith.DeduceBound") +.set_body_typed([]( + PrimExpr v, PrimExpr cond, + const Map hint_map, + const Map relax_map +) { + return DeduceBound(v, cond, hint_map, relax_map); +}); + + } // namespace arith } // namespace tvm diff --git a/src/arith/const_int_bound.cc b/src/arith/const_int_bound.cc index 7fb90a5e87c1..9ef5723e153e 100644 --- a/src/arith/const_int_bound.cc +++ b/src/arith/const_int_bound.cc @@ -20,6 +20,7 @@ /*! * \file tvm/arith/const_int_bound.cc */ +#include #include #include #include @@ -41,6 +42,13 @@ ConstIntBound::ConstIntBound( data_ = std::move(node); } +ConstIntBound MakeConstIntBound(int64_t min_value, int64_t max_value) { + return ConstIntBound(min_value, max_value); +} + +TVM_REGISTER_GLOBAL("arith.ConstIntBound") +.set_body_typed(MakeConstIntBound); + inline void PrintBoundValue(std::ostream& os, int64_t val) { if (val == ConstIntBound::kPosInf) { os << "pos_inf"; diff --git a/src/arith/detect_linear_equation.cc b/src/arith/detect_linear_equation.cc index 53adf35eb6ee..cc9c745a24b8 100644 --- a/src/arith/detect_linear_equation.cc +++ b/src/arith/detect_linear_equation.cc @@ -21,6 +21,7 @@ * \file detect_linear_equation.cc * \brief Utility to detect patterns in the expression. */ +#include #include #include #include @@ -268,6 +269,12 @@ Array DetectClipBound(const PrimExpr& e, const Array& vars) { return ret; } +TVM_REGISTER_GLOBAL("arith.DetectLinearEquation") +.set_body_typed(DetectLinearEquation); +TVM_REGISTER_GLOBAL("arith.DetectClipBound") +.set_body_typed([](const PrimExpr& e, const Array& vars) { + return DetectClipBound(e, vars); +}); } // namespace arith } // namespace tvm diff --git a/src/arith/domain_touched.cc b/src/arith/domain_touched.cc index aa1ba4eb67be..4eecabdb6d8c 100644 --- a/src/arith/domain_touched.cc +++ b/src/arith/domain_touched.cc @@ -119,5 +119,8 @@ Domain DomainTouched(Stmt stmt, return FuncTouchedDomain(tensor, consider_calls, consider_provides).Find(stmt); } +TVM_REGISTER_GLOBAL("arith.DomainTouched") +.set_body_typed(DomainTouched); + } // namespace arith } // namespace tvm diff --git a/src/arith/int_set.cc b/src/arith/int_set.cc index adb38799fdf2..8c5afb1be8b5 100644 --- a/src/arith/int_set.cc +++ b/src/arith/int_set.cc @@ -820,5 +820,28 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) << "[" << op->min_value << ", " << op->max_value << ']'; }); + + +TVM_REGISTER_GLOBAL("arith.intset_single_point") +.set_body_typed(IntSet::single_point); + +TVM_REGISTER_GLOBAL("arith.intset_vector") +.set_body_typed(IntSet::vector); + +TVM_REGISTER_GLOBAL("arith.intset_interval") +.set_body_typed(IntSet::interval); + +TVM_REGISTER_GLOBAL("arith.IntervalSetGetMin") +.set_body_method(&IntSet::min); + +TVM_REGISTER_GLOBAL("arith.IntervalSetGetMax") +.set_body_method(&IntSet::max); + +TVM_REGISTER_GLOBAL("arith.IntSetIsNothing") +.set_body_method(&IntSet::is_nothing); + +TVM_REGISTER_GLOBAL("arith.IntSetIsEverything") +.set_body_method(&IntSet::is_everything); + } // namespace arith } // namespace tvm diff --git a/src/arith/modular_set.cc b/src/arith/modular_set.cc index c3031ca0edfc..40cd7f8793ee 100644 --- a/src/arith/modular_set.cc +++ b/src/arith/modular_set.cc @@ -21,6 +21,7 @@ * \file modular_set.cc * \brief Modular set analysis */ +#include #include #include #include @@ -52,6 
+53,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) << op->base << ')'; }); +ModularSet MakeModularSet(int64_t coeff, int64_t base) { + return ModularSet(coeff, base); +} + +TVM_REGISTER_GLOBAL("arith.ModularSet") +.set_body_typed(MakeModularSet); // internal entry for const int bound struct ModularSetAnalyzer::Entry { diff --git a/src/ir/expr.cc b/src/ir/expr.cc index 4feabeb8e505..6244c7645acc 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -134,6 +134,14 @@ Range Range::make_by_min_extent(PrimExpr min, PrimExpr extent) { return Range(make_object(min, extent)); } +TVM_REGISTER_GLOBAL("ir.range_by_min_extent") +.set_body_typed(Range::make_by_min_extent); + +TVM_REGISTER_GLOBAL("ir.Range") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = Range(args[0], args[1]); + }); + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); diff --git a/src/api/api_test.cc b/src/support/ffi_testing.cc similarity index 97% rename from src/api/api_test.cc rename to src/support/ffi_testing.cc index 2a1e60539bdf..9053f6298999 100644 --- a/src/api/api_test.cc +++ b/src/support/ffi_testing.cc @@ -18,13 +18,13 @@ */ /*! - * Code mainly used for test purposes. - * \file api_test.cc + * FFI registration code used for frontend testing purposes. + * \file ffi_testing.cc */ +#include #include #include #include -#include #include namespace tvm { diff --git a/src/te/operation/compute_op.cc b/src/te/operation/compute_op.cc index 1886d976555b..6123c613d0bd 100644 --- a/src/te/operation/compute_op.cc +++ b/src/te/operation/compute_op.cc @@ -21,6 +21,7 @@ * \brief Compute Op. * \file compute_op.cc */ +#include #include #include #include @@ -156,6 +157,10 @@ Operation ComputeOpNode::make(std::string name, return Operation(n); } +TVM_REGISTER_GLOBAL("te.ComputeOp") +.set_body_typed(ComputeOpNode::make); + + // The schedule related logics Array ComputeOpNode::InputTensors() const { Array ret; diff --git a/src/te/operation/extern_op.cc b/src/te/operation/extern_op.cc index c1e55046102b..62c8dfd30d49 100644 --- a/src/te/operation/extern_op.cc +++ b/src/te/operation/extern_op.cc @@ -21,6 +21,7 @@ * \brief External computation rule. * \file extern_op.cc */ +#include #include #include #include @@ -86,6 +87,10 @@ Operation ExternOpNode::make(std::string name, return Operation(n); } +TVM_REGISTER_GLOBAL("te.ExternOp") +.set_body_typed(ExternOpNode::make); + + Array ExternOpNode::InputTensors() const { return inputs; } diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc index bb883ae47004..70abf34523b9 100644 --- a/src/te/operation/hybrid_op.cc +++ b/src/te/operation/hybrid_op.cc @@ -21,6 +21,7 @@ * \brief Hybrid computation rule. * \file hybrid_op.cc */ +#include #include #include #include @@ -83,6 +84,10 @@ Operation HybridOpNode::make(std::string name, return res; } +TVM_REGISTER_GLOBAL("te.HybridOp") +.set_body_typed(HybridOpNode::make); + + Array HybridOpNode::InputTensors() const { // Because input tensors could be potentially inlined into hybrid scripts, // we need to check if all input tensors are used in the body. diff --git a/src/te/operation/placeholder_op.cc b/src/te/operation/placeholder_op.cc index 866ef949cf49..d48be4c53668 100644 --- a/src/te/operation/placeholder_op.cc +++ b/src/te/operation/placeholder_op.cc @@ -21,6 +21,7 @@ * \brief Placeholder op. 
* \file placeholder_op.cc */ +#include #include namespace tvm { @@ -67,6 +68,11 @@ Tensor placeholder(Array shape, DataType dtype, std::string name) { return PlaceholderOpNode::make(name, shape, dtype).output(0); } +TVM_REGISTER_GLOBAL("te.Placeholder") +.set_body_typed([](Array shape, DataType dtype, std::string name) { + return placeholder(shape, dtype, name); +}); + Array PlaceholderOpNode::InputTensors() const { return {}; } diff --git a/src/te/operation/scan_op.cc b/src/te/operation/scan_op.cc index cacfd8c4a4f1..956a297f5b3c 100644 --- a/src/te/operation/scan_op.cc +++ b/src/te/operation/scan_op.cc @@ -21,6 +21,7 @@ * \brief Scan Operator. * \file scan_op.cc */ +#include #include #include #include @@ -120,6 +121,10 @@ Operation ScanOpNode::make(std::string name, return Operation(n); } +TVM_REGISTER_GLOBAL("te.ScanOp") +.set_body_typed(ScanOpNode::make); + + Array scan(Array init, Array update, Array state_placeholder, diff --git a/src/te/operation/tensor_compute_op.cc b/src/te/operation/tensor_compute_op.cc index 8ce621ccc55b..4cdc9e1f8d32 100644 --- a/src/te/operation/tensor_compute_op.cc +++ b/src/te/operation/tensor_compute_op.cc @@ -21,6 +21,7 @@ * \brief Tensor Compute Op. * \file tensor_compute_op.cc */ +#include #include #include #include @@ -72,6 +73,10 @@ Operation TensorComputeOpNode::make(std::string name, return Operation(n); } +TVM_REGISTER_GLOBAL("te.TensorComputeOp") +.set_body_typed(TensorComputeOpNode::make); + + Array TensorComputeOpNode::InputTensors() const { return inputs; } diff --git a/src/te/schedule/auto_inline_elem_wise.cc b/src/te/schedule/auto_inline_elem_wise.cc index 3a2226780f20..6d79f4a8d1d6 100644 --- a/src/te/schedule/auto_inline_elem_wise.cc +++ b/src/te/schedule/auto_inline_elem_wise.cc @@ -20,6 +20,7 @@ /*! * \file auto_inline_elem_wise.cc */ +#include #include #include #include @@ -111,5 +112,12 @@ void AutoInlineInjective(Schedule sch) { } } +TVM_REGISTER_GLOBAL("schedule.AutoInlineElemWise") +.set_body_typed(AutoInlineElemWise); + + +TVM_REGISTER_GLOBAL("schedule.AutoInlineInjective") +.set_body_typed(AutoInlineInjective); + } // namespace te } // namespace tvm diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 27896e6738a8..50cbafd2b654 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -21,6 +21,7 @@ * \file bound.cc * \brief The bound inference logic. */ +#include #include #include #include @@ -259,5 +260,8 @@ Map InferBound(const Schedule& sch) { return Map(ret.begin(), ret.end()); } +TVM_REGISTER_GLOBAL("schedule.InferBound") +.set_body_typed(InferBound); + } // namespace te } // namespace tvm diff --git a/src/te/schedule/graph.cc b/src/te/schedule/graph.cc index eff0a25c569a..9dce36f220ef 100644 --- a/src/te/schedule/graph.cc +++ b/src/te/schedule/graph.cc @@ -21,6 +21,7 @@ * \file graph.cc * \brief Utilities to get information about schedule graph. 
*/ +#include #include #include #include @@ -429,5 +430,24 @@ Map ScanFixPointAnalysis(const Operation& scan_op) { return ret; } + +TVM_REGISTER_GLOBAL("schedule.CreateReadGraph") +.set_body_typed(CreateReadGraph); + +TVM_REGISTER_GLOBAL("schedule.PostDFSOrder") +.set_body_typed([](const Array& roots, + const ReadGraph& g) { + return PostDFSOrder(roots, g); +}); + +TVM_REGISTER_GLOBAL("schedule.CreateAttachPath") +.set_body_typed(CreateAttachPath); + +TVM_REGISTER_GLOBAL("schedule.ScanGetBody") +.set_body_typed(ScanGetBody); + +TVM_REGISTER_GLOBAL("schedule.ScanFixPointAnalysis") +.set_body_typed(ScanFixPointAnalysis); + } // namespace te } // namespace tvm diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index 1763bd64c15f..d3b448d37790 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -20,6 +20,7 @@ /*! * \file schedule_lang.cc */ +#include #include #include #include @@ -848,5 +849,118 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) auto* op = static_cast(node.get()); p->stream << "schedule(" << op << ")"; }); + + +TVM_REGISTER_GLOBAL("te.CreateSchedule") +.set_body_typed(create_schedule); + +TVM_REGISTER_GLOBAL("te.StageSetScope") +.set_body_method(&Stage::set_scope); + +TVM_REGISTER_GLOBAL("te.StageBind") +.set_body_method(&Stage::bind); + +TVM_REGISTER_GLOBAL("te.StageSplitByFactor") +.set_body_typed([](Stage stage, IterVar parent, PrimExpr factor) { + IterVar outer, inner; + stage.split(parent, factor, &outer, &inner); + return Array({outer, inner}); +}); + +TVM_REGISTER_GLOBAL("te.StageSplitByNParts") +.set_body_typed([](Stage stage, IterVar parent, PrimExpr nparts) { + IterVar outer, inner; + stage.split_by_nparts(parent, nparts, &outer, &inner); + return Array({outer, inner}); +}); + +TVM_REGISTER_GLOBAL("te.StageFuse") +.set_body_typed([](Stage stage, Array axes) { + IterVar fused; + stage.fuse(axes, &fused); + return fused; + }); + +TVM_REGISTER_GLOBAL("te.StageComputeAt") +.set_body_method(&Stage::compute_at); + +TVM_REGISTER_GLOBAL("te.StageComputeInline") +.set_body_method(&Stage::compute_inline); + +TVM_REGISTER_GLOBAL("te.StageComputeRoot") +.set_body_method(&Stage::compute_root); + +TVM_REGISTER_GLOBAL("te.StageReorder") +.set_body_method(&Stage::reorder); + +TVM_REGISTER_GLOBAL("te.StageTile") +.set_body_typed([]( + Stage stage, + IterVar x_parent, IterVar y_parent, + PrimExpr x_factor, PrimExpr y_factor +) { + IterVar x_outer, y_outer, x_inner, y_inner; + stage.tile(x_parent, y_parent, + x_factor, y_factor, + &x_outer, &y_outer, + &x_inner, &y_inner); + return Array({x_outer, y_outer, x_inner, y_inner}); + }); + +TVM_REGISTER_GLOBAL("te.StageEnvThreads") +.set_body_method(&Stage::env_threads); + +TVM_REGISTER_GLOBAL("te.StageSetStorePredicate") +.set_body_method(&Stage::set_store_predicate); + +TVM_REGISTER_GLOBAL("te.StageUnroll") +.set_body_method(&Stage::unroll); + +TVM_REGISTER_GLOBAL("te.StageVectorize") +.set_body_method(&Stage::vectorize); + +TVM_REGISTER_GLOBAL("te.StageTensorize") +.set_body_method(&Stage::tensorize); + +TVM_REGISTER_GLOBAL("te.StageParallel") +.set_body_method(&Stage::parallel); + +TVM_REGISTER_GLOBAL("te.StagePragma") +.set_body_method(&Stage::pragma); + +TVM_REGISTER_GLOBAL("te.StagePrefetch") +.set_body_method(&Stage::prefetch); + +TVM_REGISTER_GLOBAL("te.StageStorageAlign") +.set_body_method(&Stage::storage_align); + +TVM_REGISTER_GLOBAL("te.StageDoubleBuffer") +.set_body_method(&Stage::double_buffer); + +TVM_REGISTER_GLOBAL("te.StageOpenGL") 
+.set_body_method(&Stage::opengl);
+
+TVM_REGISTER_GLOBAL("te.ScheduleNormalize")
+.set_body_method(&Schedule::normalize);
+
+TVM_REGISTER_GLOBAL("te.ScheduleCreateGroup")
+.set_body_method(&Schedule::create_group);
+
+TVM_REGISTER_GLOBAL("te.ScheduleCacheRead")
+.set_body_method(&Schedule::cache_read);
+
+TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    if (args[1].IsObjectRef<Tensor>()) {
+      *ret = args[0].operator Schedule()
+          .cache_write(args[1].operator Tensor(), args[2]);
+    } else {
+      *ret = args[0].operator Schedule()
+          .cache_write(args[1].operator Array<Tensor>(), args[2]);
+    }
+  });
+
+TVM_REGISTER_GLOBAL("te.ScheduleRFactor")
+.set_body_method(&Schedule::rfactor);
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/schedule/schedule_ops.cc b/src/te/schedule/schedule_ops.cc
index 0930f26372c4..a110bc458fe9 100644
--- a/src/te/schedule/schedule_ops.cc
+++ b/src/te/schedule/schedule_ops.cc
@@ -20,6 +20,7 @@
 /*!
  * \file schedule_ops.cc
  */
+#include
 #include
 #include
 #include
@@ -423,5 +424,13 @@ Stmt ScheduleOps(
   return post_proc(std::move(body));
 }
 
+TVM_REGISTER_GLOBAL("schedule.ScheduleOps")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  if (args.size() == 2)
+    *ret = ScheduleOps(args[0], args[1], false);
+  else
+    *ret = ScheduleOps(args[0], args[1], args[2]);
+});
+
 }  // namespace te
 }  // namespace tvm
diff --git a/src/te/tensor.cc b/src/te/tensor.cc
index f200514468cb..cb14f6a35270 100644
--- a/src/te/tensor.cc
+++ b/src/te/tensor.cc
@@ -20,6 +20,7 @@
 /*!
  * \file tensor.cc
  */
+#include
 #include
 #include
 #include
@@ -147,5 +148,33 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
 
 TVM_REGISTER_NODE_TYPE(TensorIntrinCallNode);
 
+TVM_REGISTER_GLOBAL("te.Tensor")
+.set_body_typed(TensorNode::make);
+
+TVM_REGISTER_GLOBAL("te.TensorIntrin")
+.set_body_typed(TensorIntrinNode::make);
+
+TVM_REGISTER_GLOBAL("te.TensorIntrinCall")
+.set_body_typed(TensorIntrinCallNode::make);
+
+TVM_REGISTER_GLOBAL("te.TensorEqual")
+.set_body_method(&Tensor::operator==);
+
+TVM_REGISTER_GLOBAL("te.TensorHash")
+.set_body_typed([](Tensor tensor) -> int64_t {
+    return static_cast<int64_t>(std::hash<Tensor>()(tensor));
+  });
+
+TVM_REGISTER_GLOBAL("te.OpGetOutput")
+.set_body_typed([](Operation op, int64_t output) {
+  return op.output(static_cast<size_t>(output));
+});
+
+TVM_REGISTER_GLOBAL("te.OpNumOutputs")
+.set_body_method(&OperationNode::num_outputs);
+
+TVM_REGISTER_GLOBAL("te.OpInputTensors")
+.set_body_method(&OperationNode::InputTensors);
+
 }  // namespace te
 }  // namespace tvm
diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc
index d06c33f79dcc..22844745982f 100644
--- a/src/tir/ir/expr.cc
+++ b/src/tir/ir/expr.cc
@@ -20,6 +20,7 @@
 /*!
* \file expr.cc */ +#include #include #include #include @@ -45,6 +46,17 @@ SizeVar::SizeVar(std::string name_hint, DataType t) SizeVarNode::SizeVarNode(DataType t, std::string name_hint) : VarNode(t, std::move(name_hint)) {} + +TVM_REGISTER_GLOBAL("tir.Var") +.set_body_typed([](std::string s, DataType t) { + return Var(s, t); + }); + +TVM_REGISTER_GLOBAL("tir.SizeVar") +.set_body_typed([](std::string s, DataType t) { + return SizeVar(s, t); + }); + IterVar IterVarNode::make(Range dom, Var var, IterVarType t, @@ -57,6 +69,14 @@ IterVar IterVarNode::make(Range dom, return IterVar(n); } +TVM_REGISTER_GLOBAL("tir.IterVar") +.set_body_typed([](Range dom, Var var, int iter_type, std::string thread_tag) { + return IterVarNode::make( + dom, var, + static_cast(iter_type), + thread_tag); +}); + TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); @@ -83,6 +103,9 @@ PrimExpr StringImmNode::make(std::string value) { return PrimExpr(node); } +TVM_REGISTER_GLOBAL("tir.StringImm") +.set_body_typed(StringImmNode::make); + PrimExpr CastNode::make(DataType t, PrimExpr value) { CHECK(value.defined()); CHECK_EQ(t.lanes(), value.dtype().lanes()); @@ -311,6 +334,13 @@ Array CommReducerNode::operator()(Array a, Array b }); } +TVM_REGISTER_GLOBAL("tir.CommReducer") +.set_body_typed(CommReducerNode::make); + +TVM_REGISTER_GLOBAL("tir.CommReducerCombine") +.set_body_method(&tir::CommReducerNode::operator()); + + PrimExpr ReduceNode::make(CommReducer combiner, Array source, Array axis, PrimExpr condition, int value_index) { for (size_t i = 0; i < axis.size(); ++i) { @@ -334,6 +364,11 @@ PrimExpr ReduceNode::make(CommReducer combiner, Array source, return PrimExpr(n); } + +TVM_REGISTER_GLOBAL("tir.Reduce") +.set_body_typed(ReduceNode::make); + + PrimExpr AnyNode::make() { auto n = make_object(); return PrimExpr(n); @@ -659,5 +694,104 @@ TVM_REGISTER_NODE_TYPE(CommReducerNode); TVM_REGISTER_NODE_TYPE(ReduceNode); TVM_REGISTER_NODE_TYPE(AnyNode); + +TVM_REGISTER_GLOBAL("tir.Add") +.set_body_typed(AddNode::make); + +TVM_REGISTER_GLOBAL("tir.Sub") +.set_body_typed(SubNode::make); + +TVM_REGISTER_GLOBAL("tir.Mul") +.set_body_typed(MulNode::make); + +TVM_REGISTER_GLOBAL("tir.Div") +.set_body_typed(DivNode::make); + +TVM_REGISTER_GLOBAL("tir.Mod") +.set_body_typed(ModNode::make); + +TVM_REGISTER_GLOBAL("tir.FloorDiv") +.set_body_typed(FloorDivNode::make); + +TVM_REGISTER_GLOBAL("tir.FloorMod") +.set_body_typed(FloorModNode::make); + +TVM_REGISTER_GLOBAL("tir.Min") +.set_body_typed(MinNode::make); + +TVM_REGISTER_GLOBAL("tir.Max") +.set_body_typed(MaxNode::make); + +TVM_REGISTER_GLOBAL("tir.EQ") +.set_body_typed(EQNode::make); + +TVM_REGISTER_GLOBAL("tir.NE") +.set_body_typed(NENode::make); + +TVM_REGISTER_GLOBAL("tir.LT") +.set_body_typed(LTNode::make); + +TVM_REGISTER_GLOBAL("tir.LE") +.set_body_typed(LENode::make); + +TVM_REGISTER_GLOBAL("tir.GT") +.set_body_typed(GTNode::make); + +TVM_REGISTER_GLOBAL("tir.GE") +.set_body_typed(GENode::make); + +TVM_REGISTER_GLOBAL("tir.And") +.set_body_typed(AndNode::make); + +TVM_REGISTER_GLOBAL("tir.Or") +.set_body_typed(OrNode::make); + +TVM_REGISTER_GLOBAL("tir.Not") +.set_body_typed(NotNode::make); + +TVM_REGISTER_GLOBAL("tir.Select") +.set_body_typed(SelectNode::make); + +TVM_REGISTER_GLOBAL("tir.Ramp") +.set_body_typed(RampNode::make); + +TVM_REGISTER_GLOBAL("tir.Cast") +.set_body_typed(CastNode::make); + +TVM_REGISTER_GLOBAL("tir.Broadcast") +.set_body_typed(BroadcastNode::make); + 
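For orientation, once these `tir.*` constructors are registered beside their node definitions, the Python wrappers reach them through the prefix-based loader simplified at the top of this patch. A small sketch of the resulting user-facing behaviour, assuming a standard TVM build (the variable names are illustrative):

    import tvm
    from tvm import tir

    x = tir.Var("x", "int32")  # resolves to the "tir.Var" global registered above
    y = tir.Var("y", "int32")
    expr = (x + y) * 2         # operator overloads construct Add/Mul nodes via tir.* globals
    print(expr)

    # The analyzer global moved into analyzer.cc earlier in this patch
    # is reachable the same way from tvm.arith.
    ana = tvm.arith.Analyzer()
    print(ana.simplify(expr - expr))  # simplifies to 0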
+TVM_REGISTER_GLOBAL("tir.Shuffle") +.set_body_typed(ShuffleNode::make); + +TVM_REGISTER_GLOBAL("tir.Let") +.set_body_typed(LetNode::make); + +TVM_REGISTER_GLOBAL("tir.Load") +.set_body([](TVMArgs args, TVMRetValue *ret) { + DataType t = args[0]; + if (args.size() == 3) { + *ret = LoadNode::make(t, args[1], args[2], const_true(t.lanes())); + } else { + *ret = LoadNode::make(t, args[1], args[2], args[3]); + } + }); + + + +TVM_REGISTER_GLOBAL("tir.Call") +.set_body_typed([]( + DataType type, std::string name, + Array args, int call_type, + FunctionRef func, int value_index +) { + return CallNode::make(type, + name, + args, + static_cast(call_type), + func, + value_index); +}); + } // namespace tir } // namespace tvm diff --git a/src/tir/ir/op.cc b/src/tir/ir/op.cc index 58f8b6b76da8..452c3bbc68a2 100644 --- a/src/tir/ir/op.cc +++ b/src/tir/ir/op.cc @@ -662,4 +662,90 @@ TVM_REGISTER_GLOBAL("node.LargeUIntImm") TVM_REGISTER_GLOBAL("node.String") .set_body_typed(tir::StringImmNode::make); +TVM_REGISTER_GLOBAL("tir.min_value") +.set_body_typed(min_value); + +TVM_REGISTER_GLOBAL("tir.max_value") +.set_body_typed(max_value); + +TVM_REGISTER_GLOBAL("tir.abs") +.set_body_typed(tvm::abs); + +TVM_REGISTER_GLOBAL("tir.isnan") +.set_body_typed(tvm::isnan); + +TVM_REGISTER_GLOBAL("tir.floor") +.set_body_typed(tvm::floor); + +TVM_REGISTER_GLOBAL("tir.ceil") +.set_body_typed(tvm::ceil); + +TVM_REGISTER_GLOBAL("tir.round") +.set_body_typed(tvm::round); + +TVM_REGISTER_GLOBAL("tir.nearbyint") +.set_body_typed(tvm::nearbyint); + +TVM_REGISTER_GLOBAL("tir.trunc") +.set_body_typed(tvm::trunc); + +TVM_REGISTER_GLOBAL("tir._cast") +.set_body_typed(tvm::cast); + + + +// operator overloading, smarter than make +#define REGISTER_MAKE_BINARY_OP(Node, Func) \ + TVM_REGISTER_GLOBAL("tir."#Node) \ + .set_body_typed([](PrimExpr a, PrimExpr b) { \ + return (Func(a, b)); \ + }) + +#define REGISTER_MAKE_BIT_OP(Node, Func) \ + TVM_REGISTER_GLOBAL("tir."#Node) \ + .set_body([](TVMArgs args, TVMRetValue *ret) { \ + bool lhs_is_int = args[0].type_code() == kDLInt; \ + bool rhs_is_int = args[1].type_code() == kDLInt; \ + if (lhs_is_int) { \ + *ret = (Func(args[0].operator int(), args[1].operator PrimExpr())); \ + } else if (rhs_is_int) { \ + *ret = (Func(args[0].operator PrimExpr(), args[1].operator int())); \ + } else { \ + *ret = (Func(args[0].operator PrimExpr(), args[1].operator PrimExpr())); \ + } \ + }) + + +REGISTER_MAKE_BINARY_OP(_OpAdd, operator+); +REGISTER_MAKE_BINARY_OP(_OpSub, operator-); +REGISTER_MAKE_BINARY_OP(_OpMul, operator*); +REGISTER_MAKE_BINARY_OP(_OpDiv, div); +REGISTER_MAKE_BINARY_OP(_OpMod, truncmod); +REGISTER_MAKE_BINARY_OP(_OpIndexDiv, indexdiv); +REGISTER_MAKE_BINARY_OP(_OpIndexMod, indexmod); +REGISTER_MAKE_BINARY_OP(_OpFloorDiv, floordiv); +REGISTER_MAKE_BINARY_OP(_OpFloorMod, floormod); +REGISTER_MAKE_BINARY_OP(_OpTruncDiv, truncdiv); +REGISTER_MAKE_BINARY_OP(_OpTruncMod, truncmod); +REGISTER_MAKE_BINARY_OP(_OpPow, pow); +REGISTER_MAKE_BINARY_OP(_OpMin, min); +REGISTER_MAKE_BINARY_OP(_OpMax, max); +REGISTER_MAKE_BINARY_OP(_OpEQ, operator==); +REGISTER_MAKE_BINARY_OP(_OpNE, operator!=); +REGISTER_MAKE_BINARY_OP(_OpLT, operator<); // NOLINT(*) +REGISTER_MAKE_BINARY_OP(_OpLE, operator<=); // NOLINT(*) +REGISTER_MAKE_BINARY_OP(_OpGT, operator>); // NOLINT(*) +REGISTER_MAKE_BINARY_OP(_OpGE, operator>=); +REGISTER_MAKE_BINARY_OP(_OpAnd, operator&&); +REGISTER_MAKE_BINARY_OP(_OpOr, operator||); +REGISTER_MAKE_BIT_OP(bitwise_and, operator&); +REGISTER_MAKE_BIT_OP(bitwise_or, operator|); 
+REGISTER_MAKE_BIT_OP(bitwise_xor, operator^); +REGISTER_MAKE_BIT_OP(left_shift, operator<<); // NOLINT(*) +REGISTER_MAKE_BIT_OP(right_shift, operator>>); + +TVM_REGISTER_GLOBAL("tir._OpIfThenElse") +.set_body_typed([] (PrimExpr cond, PrimExpr true_value, PrimExpr false_value) { + return if_then_else(cond, true_value, false_value); +}); } // namespace tvm diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 0cd2aba319ee..a8fe9cd2bad3 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -20,7 +20,7 @@ /*! * \file tvm/tir/stmt.cc */ - +#include #include #include #include "../pass/ir_util.h" @@ -40,6 +40,9 @@ Stmt LetStmtNode::make(Var var, PrimExpr value, Stmt body) { return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.LetStmt") +.set_body_typed(LetStmtNode::make); + Stmt AttrStmtNode::make(ObjectRef node, std::string attr_key, PrimExpr value, @@ -52,6 +55,10 @@ Stmt AttrStmtNode::make(ObjectRef node, return Stmt(n); } +TVM_REGISTER_GLOBAL("tir.AttrStmt") +.set_body_typed(AttrStmtNode::make); + + Stmt AssertStmtNode::make(PrimExpr condition, PrimExpr message, Stmt body) { CHECK(condition.defined()); CHECK(message.dtype() == DataType::Int(32) || @@ -66,6 +73,10 @@ Stmt AssertStmtNode::make(PrimExpr condition, PrimExpr message, Stmt body) { return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.AssertStmt") +.set_body_typed(AssertStmtNode::make); + + Stmt ProducerConsumerNode::make(FunctionRef func, bool is_producer, Stmt body) { CHECK(body.defined()); @@ -76,6 +87,10 @@ Stmt ProducerConsumerNode::make(FunctionRef func, bool is_producer, Stmt body) { return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.ProducerConsumer") +.set_body_typed(ProducerConsumerNode::make); + + Stmt ForNode::make(Var loop_var, PrimExpr min, PrimExpr extent, @@ -99,6 +114,19 @@ Stmt ForNode::make(Var loop_var, return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.For") +.set_body_typed([]( + Var loop_var, PrimExpr min, PrimExpr extent, + int for_type, int device_api, Stmt body) { + return ForNode::make(loop_var, + min, + extent, + static_cast(for_type), + static_cast(device_api), + body); +}); + + Stmt StoreNode::make(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate) { CHECK(value.defined()); CHECK(index.defined()); @@ -114,6 +142,18 @@ Stmt StoreNode::make(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr pr return Stmt(node); } + +TVM_REGISTER_GLOBAL("tir.Store") +.set_body([](TVMArgs args, TVMRetValue *ret) { + PrimExpr value = args[1]; + if (args.size() == 3) { + *ret = StoreNode::make(args[0], value, args[2], const_true(value.dtype().lanes())); + } else { + *ret = StoreNode::make(args[0], value, args[2], args[3]); + } + }); + + Stmt ProvideNode::make(FunctionRef func, int value_index, PrimExpr value, Array args) { CHECK(value_index >=0 && value_index < func->num_outputs()) << "value index output function return value bound"; @@ -131,6 +171,10 @@ Stmt ProvideNode::make(FunctionRef func, int value_index, PrimExpr value, Array< return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.Provide") +.set_body_typed(ProvideNode::make); + + Stmt AllocateNode::make(Var buffer_var, DataType dtype, Array extents, @@ -157,6 +201,15 @@ Stmt AllocateNode::make(Var buffer_var, return Stmt(node); } +// overloaded, needs special handling +// has default args +TVM_REGISTER_GLOBAL("tir.Allocate") +.set_body_typed([]( + Var buffer_var, DataType type, Array extents, PrimExpr condition, Stmt body + ){ + return AllocateNode::make(buffer_var, type, extents, condition, body); +}); + int32_t AllocateNode::constant_allocation_size(const 
Array& extents) { int64_t result = 1; for (size_t i = 0; i < extents.size(); ++i) { @@ -178,12 +231,16 @@ Stmt FreeNode::make(Var buffer_var) { return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.Free") +.set_body_typed(FreeNode::make); + + Stmt RealizeNode::make(FunctionRef func, - int value_index, - DataType dtype, - Region bounds, - PrimExpr condition, - Stmt body) { + int value_index, + DataType dtype, + Region bounds, + PrimExpr condition, + Stmt body) { for (size_t i = 0; i < bounds.size(); ++i) { CHECK(bounds[i]->min.defined()); CHECK(bounds[i]->extent.defined()); @@ -204,6 +261,11 @@ Stmt RealizeNode::make(FunctionRef func, return Stmt(node); } + +TVM_REGISTER_GLOBAL("tir.Realize") +.set_body_typed(RealizeNode::make); + + Stmt PrefetchNode::make(FunctionRef func, int value_index, DataType dtype, Region bounds) { for (size_t i = 0; i < bounds.size(); ++i) { CHECK(bounds[i]->min.defined()); @@ -220,12 +282,21 @@ Stmt PrefetchNode::make(FunctionRef func, int value_index, DataType dtype, Regio return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.Prefetch") +.set_body_typed(PrefetchNode::make); + + SeqStmt::SeqStmt(Array seq) { auto node = make_object(); node->seq = std::move(seq); data_ = std::move(node); } +TVM_REGISTER_GLOBAL("tir.SeqStmt") +.set_body_typed([](Array seq) { + return SeqStmt(std::move(seq)); +}); + Stmt IfThenElseNode::make(PrimExpr condition, Stmt then_case, Stmt else_case) { CHECK(condition.defined()); CHECK(then_case.defined()); @@ -238,6 +309,10 @@ Stmt IfThenElseNode::make(PrimExpr condition, Stmt then_case, Stmt else_case) { return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.IfThenElse") +.set_body_typed(IfThenElseNode::make); + + Stmt EvaluateNode::make(PrimExpr value) { CHECK(value.defined()); @@ -246,6 +321,9 @@ Stmt EvaluateNode::make(PrimExpr value) { return Stmt(node); } +TVM_REGISTER_GLOBAL("tir.Evaluate") +.set_body_typed(EvaluateNode::make); + // Printers TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) diff --git a/src/api/api_pass.cc b/src/tir/pass/ffi_api.cc similarity index 99% rename from src/api/api_pass.cc rename to src/tir/pass/ffi_api.cc index 75d5439b7f1b..233bfa51d614 100644 --- a/src/api/api_pass.cc +++ b/src/tir/pass/ffi_api.cc @@ -19,7 +19,7 @@ /*! * Exposure of pass functions. 
- * \file api_pass.cc + * \file ffi_api.cc */ #include #include @@ -136,8 +136,8 @@ TVM_REGISTER_GLOBAL("ir_pass.LowerStorageAccess") // make from two arguments #define REGISTER_PASS(PassName) \ - TVM_REGISTER_GLOBAL("ir_pass."#PassName) \ - .set_body_typed(PassName); \ + TVM_REGISTER_GLOBAL("ir_pass."#PassName) \ + .set_body_typed(PassName); \ REGISTER_PASS(ConvertSSA); diff --git a/tests/python/unittest/test_runtime_error.py b/tests/python/unittest/test_runtime_error.py index d1a2d983ff25..ac019a0aab40 100644 --- a/tests/python/unittest/test_runtime_error.py +++ b/tests/python/unittest/test_runtime_error.py @@ -27,7 +27,7 @@ def test_op_translation(): except tvm.error.OpNotImplemented as e: msg = str(e) assert isinstance(e, NotImplementedError) - assert msg.find("api_test.cc") != -1 + assert msg.find("ffi_testing.cc") != -1 fchk_eq = tvm.testing.test_check_eq_callback( "InternalError: myop") @@ -36,14 +36,14 @@ def test_op_translation(): assert False except tvm.error.InternalError as e: msg = str(e) - assert msg.find("api_test.cc") != -1 + assert msg.find("ffi_testing.cc") != -1 try: tvm.testing.ErrorTest(0, 1) assert False except ValueError as e: msg = str(e) - assert msg.find("api_test.cc") != -1 + assert msg.find("ffi_testing.cc") != -1 def test_deep_callback(): From fd6d7837ed05661e81358045adb902772a4f82c3 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 20 Feb 2020 09:24:37 -0800 Subject: [PATCH 23/73] [DOCS] Fix sphinx warnings (#4917) * Fix Python docstrings * More fixes * Fix lint --- docs/api/python/contrib.rst | 2 +- docs/api/python/relay/base.rst | 6 --- docs/api/python/relay/expr.rst | 6 --- docs/api/python/tensor.rst | 44 ------------------ python/tvm/autotvm/task/dispatcher.py | 6 +-- python/tvm/autotvm/task/space.py | 54 ++++++++++++---------- python/tvm/contrib/cblas.py | 29 ++++++------ python/tvm/ir/base.py | 10 ++--- python/tvm/ir/transform.py | 3 +- python/tvm/relay/op/reduce.py | 60 ++++++++++++------------- python/tvm/relay/op/transform.py | 64 +++++++++++++-------------- python/tvm/relay/op/vision/yolo.py | 31 ++++++++----- python/tvm/relay/transform.py | 19 ++++---- python/tvm/te/tensor.py | 2 +- python/tvm/tir/expr.py | 2 +- topi/python/topi/sparse/csrmv.py | 1 - 16 files changed, 148 insertions(+), 191 deletions(-) delete mode 100644 docs/api/python/tensor.rst diff --git a/docs/api/python/contrib.rst b/docs/api/python/contrib.rst index 95465116cea6..b482d30515d4 100644 --- a/docs/api/python/contrib.rst +++ b/docs/api/python/contrib.rst @@ -21,7 +21,7 @@ tvm.contrib tvm.contrib.cblas ~~~~~~~~~~~~~~~~~ -.. automodule:: tvm.contrib.cc +.. automodule:: tvm.contrib.cblas :members: diff --git a/docs/api/python/relay/base.rst b/docs/api/python/relay/base.rst index a3c52485ab97..dc9dac0f67bd 100644 --- a/docs/api/python/relay/base.rst +++ b/docs/api/python/relay/base.rst @@ -26,11 +26,5 @@ tvm.relay.base .. autoclass:: tvm.relay.base.RelayNode :members: -.. autoclass:: tvm.relay.base.Span - :members: - -.. autoclass:: tvm.relay.base.SourceName - :members: - .. autoclass:: tvm.relay.base.Id :members: diff --git a/docs/api/python/relay/expr.rst b/docs/api/python/relay/expr.rst index f17fc2471725..57a4a2511b72 100644 --- a/docs/api/python/relay/expr.rst +++ b/docs/api/python/relay/expr.rst @@ -35,12 +35,6 @@ tvm.relay.expr .. autoclass:: tvm.relay.expr.Tuple :members: -.. autoclass:: tvm.relay.expr.Var - :members: - -.. autoclass:: tvm.relay.expr.GlobalVar - :members: - .. 
autoclass:: tvm.relay.expr.Function :members: diff --git a/docs/api/python/tensor.rst b/docs/api/python/tensor.rst deleted file mode 100644 index 032de39bb976..000000000000 --- a/docs/api/python/tensor.rst +++ /dev/null @@ -1,44 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -tvm.tensor ----------- -.. automodule:: tvm.tensor - -.. autoclass:: tvm.tensor.Tensor - :members: - :inherited-members: - -.. autoclass:: tvm.tensor.Operation - :members: - :inherited-members: - -.. autoclass:: tvm.tensor.ComputeOp - :members: - :show-inheritance: - -.. autoclass:: tvm.tensor.PlaceholderOp - :members: - :show-inheritance: - -.. autoclass:: tvm.tensor.ScanOp - :members: - :show-inheritance: - -.. autoclass:: tvm.tensor.ExternOp - :members: - :show-inheritance: diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 28a9fbba2834..e7022fad2081 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -258,8 +258,7 @@ class ApplyHistoryBest(DispatchContext): records : str or iterator of (MeasureInput, MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. - Otherwise, it is an iterator. + Each row of this file is an encoded record pair. Otherwise, it is an iterator. """ def __init__(self, records): super(ApplyHistoryBest, self).__init__() @@ -279,8 +278,7 @@ def load(self, records): records : str or iterator of (MeasureInput, MeasureResult) Collection of tuning records. If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. - Otherwise, it is an iterator. + Each row of this file is an encoded record pair. Otherwise, it is an iterator. """ # pylint: disable=import-outside-toplevel from pathlib import Path diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index d83a248c4ece..fbdd34e502ca 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -54,13 +54,13 @@ class TransformSpace(object): """Base class for transform space TransformSpace is the node in the computation graph of axes - Note - ---- - We can regard our schedule code as a transformation graph of axes. - Starting from raw axes in the definition of tvm.compute, we can transform these axes - by some operators. The operator includes 'split', 'reorder' and 'annotate'. - Each operator has some tunable parameters (e.g. the split factor). - Then the tuning process is just to find good parameters of these op. + .. note:: + + We can regard our schedule code as a transformation graph of axes. + Starting from raw axes in the definition of tvm.compute, we can transform these axes + by some operators. 
The operator includes 'split', 'reorder' and 'annotate'. + Each operator has some tunable parameters (e.g. the split factor). + Then the tuning process is just to find good parameters of these op. So the all the combinations of the parameters of these op forms our search space. @@ -109,7 +109,8 @@ class VirtualAxis(TransformSpace): var: int or tvm.schedule.IterVar If is int, return a virtual axis whose length is the provided argument. If is IterVar, return a virtual axis whose length is extracted from - the IterVar's extent domain. + the IterVar's extent domain. + name: str """ name_ct = 0 @@ -253,9 +254,9 @@ class SplitEntity(object): Parameters ---------- size: Array of int - the size of every axis after split + the size of every axis after split. e.g. an axis of extent 128, we split it into 3 axes, a possible - size is [4, 4, 8] (4x4x8 = 128) + size is [4, 4, 8] (4x4x8 = 128). """ def __init__(self, size): self.size = size @@ -626,7 +627,7 @@ def axis(var): var: int or tvm.schedule.IterVar If is int, return an axis whose length is the provided argument. If is IterVar, return an axis whose length is extracted from the - IterVar's extent domain. + IterVar's extent domain. """ return VirtualAxis(var) @@ -647,18 +648,19 @@ def define_split(self, name, axis, policy='factors', **kwargs): If is 'power2', the tuner will try power-of-two factors less or equal to the length. If is 'verbose', the tuner will try all candidates in above two policies. If is 'candidate', try given candidates. - kwargs: dict + **kwargs: extra arguments for policy - max_factor: int - the maximum split factor. - filter: function(int) -> bool - see examples below for how to use filter. - num_outputs: int - the total number of axis after split. - no_tail: bool - should we only include divisible numbers as split factors. - candidate: list - (policy=candidate) manual candidate list. + + ``max_factor``: + the maximum split factor (`int`). + ``filter``: + see examples below for how to use filter (`Callable[[int], bool]`). + ``num_outputs``: + the total number of axis after split (`int`). + ``no_tail``: + should we only include divisible numbers as split factors (`bool`). + `candidate``: + (policy=candidate) manual candidate list (`List`). Examples -------- @@ -668,6 +670,7 @@ def define_split(self, name, axis, policy='factors', **kwargs): >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4 >>> cfg.define_split('tile_y', y, policy='factors', filter=lambda x: x.size[-1] <= 4) """ + axes = [axis] return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs) @@ -749,8 +752,11 @@ def raise_error(self, msg): def valid(self): """Check whether the config meets all the constraints - Note: This check should be called after instantiation of task, - because the ConfigEntity/ConfigSpace collects errors during instantiation + + .. note:: + + This check should be called after instantiation of task, + because the ConfigEntity/ConfigSpace collects errors during instantiation Returns ------- diff --git a/python/tvm/contrib/cblas.py b/python/tvm/contrib/cblas.py index cdd4ce22c82d..2337f846be51 100644 --- a/python/tvm/contrib/cblas.py +++ b/python/tvm/contrib/cblas.py @@ -21,23 +21,22 @@ def matmul(lhs, rhs, transa=False, transb=False, **kwargs): """Create an extern op that compute matrix mult of A and rhs with CrhsLAS - This function serves as an example on how to call external libraries. 
Parameters ---------- - lhs : Tensor + lhs: Tensor The left matrix operand - rhs : Tensor + rhs: Tensor The right matrix operand - transa : bool + transa: bool Whether transpose lhs - transb : bool + transb: bool Whether transpose rhs Returns ------- - C : Tensor + C: Tensor The result tensor. """ n = lhs.shape[1] if transa else lhs.shape[0] @@ -55,20 +54,22 @@ def matmul(lhs, rhs, transa=False, transb=False, **kwargs): def batch_matmul(lhs, rhs, transa=False, transb=False, iterative=False, **kwargs): """Create an extern op that compute batched matrix mult of A and rhs with CBLAS - This function serves as an example on how to call external libraries. - Parameters + This function serves as an example on how to call external libraries. + + Parameters ---------- - lhs : Tensor + lhs: Tensor The left matrix operand - rhs : Tensor + rhs: Tensor The right matrix operand - transa : bool + transa: bool Whether transpose lhs - transb : bool + transb: bool Whether transpose rhs - Returns + + Returns ------- - C : Tensor + C: Tensor The result tensor. """ b = lhs.shape[0] diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py index 07ed8e8f8de1..661a64a08bba 100644 --- a/python/tvm/ir/base.py +++ b/python/tvm/ir/base.py @@ -39,11 +39,11 @@ def astext(self, show_meta_data=True, annotate=None): Optionally annotate function to provide additional information in the comment block. - Note - ---- - The meta data section is necessary to fully parse the text format. - However, it can contain dumps that are big (e.g constant weights), - so it can be helpful to skip printing the meta data section. + .. note:: + + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. Returns ------- diff --git a/python/tvm/ir/transform.py b/python/tvm/ir/transform.py index 619250459b5c..4519fb630c2a 100644 --- a/python/tvm/ir/transform.py +++ b/python/tvm/ir/transform.py @@ -160,7 +160,8 @@ class Sequential(Pass): Some typical usage of the sequential pass are: 1. Users provide a list of passes for optimization. 2. Only an optimization level is provided so that the backend system has - to glob all passes at this level and below to perform the optimizations. + to glob all passes at this level and below to perform the optimizations. + Note that users can also provide a series of passes that they don't want to apply when running a sequential pass. Pass dependency will be resolved in the backend as well. diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py index baf896e6bc9a..d3226012e887 100644 --- a/python/tvm/relay/op/reduce.py +++ b/python/tvm/relay/op/reduce.py @@ -145,21 +145,21 @@ def all(data, axis=None, keepdims=False, exclude=False): -------- .. 
code-block:: python - data = relay.Constant(tvm.nd.array([[[ True, True, True], - [ True, True, True], - [False, True, False]], - [[ True, False, False], - [ True, True, False], - [False, True, True]]])) - - relay.all(data, axis=1) - # [[False, True, False], - # [False, False, False]] - - relay.all(data, axis=0) - # [[ True, False, False], - # [ True, True, False], - # [False, True, False]] + data = relay.Constant(tvm.nd.array([[[ True, True, True], + [ True, True, True], + [False, True, False]], + [[ True, False, False], + [ True, True, False], + [False, True, True]]])) + + relay.all(data, axis=1) + # [[False, True, False], + # [False, False, False]] + + relay.all(data, axis=0) + # [[ True, False, False], + # [ True, True, False], + # [False, True, False]] """ axis = [axis] if isinstance(axis, int) else axis @@ -197,21 +197,21 @@ def any(data, axis=None, keepdims=False, exclude=False): -------- .. code-block:: python - data = relay.Constant(tvm.nd.array([[[ True, True, True], - [ True, True, True], - [False, True, False]], - [[ True, False, False], - [ True, True, False], - [False, True, True]]])) - - relay.any(data, axis=1) - # [[True, True, True], - # [True, True, True]] - - relay.any(data, axis=0) - # [[ True, True, True], - # [ True, True, True], - # [False, True, True]] + data = relay.Constant(tvm.nd.array([[[ True, True, True], + [ True, True, True], + [False, True, False]], + [[ True, False, False], + [ True, True, False], + [False, True, True]]])) + + relay.any(data, axis=1) + # [[True, True, True], + # [True, True, True]] + + relay.any(data, axis=0) + # [[ True, True, True], + # [ True, True, True], + # [False, True, True]] """ axis = [axis] if isinstance(axis, int) else axis diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 710d203eccc6..15c48df14827 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -147,56 +147,54 @@ def squeeze(data, axis=None): return _make.squeeze(data, axis) def reshape(data, newshape): - """Reshapes the input array. - - Example:: + """Reshape the input array. To give user more convenience in without doing manual shape inference, some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}. The significance of each is explained below: - - ``0`` copy this dimension from the input to the output shape. + ``0`` copy this dimension from the input to the output shape. - Example:: + .. code-block:: python - - data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2) - - data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4) + data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2) + data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4) - - ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions - keeping the size of the new array same as that of the input array. + ``-1`` infers the dimension of the output shape by using the remainder of + the input dimensions keeping the size of the new array same as that of the input array. At most one dimension of shape can be -1. - Example:: + .. 
code-block:: python - - data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4) - - data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8) - - data.shape = (2,3,4), newshape = (-1,), result.shape = (24,) + data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4) + data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8) + data.shape = (2,3,4), newshape = (-1,), result.shape = (24,) - - ``-2`` copy all/remainder of the input dimensions to the output shape. + ``-2`` copy all/remainder of the input dimensions to the output shape. - Example:: + .. code-block:: python - - data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4) - - data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4) - - data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1) + data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4) + data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4) + data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1) - - ``-3`` use the product of two consecutive dimensions of the input shape + ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension. - Example:: + .. code-block:: python - - data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4) - - data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20) - - data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12) - - data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4) + data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4) + data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20) + data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12) + data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4) - - ``-4`` split one dimension of the input into two dimensions passed subsequent + ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1). - Example:: + .. code-block:: python - - data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape = (1,2,3,4) - - data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4) + data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape = (1,2,3,4) + data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4) Parameters ---------- @@ -715,14 +713,14 @@ def reverse_reshape(data, newshape): """Reshapes the input array where the special values are inferred from right to left. - Example:: - The special values have the same semantics as :py:class:`tvm.relay.reshape`. The difference is that special values are inferred from right to left. It - can be explained in the example below:: + can be explained in the example below. + + .. code-block:: python - - data.shape = (10,5,4), newshape = (-1,0), reshape results in (40,5) - - data.shape = (10,5,4), newshape = (-1,0), reverse_reshape results in (40,5) + data.shape = (10,5,4), newshape = (-1,0), reshape results in (40,5) + data.shape = (10,5,4), newshape = (-1,0), reverse_reshape results in (40,5) Parameters ---------- diff --git a/python/tvm/relay/op/vision/yolo.py b/python/tvm/relay/op/vision/yolo.py index 7ecf64cf21a0..90dc3b8cef7c 100644 --- a/python/tvm/relay/op/vision/yolo.py +++ b/python/tvm/relay/op/vision/yolo.py @@ -21,18 +21,25 @@ def yolo_reorg(data, stride): """Yolo reorg operation used in darknet models. This layer shuffles the input tensor values based on the stride value. Along with the shuffling, it does the shape transform. 
- If '(n, c, h, w)' is the data shape and 's' is stride, output shape is '(n, c*s*s, h/s, w/s)' - Example: data(1, 4, 2, 2) = [[[[ 0 1] [ 2 3]] - [[ 4 5] [ 6 7]] - [[ 8 9] [10 11]] - [[12 13] [14 15]]]] - stride = 2 - ret(1, 16, 1, 1) = [[[[ 0]] [[ 2]] [[ 8]] [[10]] - [[ 1]] [[ 3]] [[ 9]] [[11]] - [[ 4]] [[ 6]] [[12]] [[14]] - [[ 5]] [[ 7]] [[13]] [[15]]]] - - Note: stride=1 has no significance for reorg operation. + If '(n, c, h, w)' is the data shape and 's' is stride, output shape is '(n, c*s*s, h/s, w/s)'. + + Example: + + .. code-block:: python + + data(1, 4, 2, 2) = [[[[ 0 1] [ 2 3]] + [[ 4 5] [ 6 7]] + [[ 8 9] [10 11]] + [[12 13] [14 15]]]] + stride = 2 + ret(1, 16, 1, 1) = [[[[ 0]] [[ 2]] [[ 8]] [[10]] + [[ 1]] [[ 3]] [[ 9]] [[11]] + [[ 4]] [[ 6]] [[12]] [[14]] + [[ 5]] [[ 7]] [[13]] [[15]]]] + + .. note:: + + stride=1 has no significance for reorg operation. Parameters ---------- diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index 4c2bf873778a..08b41b28bd35 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -256,17 +256,20 @@ def CombineParallelConv2D(min_num_branches=3): def CombineParallelDense(min_num_branches=3): """Combine multiple dense operators into one. For example: - data - / \ - dense (2,2) dense (2,2) - | | - elemwise/bcast (2,2) elemwise/bcast (2,2) + .. code-block:: text + data + / \ + dense (2,2) dense (2,2) + | | + elemwise/bcast (2,2) elemwise/bcast (2,2) Would become: - data - | - batch_matmul+elemwise/bcast (2,2,2) + .. code-block:: text + + data + | + batch_matmul+elemwise/bcast (2,2,2) Parameters ---------- diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py index fcbb68f33f22..739268aba4a5 100644 --- a/python/tvm/te/tensor.py +++ b/python/tvm/te/tensor.py @@ -102,7 +102,7 @@ def axis(self): @property def op(self): - """The corressponding :any:`Operation`.""" + """The corresponding :py:class:`Operation`.""" return self.__getattr__("op") @property diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index aeda603e19aa..ca11ffc76ea5 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -812,7 +812,7 @@ class Select(PrimExprWithOp): Note ---- Select may compute both true_value and false_value. - Use :any:`tvm.if_then_else` instead if you want to + Use :py:func:`tvm.if_then_else` instead if you want to get a conditional expression that only evaluates the correct branch. diff --git a/topi/python/topi/sparse/csrmv.py b/topi/python/topi/sparse/csrmv.py index fb9f10b151c0..8a21f0db6d96 100644 --- a/topi/python/topi/sparse/csrmv.py +++ b/topi/python/topi/sparse/csrmv.py @@ -87,7 +87,6 @@ def csrmv(a, x, y=None): where `x` and `y` are vectors, `A` is an m-by-k sparse matrix in the CSR format. Parameters - ---------- a : tvm.contrib.sparse.CSRNDArray 2-D sparse matrix with shape [m, k] From efd35e86992815d7f46a77dfced79180964472ac Mon Sep 17 00:00:00 2001 From: wpan11nv <60017475+wpan11nv@users.noreply.github.com> Date: Thu, 20 Feb 2020 09:25:35 -0800 Subject: [PATCH 24/73] [Relay] Fix an assertion exposed by loop vectorizer (#4916) - Allows uniform conditions for select expressions (the same as Halide) exposed by the loop vectorizer.
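  A minimal sketch of the now-accepted pattern (an illustrative sketch,
  assuming the tvm.tir Python constructors of this codebase; variable names
  are invented): a 1-lane boolean condition selecting between 4-lane values.

      import tvm
      from tvm import tir

      cond = tir.Var("c", "bool")                            # scalar, 1-lane condition
      tval = tir.Broadcast(tir.FloatImm("float32", 1.0), 4)  # 4-lane true value
      fval = tir.Broadcast(tir.FloatImm("float32", 0.0), 4)  # 4-lane false value
      sel = tir.Select(cond, tval, fval)  # formerly rejected by the lanes CHECK_EQ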
Signed-off-by: Wei Pan --- src/tir/ir/expr.cc | 3 ++- tests/python/relay/test_op_level4.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index 22844745982f..07cae5e2c746 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -158,7 +158,8 @@ PrimExpr SelectNode::make(PrimExpr condition, PrimExpr true_value, PrimExpr fals CHECK(true_value.defined()) << "ValueError: true_value is undefined"; CHECK(false_value.defined()) << "ValueError: true_value is undefined"; CHECK(condition.dtype().is_bool()); - CHECK_EQ(condition.dtype().lanes(), true_value.dtype().lanes()); + CHECK(condition.dtype().lanes() == true_value.dtype().lanes() || + condition.dtype().lanes() == 1); CHECK(false_value.dtype() == true_value.dtype()) << "TypeError: mismatched types"; ObjectPtr node = make_object(); diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index c5cd70818795..44b51f2c2367 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -323,6 +323,7 @@ def verify(dshape, begin, end, strides, vshape, test_ref=True): op_res = intrp.evaluate(func)(x_data, v_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) + verify((3, 4, 16), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3)) verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3)) From 98e7709ff9e69d48ed46e66d89145aeed074f509 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 20 Feb 2020 14:09:34 -0800 Subject: [PATCH 25/73] [DOCS] Fix Sphinx Warnings (RST indent, cross-ref, and image scale) (#4920) * fix indents * Fix image scale and cross-ref --- docs/api/python/index.rst | 1 + docs/api/python/relay/op.rst | 5 -- docs/api/python/runtime.rst | 2 +- docs/contribute/pull_request.rst | 7 ++- docs/deploy/index.rst | 3 +- docs/dev/inferbound.rst | 11 ---- docs/dev/relay_bring_your_own_codegen.rst | 4 +- docs/dev/relay_intro.rst | 3 -- docs/dev/runtime.rst | 1 + docs/dev/virtual_machine.rst | 21 +++++++- docs/vta/dev/hardware.rst | 1 + docs/vta/hardware.rst | 19 ------- tutorials/frontend/build_gcn.py | 35 ++++++------- tutorials/frontend/from_tflite.py | 51 +++++++++---------- tutorials/frontend/using_external_lib.py | 2 +- tutorials/language/tuple_inputs.py | 2 +- tutorials/optimize/opt_conv_tensorcore.py | 15 +++--- tutorials/optimize/opt_gemm.py | 1 - .../optimize/opt_matmul_auto_tensorcore.py | 9 ++-- tutorials/relay_quick_start.py | 5 +- vta/tutorials/frontend/README.txt | 2 +- .../frontend/deploy_vision_on_vta.py | 21 ++++---- 22 files changed, 99 insertions(+), 122 deletions(-) delete mode 100644 docs/vta/hardware.rst diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index f62a4b848650..796bba8c591c 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -21,6 +21,7 @@ Python API .. toctree:: :maxdepth: 2 + tvm runtime ndarray error diff --git a/docs/api/python/relay/op.rst b/docs/api/python/relay/op.rst index 4c62a06d0959..3d8460adcce7 100644 --- a/docs/api/python/relay/op.rst +++ b/docs/api/python/relay/op.rst @@ -53,8 +53,3 @@ tvm.relay.op .. automodule:: tvm.relay.op.nn :members: - -.. automodule:: tvm.relay.op.vision.multibox - :members: - -.. 
autofunction:: tvm.relay.vision.nms diff --git a/docs/api/python/runtime.rst b/docs/api/python/runtime.rst index 9e395712aa6d..30d1b98650a3 100644 --- a/docs/api/python/runtime.rst +++ b/docs/api/python/runtime.rst @@ -27,7 +27,7 @@ tvm.runtime .. autoclass:: tvm.runtime.PackedFunc :members: - :inheritated-members: + :inherited-members: .. autofunction:: tvm.register_func diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 7ad53758c54b..51626a16eb1a 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -29,12 +29,11 @@ This is a quick guide to submit a pull request, please also refer to the detaile git rebase upstream/master - Make sure the code style check passes by typing the following command, and all the existing test-cases pass. - - ``docker/bash.sh tvmai/ci-lint ./tests/scripts/task_lint.sh`` - (Note: You must install docker beforehand so you can run a docker image.) +- ``docker/bash.sh tvmai/ci-lint ./tests/scripts/task_lint.sh``. (Note: You must install docker beforehand so you can run a docker image.) - Add test-cases to cover the new features or bugfix the patch introduces. - Document the code you wrote, see more at :ref:`doc_guide` -- Send the pull request, fix the problems reported by automatic checks. - Request code reviews from other contributors and improves your patch according to feedbacks. +- Send the pull request and fix the problems reported by automatic checks. +- Request code reviews from other contributors and improve your patch according to feedback. - To get your code reviewed quickly, we encourage you to help review others' code so they can do the favor in return. - Code review is a shepherding process that helps to improve contributor's code quality. diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst index 9a30b96ca66e..a43cce728f61 100644 --- a/docs/deploy/index.rst +++ b/docs/deploy/index.rst @@ -56,7 +56,6 @@ embedded devices is through TVM's RPC API. Here are the links to the related tutorials. - :ref:`tutorial-cross-compilation-and-rpc` -- :ref:`tutorial-deploy-model-on-mali-gpu` - :ref:`tutorial-deploy-model-on-rasp` After you finished tuning and benchmarking, you might need to deploy the model on the @@ -68,3 +67,5 @@ target device without relying on RPC. see the following resources on how to do s cpp_deploy android integrate + aocl_fpga + aws_fpga diff --git a/docs/dev/inferbound.rst b/docs/dev/inferbound.rst index d9fedf8296ef..2f4d428dba99 100644 --- a/docs/dev/inferbound.rst +++ b/docs/dev/inferbound.rst @@ -118,13 +118,11 @@ In the Operation class declaration above, we can see that each operation also ha .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/stage_graph.png :align: center - :scale: 70% InferBound makes one pass through the graph, visiting each stage exactly once. InferBound starts from the output stages (i.e., the solid blue nodes in the graph above), and moves upwards (in the opposite direction of the edges). This is achieved by performing a reverse topological sort on the nodes of the graph. Therefore, when InferBound visits a stage, each of its consumer stages has already been visited. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/inferbound_traversal.png :align: center - :scale: 70% The InferBound pass is shown in the following pseudo-code: @@ -162,7 +160,6 @@ Recall that all IterVars of the stage are related by IterVarRelations. The IterV ..
image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/relations.png :align: center - :scale: 70% The above diagram shows the IterVar hyper-graph for one stage. The stage has one root_iter_var, ``i``. It has been split, and the resulting inner axis ``i.inner``, has been split again. The leaf_iter_vars of the stage are shown in green: ``i.outer``, ``i.inner.outer``, and ``i.inner.inner``. @@ -208,7 +205,6 @@ As mentioned above, a consumer may only require a small number of elements from .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/inferbound_phases.png :align: center - :scale: 70% IntSets ~~~~~~~ @@ -323,14 +319,12 @@ A ComputeOp has only a single output Tensor, whose axes correspond to the axis v .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/gatherbound.png :align: center - :scale: 70% The union of IntSets is computed by converting each IntSet to an Interval, and then taking the minimum of all minimums, and the maximum of all of these interval's maximums. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/union.png :align: center - :scale: 70% This clearly results in some unnecessary computation, i.e., tensor elements will be computed that are never used. @@ -340,7 +334,6 @@ Unfortunately, even if we're lucky and the IntervalSet unions do not produce unn .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/gatherbound_problem.png :align: center - :scale: 70% .. _InferBoundCA: @@ -696,7 +689,6 @@ When InferRootBound is working on stage B, it visits B's consumer stage C to fin .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_problem.png :align: center - :scale: 70% @@ -756,17 +748,14 @@ If the split factor is 4, or 8, in the above example, the region of B needed in .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_div.png :align: center - :scale: 70% However, if the split factor is changed from 4 to 3 in the example above, it is easy to see that the region of B that C needs can no longer be described by an independent Range for each of its axes. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_nodiv.png :align: center - :scale: 70% The best that can be done with rectangular regions is shown in the following diagram. The orange regions are the minimum rectangular regions covering the region of B that needs to be computed, at each iteration of the outer loop. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/docs/inferbound/passupdomain_min.png :align: center - :scale: 70% diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst index 3e3369dba52f..b7d5fa9f5fd6 100644 --- a/docs/dev/relay_bring_your_own_codegen.rst +++ b/docs/dev/relay_bring_your_own_codegen.rst @@ -535,7 +535,7 @@ To simplify, we define a graph representation named "ExampleJSON" in this guide. Then the ExampleJON of this subgraph looks like: -.. code-block:: json +.. 
code-block:: none subgraph_0 input 0 10 10 @@ -544,7 +544,7 @@ Then the ExampleJON of this subgraph looks like: input 3 10 10 add 4 inputs: 0 1 shape: 10 10 sub 5 inputs: 4 2 shape: 10 10 - add 6 inputs: 5 3 shape: 10 10 + mul 6 inputs: 5 3 shape: 10 10 The ``input`` keyword declares an input tensor with its ID and shape; while the other statements describes computations in `` inputs: [input ID] shape: [shape]`` syntax. diff --git a/docs/dev/relay_intro.rst b/docs/dev/relay_intro.rst index 526822afe422..fac447987717 100644 --- a/docs/dev/relay_intro.rst +++ b/docs/dev/relay_intro.rst @@ -39,7 +39,6 @@ compile for heterogeneous execution environments (e.g., executing parts of the g .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow.png :align: center - :scale: 70% You can use Relay to build a computational (dataflow) graph. Specifically, the above code shows how to @@ -130,7 +129,6 @@ The code example below shows one program with two forms side by side. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow_vs_func.png :align: center - :scale: 70% The nested let binding is called A-normal form, and it is commonly used as IRs in functional programming languages. @@ -155,7 +153,6 @@ which does not use let bindings. .. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/let_scope.png :align: center - :scale: 70% The problem comes when we try to decide where we should evaluate node ``%1``. In particular, while the text format seems to suggest that we should evaluate node ``%1`` outside the if scope, the AST(as shown in the picture) does not suggest so. diff --git a/docs/dev/runtime.rst b/docs/dev/runtime.rst index 353b3392d181..5ed5f86ed44b 100644 --- a/docs/dev/runtime.rst +++ b/docs/dev/runtime.rst @@ -258,6 +258,7 @@ It also allows us to get members of an object easily in front-end language. For example, in the following code, we accessed the op field of the TensorNode. .. code:: python + import tvm x = tvm.placeholder((3,4), name="x") diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst index 338fb46050e9..5bb5adee5459 100644 --- a/docs/dev/virtual_machine.rst +++ b/docs/dev/virtual_machine.rst @@ -91,6 +91,7 @@ Ret ^^^ **Arguments**: :: + RegName dst RegName result @@ -100,6 +101,7 @@ InvokePacked ^^^^^^^^^^^^ **Arguments**: :: + Index packed_index Index arity Index output_size @@ -114,6 +116,7 @@ AllocTensor ^^^^^^^^^^^ **Arguments**: :: + RegName dst RegName storage uint32_t ndim @@ -127,6 +130,7 @@ AllocTensorReg ^^^^^^^^^^^^^^ **Arguments**: :: + RegName dst RegName storage RegName shape_register @@ -139,6 +143,7 @@ AllocStorage ^^^^^^^^^^^^ **Arguments**: :: + RegName dst RegName size RegName alignment @@ -151,6 +156,7 @@ AllocADT ^^^^^^^^ **Arguments**: :: + RegName dst Index tag Index num_fields @@ -163,6 +169,7 @@ AllocClosure ^^^^^^^^^^^^ **Arguments**: :: + RegName dst Index clo_index Index num_freevar @@ -176,6 +183,7 @@ GetField ^^^^^^^^ **Arguments**: :: + RegName dst RegName object Index field_index @@ -186,6 +194,7 @@ If ^^ **Arguments**: :: + RegName test RegName target Index true_offset @@ -199,6 +208,7 @@ GetTag ^^^^^^ **Arguments**: :: + RegName object RegName dst @@ -212,6 +222,7 @@ Goto ^^^^ **Arguments**: :: + Index pc_offset Relative unconditional jump by ``pc_offset``. 
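For intuition, a hypothetical two-way branch built from ``If`` and ``Goto`` (the assembly-like syntax, register names and offsets below are invented purely for illustration; offsets are relative instruction counts): ::

    if $tag, $expected, 1, 3    # tags equal: fall through (+1); otherwise jump +3
    load_consti 1 $r0           # true branch
    goto 2                      # skip over the false branch
    load_consti 0 $r0           # false branch
    ret $r0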
@@ -220,6 +231,7 @@ Invoke ^^^^^^ **Arguments**: :: + Index func_index Invoke function at ``func_index``, consumes the number of arguments contained in the VMFunction's @@ -229,6 +241,7 @@ InvokeClosure ^^^^^^^^^^^^^ **Arguments**: :: + RegName closure Index num_closure_args RegName* closure_args @@ -239,6 +252,7 @@ LoadConst ^^^^^^^^^ **Arguments**: :: + RegName dst Index const_index @@ -248,6 +262,7 @@ LoadConsti ^^^^^^^^^^ **Arguments**: :: + Index val RegName dst @@ -277,7 +292,7 @@ previous call. Registers are allocated in a continuous space (virtual register f We keep track of a set of Relay functions we have called, a pointer into its bytecode, an offset into the byte code (known as the program counter). -:: +.. code-block:: c struct VirtualMachine { ... @@ -331,6 +346,7 @@ Optimizations marked with `TODO` are not implemented yet. Serialization ~~~~~~~~~~~~~ + Serializing and deserializing the executable generated by the Relay VM compiler is a must as we may want to save the model to the disk and perform inference later. Previously, Relay has produced a serialized form in a json file for the graph runtime. However, the same format is not directly @@ -372,14 +388,17 @@ Unresolved Questions How do we handle dynamic shapes? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + TODO How can we modify the VM to support JIT compilation of certain code paths? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + In the code generation space there are still many tradeoffs to be analyzed and the VM is designed to be very flexible so we can modify it for future experiments. How do we support heterogenous execution? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Heterogenous execution should work out of the box assuming we have annotated the appropriate device copies. In order to do this properly we need to run the device annotation and copying passes. diff --git a/docs/vta/dev/hardware.rst b/docs/vta/dev/hardware.rst index 7ec073b7d905..784cd54d6d45 100644 --- a/docs/vta/dev/hardware.rst +++ b/docs/vta/dev/hardware.rst @@ -215,6 +215,7 @@ This would result in a ``load-gemm-activate-store`` task pipeline which closely Adding more stages has a cost however: it can add storage and extra logic overhead, which is why we opted for a default 3-stage pipeline. .. _vta-uarch: + Microarchitectural Overview --------------------------- diff --git a/docs/vta/hardware.rst b/docs/vta/hardware.rst deleted file mode 100644 index cfd7be333081..000000000000 --- a/docs/vta/hardware.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. 
- -VTA Hardware Design Overview -============================ diff --git a/tutorials/frontend/build_gcn.py b/tutorials/frontend/build_gcn.py index 077b066f7f62..d385dc9e72ab 100644 --- a/tutorials/frontend/build_gcn.py +++ b/tutorials/frontend/build_gcn.py @@ -16,32 +16,29 @@ # under the License. """ Building a Graph Convolutional Network -===================== +====================================== **Author**: `Yulun Yao `_, \ `Chien-Yu Lin `_ This article is an introductory tutorial to build a Graph Convolutional Network (GCN) with Relay. - In this tutorial, we will run our GCN on Cora dataset to demonstrate. - Cora dataset is a common benchmark for Graph Neural Networks (GNN) and frameworks that support GNN training and inference. - We directly load the dataset from DGL library to do the apples to apples comparison against DGL. Please refer to DGL doc for DGL installation at -https://docs.dgl.ai/install/index.html +https://docs.dgl.ai/install/index.html. -and refer to PyTorch guide for PyTorch installation at -https://pytorch.org/get-started/locally/ +Please refer to PyTorch guide for PyTorch installation at +https://pytorch.org/get-started/locally/. """ ###################################################################### # Define GCN in DGL with PyTorch backend -# ------------------ +# -------------------------------------- # # DGL example: https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn -# This part reuses the code from the above example +# This part reuses the code from the above example. import torch import torch.nn as nn import torch.nn.functional as F @@ -78,7 +75,7 @@ def forward(self, features): ###################################################################### # Define the functions to load dataset and evaluate accuracy -# ------------------ +# ---------------------------------------------------------- # You may substitute this part with your own dataset, here we load data from DGL from dgl.data import load_data from collections import namedtuple @@ -106,7 +103,7 @@ def evaluate(data, logits): ###################################################################### # Load the data and set up model parameters -# ------------------ +# ----------------------------------------- """ Parameters ---------- @@ -136,7 +133,7 @@ def evaluate(data, logits): ###################################################################### # Set up the DGL-PyTorch model and get the golden results -# ------------------ +# ------------------------------------------------------- # # The weights are trained with https://github.com/dmlc/dgl/blob/master/examples/pytorch/gcn/train.py from tvm.contrib.download import download_testdata @@ -162,7 +159,7 @@ def evaluate(data, logits): ###################################################################### # Run the DGL model and test for accuracy -# ------------------ +# --------------------------------------- torch_model.eval() with torch.no_grad(): logits_torch = torch_model(features) @@ -174,9 +171,8 @@ def evaluate(data, logits): ###################################################################### # Define Graph Convolution Layer in Relay -# ---------------------------- +# --------------------------------------- # To run GCN on TVM, we first need to implement Graph Convolution Layer. 
-# # You may refer to https://github.com/dmlc/dgl/blob/master/python/dgl/nn/mxnet/conv.py for a GraphConv Layer implemented in DGL with MXNet Backend # # The layer is defined with below operations, note that we apply two transposes to keep adjacency matrix on right hand side of sparse_dense operator, @@ -251,7 +247,7 @@ def GraphConv(layer_name, ###################################################################### # Prepare the parameters needed in the GraphConv layers -# ------------------ +# ----------------------------------------------------- # import numpy as np import networkx as nx @@ -282,7 +278,7 @@ def prepare_params(g, data): ###################################################################### # Put layers together -# ------------------ +# ------------------- # Define input features, norms, adjacency matrix in Relay infeats = relay.var("infeats", shape=data.features.shape) @@ -321,7 +317,8 @@ def prepare_params(g, data): ###################################################################### # Compile and run with TVM -# ------------------ +# ------------------------ +# # Export the weigths from PyTorch model to Python Dict model_params = {} for param_tensor in torch_model.state_dict(): @@ -345,7 +342,7 @@ def prepare_params(g, data): ###################################################################### # Run the TVM model, test for accuracy and verify with DGL -# ------------------ +# -------------------------------------------------------- m.run() logits_tvm = m.get_output(0).asnumpy() print("Print the first five outputs from TVM execution\n", logits_tvm[:5]) diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py index 85ece270a21b..e93a71ce4a78 100644 --- a/tutorials/frontend/from_tflite.py +++ b/tutorials/frontend/from_tflite.py @@ -16,13 +16,12 @@ # under the License. """ Compile TFLite Models -=================== +===================== **Author**: `Zhao Wu `_ This article is an introductory tutorial to deploy TFLite models with Relay. To get started, Flatbuffers and TFLite package needs to be installed as prerequisites. - A quick solution is to install Flatbuffers via pip .. 
code-block:: bash @@ -68,7 +67,7 @@ """ ###################################################################### # Utils for downloading and extracting zip files -# --------------------------------------------- +# ---------------------------------------------- import os def extract(path): @@ -84,28 +83,28 @@ def extract(path): ###################################################################### # Load pretrained TFLite model -# --------------------------------------------- -# we load mobilenet V1 TFLite model provided by Google +# ---------------------------- +# Load mobilenet V1 TFLite model provided by Google from tvm.contrib.download import download_testdata model_url = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz" -# we download model tar file and extract, finally get mobilenet_v1_1.0_224.tflite +# Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite model_path = download_testdata(model_url, "mobilenet_v1_1.0_224.tgz", module=['tf', 'official']) model_dir = os.path.dirname(model_path) extract(model_path) -# now we have mobilenet_v1_1.0_224.tflite on disk and open it +# Now we can open mobilenet_v1_1.0_224.tflite tflite_model_file = os.path.join(model_dir, "mobilenet_v1_1.0_224.tflite") tflite_model_buf = open(tflite_model_file, "rb").read() -# get TFLite model from buffer +# Get TFLite model from buffer import tflite.Model tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) ###################################################################### # Load a test image -# --------------------------------------------- +# ----------------- # A single cat dominates the examples! from PIL import Image from matplotlib import pyplot as plt @@ -118,10 +117,10 @@ def extract(path): plt.show() image_data = np.asarray(resized_image).astype("float32") -# after expand_dims, we have format NHWC +# Add a dimension to the image so that we have NHWC format layout image_data = np.expand_dims(image_data, axis=0) -# preprocess image as described here: +# Preprocess image as described here: # https://github.com/tensorflow/models/blob/edb6ed22a801665946c63d650ab9a0b23d98e1b1/research/slim/preprocessing/inception_preprocessing.py#L243 image_data[:, :, :, 0] = 2.0 / 255.0 * image_data[:, :, :, 0] - 1 image_data[:, :, :, 1] = 2.0 / 255.0 * image_data[:, :, :, 1] - 1 @@ -130,50 +129,50 @@ def extract(path): ###################################################################### # Compile the model with relay -# --------------------------------------------- +# ---------------------------- # TFLite input tensor name, shape and type input_tensor = "input" input_shape = (1, 224, 224, 3) input_dtype = "float32" -# parse TFLite model and convert into Relay computation graph +# Parse TFLite model and convert it to a Relay module from tvm import relay mod, params = relay.frontend.from_tflite(tflite_model, shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype}) -# target x86 CPU +# Build the module against to x86 CPU target = "llvm" with relay.build_config(opt_level=3): graph, lib, params = relay.build(mod, target, params=params) ###################################################################### # Execute on TVM -# --------------------------------------------- +# -------------- import tvm from tvm.contrib import graph_runtime as runtime -# create a runtime executor module +# Create a runtime executor module module = runtime.create(graph, lib, tvm.cpu()) -# feed input data +# Feed input data 
module.set_input(input_tensor, tvm.nd.array(image_data)) -# feed related params +# Feed related params module.set_input(**params) -# run +# Run module.run() -# get output +# Get output tvm_output = module.get_output(0).asnumpy() ###################################################################### # Display results -# --------------------------------------------- +# --------------- -# load label file +# Load label file label_file_url = ''.join(['https://raw.githubusercontent.com/', 'tensorflow/tensorflow/master/tensorflow/lite/java/demo/', 'app/src/main/assets/', @@ -181,15 +180,15 @@ def extract(path): label_file = "labels_mobilenet_quant_v1_224.txt" label_path = download_testdata(label_file_url, label_file, module='data') -# list of 1001 classes +# List of 1001 classes with open(label_path) as f: labels = f.readlines() -# convert result to 1D data +# Convert result to 1D data predictions = np.squeeze(tvm_output) -# get top 1 prediction +# Get top 1 prediction prediction = np.argmax(predictions) -# convert id to class name and show the result +# Convert id to class name and show the result print("The image prediction result is: id " + str(prediction) + " name: " + labels[prediction]) diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py index 35b015bffcd3..71acedaf181b 100644 --- a/tutorials/frontend/using_external_lib.py +++ b/tutorials/frontend/using_external_lib.py @@ -16,7 +16,7 @@ # under the License. """ Using External Libraries in Relay -================================ +================================= **Author**: `Masahiro Masuda `_, `Truman Tian `_ This is a short tutorial on how to use external libraries such as cuDNN, or cuBLAS with Relay. diff --git a/tutorials/language/tuple_inputs.py b/tutorials/language/tuple_inputs.py index 0c5c85ca585a..715e2ef36f7e 100644 --- a/tutorials/language/tuple_inputs.py +++ b/tutorials/language/tuple_inputs.py @@ -56,7 +56,7 @@ # operators, and the inputs will collaborate together, e.g. :code:`argmax`. # In the reduction procedure, :code:`argmax` need to compare the value of # operands, also need to keep the index of operand. It can be expressed -# with :any:`comm_reducer` as below: +# with :py:func:`tvm.comm_reducer` as below: # x and y are the operands of reduction, both of them is a tuple of index # and value. diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/tutorials/optimize/opt_conv_tensorcore.py index 774b4c7258bb..ef840892d7d5 100644 --- a/tutorials/optimize/opt_conv_tensorcore.py +++ b/tutorials/optimize/opt_conv_tensorcore.py @@ -18,7 +18,7 @@ .. _opt-conv-tensorcore: How to optimize convolution using TensorCores -================================== +============================================= **Author**: `Siyuan Feng `_ In this tutorial, we will demonstrate how to write a high performance convolution @@ -29,7 +29,7 @@ ################################################################ # TensorCore Introduction -# ------------------------- +# ----------------------- # Each Tensor Core provides a 4x4x4 matrix processing array that operates # :code:`D = A * B + C`, where A, B, C and D are 4x4 matrices as Figure shows. # The matrix multiplication inputs A and B are FP16 matrices, while the accumulation @@ -45,7 +45,7 @@ ################################################################ # Preparation and Algorithm -# -------------------------- +# ------------------------- # We use the fixed size for input tensors with 256 channels and 14 x 14 dimensions. # The batch size is 256. 
Convolution filters contain 512 filters of size 3 x 3. # We use stride size 1 and padding size 1 for the convolution. In the example, we use @@ -126,8 +126,7 @@ ############################################################################### # Memory Scope -# ---------------- -# +# ------------ # In traditional GPU schedule, we have global, shared and local memory scope. # To support TensorCores, we add another three special memory scope: :code:`wmma.matrix_a`, # :code:`wmma.matrix_b` and :code:`wmma.accumulator`. On hardware, all fragments scope @@ -142,6 +141,7 @@ ############################################################################### # Define Tensor Intrinsic +# ----------------------- # In fact, TensorCore is a special hardware operation. So, we can just use tensorize # to replace a unit of computation with the TensorCore instruction. The first thing is # that we need to define tensor intrinsic. @@ -246,7 +246,6 @@ def intrin_func(ins, outs): # easiest way to solve this. Then We can bind threadIdx.x to any loops except those contain # TensorCore intrinsics directly or indirectly. Also note that it is not the unique solution. # The only thing we should do is to make sure all threads in a warp can call TensorCore at the same time. -# # Define tiling sizes block_row_warps = 4 @@ -312,10 +311,9 @@ def intrin_func(ins, outs): ############################################################################### # Lowering Computation to Intrinsics -# -------------------------- +# ---------------------------------- # The last phase is to lower the computation loops down to TensorCore hardware intrinsics # by mapping the 2D convolution to tensor intrinsics -# s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a')) s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b')) @@ -344,5 +342,6 @@ def intrin_func(ins, outs): ############################################################################### # Summary +# ------- # This tutorial demonstrates how TVM scheduling primitives can be used to # call TensorCores on specific GPUs. diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index a23589a4ab19..8ed152aee918 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -232,7 +232,6 @@ # # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/array-packing.png # :align: center -# :scale: 100% # diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py index 00dbe9de838f..eb6501ba4f3d 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -18,7 +18,7 @@ .. _opt-matmul-auto-tensorcore: How to optimize matmul with Auto TensorCore CodeGen -================================== +=================================================== **Author**: `Minmin Sun `_, \ `Lanbo Li `_, \ `Chenfan Jia `_, \ @@ -31,12 +31,11 @@ Users can also write schedule with tensorization to generate TensorCore code. Both solutions use the same tensorcore intrinsics. Please refer to :ref:`opt-conv-tensorcore` tutorial for more details. - """ ################################################################ # Preparation and Algorithm -# -------------------------- +# ------------------------- # 2 kinds of input data types are supported: float16 and int8. # For float16, the accumulator is float32. # For int8, the accumulator is int32. 
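# The pattern is simply to cast both operands to the wider accumulator type
# inside the reduction. A minimal sketch (shapes assumed; the same
# ``tvm.compute`` idiom this tutorial uses for its GEMM definition):
#
# .. code-block:: python
#
#     import tvm
#
#     N, L, M = 16, 16, 16
#     A = tvm.placeholder((N, L), name='A', dtype='float16')
#     B = tvm.placeholder((L, M), name='B', dtype='float16')
#     k = tvm.reduce_axis((0, L), name='k')
#     # multiply in float16, accumulate in float32 (int8 -> int32 is analogous)
#     C = tvm.compute((N, M),
#                     lambda i, j: tvm.sum(A[i, k].astype('float32') *
#                                          B[k, j].astype('float32'), axis=k))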
@@ -215,7 +214,7 @@ def test_gemm(N, L, M, dtype, layout): ############################################################################### # AutoTune and Test -# -------------------- +# ----------------- # Finally we use a tuner to tune the schedule, generate code with best config # and run the kernel to compare with numpy to check whether the results are correct. @@ -460,6 +459,6 @@ def tune_and_evaluate(M, N, L, dtype, layout): ############################################################################### # Summary -# -------------------------- +# ------- # This tutorial demonstrates how to use the AutoTensorCoreCodeGen of TVM # to generate tensorcore kernels. diff --git a/tutorials/relay_quick_start.py b/tutorials/relay_quick_start.py index 5461b08a4b35..6cded3325ad6 100644 --- a/tutorials/relay_quick_start.py +++ b/tutorials/relay_quick_start.py @@ -18,7 +18,7 @@ .. _tutorial-relay-quick-start: Quick Start Tutorial for Compiling Deep Learning Models -====================================================== +======================================================= **Author**: `Yao Wang `_, `Truman Tian `_ This example shows how to build a neural network with Relay python frontend and @@ -33,7 +33,6 @@ # # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png # :align: center -# :scale: 100% # # In this tutorial, we'll choose cuda and llvm as target backends. # To begin with, let's import Relay and TVM. @@ -47,7 +46,7 @@ ###################################################################### # Define Neural Network in Relay -# ----------------------------- +# ------------------------------ # First, let's define a neural network with relay python frontend. # For simplicity, we'll use pre-defined resnet-18 network in Relay. # Parameters are initialized with Xavier initializer. diff --git a/vta/tutorials/frontend/README.txt b/vta/tutorials/frontend/README.txt index 319506d21f8f..7adec27a9bc0 100644 --- a/vta/tutorials/frontend/README.txt +++ b/vta/tutorials/frontend/README.txt @@ -1,4 +1,4 @@ -.. _tutorial-frontend: +.. _vta-tutorial-frontend: Compile Deep Learning Models ---------------------------- diff --git a/vta/tutorials/frontend/deploy_vision_on_vta.py b/vta/tutorials/frontend/deploy_vision_on_vta.py index c410d24d07ae..df02b4842488 100644 --- a/vta/tutorials/frontend/deploy_vision_on_vta.py +++ b/vta/tutorials/frontend/deploy_vision_on_vta.py @@ -94,7 +94,7 @@ ###################################################################### # Obtain an execution remote -# --------------------------------- +# -------------------------- # When target is 'pynq', reconfigure FPGA and runtime. # Otherwise, if target is 'sim', execute locally. @@ -136,15 +136,16 @@ # --------------------------------- # Grab vision model from Gluon model zoo and compile with Relay. # The compilation steps are: -# 1) Front end translation from MxNet into Relay module. -# 2) Apply 8-bit quantization: here we skip the first conv layer, -# and dense layer which will both be executed in fp32 on the CPU. -# 3) Perform graph packing to alter the data layout for tensorization. -# 4) Perform constant folding to reduce number of operators (e.g. eliminate -# batch norm multiply). -# 5) Perform relay build to object file. -# 6) Load the object file onto remote (FPGA device). -# 7) Generate graph runtime, `m`. +# +# 1. Front end translation from MxNet into Relay module. +# 2. 
Apply 8-bit quantization: here we skip the first conv layer, +# and dense layer which will both be executed in fp32 on the CPU. +# 3. Perform graph packing to alter the data layout for tensorization. +# 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). +# 5. Perform relay build to object file. +# 6. Load the object file onto remote (FPGA device). +# 7. Generate graph runtime, `m`. +# # Load pre-configured AutoTVM schedules with autotvm.tophub.context(target): From f23ac96905b3a434d2ee3b8bcc912a24b3e63eba Mon Sep 17 00:00:00 2001 From: Orion34C Date: Fri, 21 Feb 2020 10:43:45 +0800 Subject: [PATCH 26/73] [CODEGEN] Support cuda tensorcore subbyte int data type in auto tensorcore (#4546) * support cuda tensorcore subbyte int data type in auto tensorcore * add lisence * pass cpplint * fix code review comments * merge the int4/int1 codegen tutorial into the existing auto tensorcore tutorial * using master's new API * disable tuning when cuda is not enabled * address cr comment * do not run the tuning * fix test failure * fix cpplint error * fix bool type reduction bug * 1. fix a index bug 2. fix returned bytes value of int1/int4/uint4 * fix typo --- include/tvm/runtime/data_type.h | 7 +- include/tvm/tir/expr.h | 12 +++ src/runtime/ndarray.cc | 7 +- src/target/source/codegen_c.cc | 14 +++- src/target/source/codegen_cuda.cc | 73 ++++++++++++++++++- src/tir/pass/arg_binder.cc | 11 ++- src/tir/pass/infer_fragment.cc | 3 +- src/tir/pass/tensor_core.cc | 52 ++++++++++--- .../optimize/opt_matmul_auto_tensorcore.py | 73 ++++++++++++++++++- 9 files changed, 228 insertions(+), 24 deletions(-) diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h index 7e0ef49154e4..e6f5e55e4d42 100644 --- a/include/tvm/runtime/data_type.h +++ b/include/tvm/runtime/data_type.h @@ -230,7 +230,12 @@ class DataType { inline int GetVectorBytes(DataType dtype) { int data_bits = dtype.bits() * dtype.lanes(); // allow bool to exist - if (dtype == DataType::Bool()) return 1; + if (dtype == DataType::Bool() || + dtype == DataType::Int(4) || + dtype == DataType::UInt(4) || + dtype == DataType::Int(1)) { + return 1; + } CHECK_EQ(data_bits % 8, 0U) << "Need to load/store by multiple of bytes"; return data_bits / 8; diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h index 756907bfef47..7d497890dd13 100644 --- a/include/tvm/tir/expr.h +++ b/include/tvm/tir/expr.h @@ -1260,6 +1260,18 @@ constexpr const char* tvm_load_matrix_sync = "tvm_load_matrix_sync"; * } */ constexpr const char* tvm_mma_sync = "tvm_mma_sync"; +/*! + * \brief tvm intrinsic for tensor core bmma_sync operators. + * + * void tvm_bmma_sync(Var fragment_d, Expr index_d, + * Var fragment_a, Expr index_a, + * Var fragment_b, Expr index_b, + * Var fragment_c, Expr index_c) { + * nvcuda::wmma::bmma_sync(fragment_d[index_d], fragment_a[index_a], + * fragment_b[index_b], fragment_c[index_c]); + * } + */ +constexpr const char* tvm_bmma_sync = "tvm_bmma_sync"; /*! * \brief tvm intrinsic for tensor core fill_fragment operators. * diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index f4160cc97d72..91002c951c04 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -44,7 +44,12 @@ inline void VerifyDataType(DLDataType dtype) { } else { // allow uint1 as a special flag for bool. 
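// note: the sub-byte types handled below (int1, int4, uint4) can never
// satisfy the bits % 8 == 0 rule, so they must return early just like bool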
if (dtype.bits == 1 && dtype.code == kDLUInt) return; - CHECK_EQ(dtype.bits % 8, 0); + // allow int1/uint4/int4 + else if (dtype.bits == 1 && dtype.code == kDLInt) return; + else if (dtype.bits == 4 && dtype.code == kDLUInt) return; + else if (dtype.bits == 4 && dtype.code == kDLInt) return; + else + CHECK_EQ(dtype.bits % 8, 0); } CHECK_EQ(dtype.bits & (dtype.bits - 1), 0); } diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index 7f89307c04a3..91020555e5c8 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -170,8 +170,13 @@ std::string CodeGenC::GetBufferRef( } else { os << vid; } - os << '['; + os << "[("; PrintExpr(index, os); + os << ")"; + if (t.bits() == 4 || + (t.bits() == 1 && t.is_int())) { + os << " / " << (32 / t.bits()); + } os << ']'; } else { // Buffer declared as vector type. @@ -205,8 +210,13 @@ std::string CodeGenC::GetBufferRef( PrintType(t.element_of(), os); os << "*)"; } - os << vid << " + "; + os << vid << " + ("; PrintExpr(index, os); + os << ")"; + if (t.bits() == 4 || + (t.bits() == 1 && t.is_int())) { + os << " / " << (32 / t.bits()); + } os << "))[0]"; } return os.str(); diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 889d8b6a62d4..d5cab6eb3e37 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -144,6 +144,37 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) } } switch (t.bits()) { + case 1: { + if (t.lanes() == 1) { + os << "int"; return; + } else if (t.lanes() == 8) { + os << "int8_t"; return; + } else if (t.lanes() == 16) { + os << "int16_t"; return; + } else if (t.lanes() == 32) { + os << "int"; return; + } else { + LOG(FATAL) << "Cannot convert type " << t << " to CUDA type!"; + } + } + case 4: { + if (t.lanes() == 1) { + os << "int"; return; + } else if (t.lanes() == 4) { + os << "int16_t"; return; + } else if (t.lanes() == 8) { + // directly 8 4-bit int in integer. + os << "int"; return; + } else if (t.lanes() == 16) { + os << "int2"; return; + } else if (t.lanes() == 32) { + os << "int4"; return; + } else if (t.lanes() == 64) { + os << "int8"; return; + } else { + LOG(FATAL) << "Cannot convert type " << t << " to CUDA type!"; + } + } case 8: { if (t.lanes() == 4) { // directly 4 8 bit int in integer. @@ -182,7 +213,6 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) os << "long"; break; } } - case 1: os << "int"; break; default: fail = true; break; } if (!fail && lanes == 1) { @@ -371,6 +401,16 @@ void CodeGenCUDA::VisitExpr_(const CallNode *op, std::ostream& os) { this->PrintExpr(op->args[i * 2 + 1], os); os << "]" << ((i < 3) ? ", ": ")"); } + } else if (op->is_intrinsic(intrinsic::tvm_bmma_sync)) { + need_mma_h_ = true; + CHECK_EQ(op->args.size(), 8U); + os << "nvcuda::wmma::bmma_sync("; + for (int i = 0; i < 4; ++i) { + this->PrintExpr(op->args[i * 2], os); + os << "["; + this->PrintExpr(op->args[i * 2 + 1], os); + os << "]" << ((i < 3) ? 
", ": ")"); + } } else { CodeGenC::VisitExpr_(op, os); } @@ -410,8 +450,12 @@ void CodeGenCUDA::VisitStmt_(const AllocateNode* op) { if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") { CHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Int(8) || - op->dtype == DataType::UInt(8)) - << "Matrix_a and matrix_b only support half or char or unsigned char type for now"; + op->dtype == DataType::UInt(8) || + op->dtype == DataType::Int(4) || + op->dtype == DataType::UInt(4) || + op->dtype == DataType::Int(1)) + << "Matrix_a and matrix_b only support half or char or unsigned char " + << "or uint4 or int4 or int1 type for now"; } else { CHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Float(32) || @@ -425,6 +469,11 @@ void CodeGenCUDA::VisitStmt_(const AllocateNode* op) { stream << ' '; PrintType(op->dtype, stream); } + if ((op->dtype == DataType::Int(4) || + op->dtype == DataType::UInt(4) || + op->dtype == DataType::Int(1)) && scope == "shared") { + constant_size = constant_size / (32 / op->dtype.bits()); + } stream << ' '<< vid << '[' << constant_size << "];\n"; } @@ -552,6 +601,24 @@ void CodeGenCUDA::PrintWmmaScope(const std::string &scope, DataType t, std::stringstream type; PrintType(t, type); std::string shape_str = fragment_shapes[variable]; + if ((t.is_int() || t.is_uint()) && t.bits() < 8 && t.lanes() == 1) { + type.str(std::string()); + if (t.is_int()) { + if (t.bits() == 4) { + type << "nvcuda::wmma::experimental::precision::s4"; + } else if (t.bits() == 1) { + type << "nvcuda::wmma::experimental::precision::b1"; + } else { + LOG(FATAL) << "Unhandled interger type for wmma fragment!"; + } + } else if (t.is_uint()) { + if (t.bits() == 4) { + type << "nvcuda::wmma::experimental::precision::u4"; + } else { + LOG(FATAL) << "Unhandled interger type for wmma fragment!"; + } + } + } if (scope == "wmma.matrix_a") { need_mma_h_ = true; std::string layout_str = fragment_layouts[variable]; diff --git a/src/tir/pass/arg_binder.cc b/src/tir/pass/arg_binder.cc index bd35c768e6ac..30542eaf6c1c 100644 --- a/src/tir/pass/arg_binder.cc +++ b/src/tir/pass/arg_binder.cc @@ -184,7 +184,11 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, IntImm(DataType::UInt(8), dtype.bits()) && TVMArrayGet(DataType::UInt(16), handle, intrinsic::kArrTypeLanes) == IntImm(DataType::UInt(16), dtype.lanes())); - asserts_.emplace_back(AssertStmtNode::make(cond, type_err_msg.str(), nop)); + if (!(dtype == DataType::Int(4) || + dtype == DataType::UInt(4) || + dtype == DataType::Int(1))) { + asserts_.emplace_back(AssertStmtNode::make(cond, type_err_msg.str(), nop)); + } // data field if (Bind_(buffer->data, TVMArrayGet(DataType::Handle(), handle, intrinsic::kArrData), arg_name + ".data", true)) { @@ -201,6 +205,11 @@ void ArgBinder::BindDLTensor(const Buffer& buffer, init_nest_.emplace_back(LetStmtNode::make( v_shape, TVMArrayGet(DataType::Handle(), handle, intrinsic::kArrShape), nop)); for (size_t k = 0; k < buffer->shape.size(); ++k) { + if (dtype == DataType::Int(4) || + dtype == DataType::UInt(4) || + dtype == DataType::Int(1)) { + break; + } std::ostringstream field_name; field_name << v_shape->name_hint << '[' << k << ']'; Bind_(buffer->shape[k], diff --git a/src/tir/pass/infer_fragment.cc b/src/tir/pass/infer_fragment.cc index 0cb1b9686cbd..608945a7a68a 100644 --- a/src/tir/pass/infer_fragment.cc +++ b/src/tir/pass/infer_fragment.cc @@ -138,7 +138,8 @@ class FragmentChecker : public StmtExprVisitor { void VisitExpr_(const CallNode* op) final { StmtExprVisitor::VisitExpr_(op); // Check 
shape when calling tvm_mma_sync - if (op->is_intrinsic(intrinsic::tvm_mma_sync)) { + if (op->is_intrinsic(intrinsic::tvm_mma_sync) || + op->is_intrinsic(intrinsic::tvm_bmma_sync)) { CHECK_EQ(op->args.size(), 8U); const VarNode* buffer_var_d = op->args[0].as(); const VarNode* buffer_var_a = op->args[2].as(); diff --git a/src/tir/pass/tensor_core.cc b/src/tir/pass/tensor_core.cc index 6a5e01585ffb..88f749646d52 100644 --- a/src/tir/pass/tensor_core.cc +++ b/src/tir/pass/tensor_core.cc @@ -199,7 +199,11 @@ class MMAMatcher: public StmtVisitor { BufferInfo buffer_a; if (!check_local_buffer_(load_a, &buffer_a) || !(buffer_a.dtype == DataType::Float(16) || - buffer_a.dtype == DataType::Int(8))) { + buffer_a.dtype == DataType::Int(8) || + buffer_a.dtype == DataType::UInt(8) || + buffer_a.dtype == DataType::Int(4) || + buffer_a.dtype == DataType::UInt(4) || + buffer_a.dtype == DataType::Int(1))) { return false; } @@ -208,7 +212,11 @@ class MMAMatcher: public StmtVisitor { BufferInfo buffer_b; if (!check_local_buffer_(load_b, &buffer_b) || !(buffer_b.dtype == DataType::Float(16) || - buffer_b.dtype == DataType::Int(8))) { + buffer_b.dtype == DataType::Int(8) || + buffer_b.dtype == DataType::UInt(8) || + buffer_b.dtype == DataType::Int(4) || + buffer_b.dtype == DataType::UInt(4) || + buffer_b.dtype == DataType::Int(1))) { return false; } @@ -736,6 +744,17 @@ class BufferAnalyser : public StmtExprVisitor { warp_tile_.k == 16) { return true; } + if (warp_tile_.m == 8 && + warp_tile_.n == 8 && + warp_tile_.k == 32) { + return true; + } + if (warp_tile_.m == 8 && + warp_tile_.n == 8 && + warp_tile_.k == 128) { + return true; + } + return false; } @@ -869,18 +888,29 @@ class TensorCoreIRMutator : public StmtExprMutator { ObjectPtr buffer_node_c = make_object(); auto mma_sync_call = - [&buffer_node_a, &buffer_node_b] + [&buffer_node_a, &buffer_node_b, &ca, &cb] (const Buffer &buffer) { Buffer buffer_a(buffer_node_a); Buffer buffer_b(buffer_node_b); - return EvaluateNode::make( - CallNode::make(DataType::Handle(), - intrinsic::tvm_mma_sync, - {buffer->data, buffer->elem_offset, - buffer_a->data, buffer_a->elem_offset, - buffer_b->data, buffer_b->elem_offset, - buffer->data, buffer->elem_offset}, - CallNode::Intrinsic)); + if (ca->dtype == DataType::Int(1) && cb->dtype == DataType::Int(1)) { + return EvaluateNode::make( + CallNode::make(DataType::Handle(), + intrinsic::tvm_bmma_sync, + {buffer->data, buffer->elem_offset, + buffer_a->data, buffer_a->elem_offset, + buffer_b->data, buffer_b->elem_offset, + buffer->data, buffer->elem_offset}, + CallNode::Intrinsic)); + } else { + return EvaluateNode::make( + CallNode::make(DataType::Handle(), + intrinsic::tvm_mma_sync, + {buffer->data, buffer->elem_offset, + buffer_a->data, buffer_a->elem_offset, + buffer_b->data, buffer_b->elem_offset, + buffer->data, buffer->elem_offset}, + CallNode::Intrinsic)); + } }; auto call_add_c = diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py index eb6501ba4f3d..f7cdae227b75 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -56,6 +56,8 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'): out_type = 'float' elif dtype == 'int8': out_type = 'int' + elif dtype == 'int4' or dtype == 'int1': + out_type = 'int' if (layout == 'NN'): return tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k)) if (layout == 'NT'): @@ -123,6 +125,12 @@ def test_gemm(N, L, M,
dtype, layout): if dtype == 'int8': factor = 32 offset = 16 + elif dtype == 'int4': + factor = 64 + offset = 32 + elif dtype == 'int1': + factor = 256 + offset = 128 # create cache stages AA = s.cache_read(A, "shared", [C]) @@ -139,9 +147,9 @@ def test_gemm(N, L, M, dtype, layout): cfg = autotvm.get_config() cfg.define_knob("bx", [2, 4, 8]) - cfg.define_knob("by", [16, 32, 64]) - cfg.define_knob("step_k", [8, 16, 32]) - cfg.define_knob("v", [4, 8]) + cfg.define_knob("by", [8, 16, 32, 64]) + cfg.define_knob("step_k", [1, 2, 4, 8, 16, 32]) + cfg.define_knob("v", [4, 8, 16, 32]) by = cfg['by'].val bx = cfg['bx'].val step_k = cfg['step_k'].val @@ -150,9 +158,17 @@ def test_gemm(N, L, M, dtype, layout): # thread tile TX = 8 TY = 1 + if dtype == 'int4' or dtype == 'int1': + TX = 2 # warp tile warp_tile_m = 16 # it could also be 8 or 32 on CUDA version >= 10.0 - warp_tile_k = 16 # it must be 16 + warp_tile_k = 16 # it must be 16 for fp16/int8 data type + if dtype == 'int4': + warp_tile_m = 8 + warp_tile_k = 32 + elif dtype == 'int1': + warp_tile_m = 8 + warp_tile_k = 128 # block tile tile_x = bx * TX tile_y = by * TY @@ -219,6 +235,10 @@ def test_gemm(N, L, M, dtype, layout): # and run the kernel to compare with numpy to check whether the results are correct. # check whether the gpu has tensorcore +if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): + print("skip because cuda is not enabled..") + sys.exit(0) + ctx = tvm.gpu() if not nvcc.have_tensorcore(ctx.compute_version): print('the gpu has no tensorcore, skipping...') @@ -234,6 +254,15 @@ def test_gemm(N, L, M, dtype, layout): if len(sys.argv) >= 6: layout = sys.argv[5] +# check whether the current gpu arch supports the current dtype's wmma codegen +cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4) +major, minor = nvcc.parse_compute_version(cuda_compute_capability) +if dtype == 'int8': + assert(major == 7 and minor >= 2) +elif dtype == 'int4' or dtype == 'int1': + # int4/int1 only support layout TN + assert(major == 7 and minor == 5 and layout == 'TN') + def tune_and_evaluate(M, N, L, dtype, layout): task = autotvm.task.create(test_gemm, args=(N, L, M, dtype, layout), target='cuda') print(task.config_space) @@ -305,6 +334,42 @@ def tune_and_evaluate(M, N, L, dtype, layout): c_np = np.dot(a_np.astype(np.int32), b_np.astype(np.int32).T) elif (layout == "TT"): c_np = np.dot(a_np.astype(np.int32).T, b_np.astype(np.int32).T) + elif dtype == 'int4': + c_np_type = np.int32 + a_np_int = np.random.randint(low=-8, high=8, size=shape_a).astype(np.int32) + b_np_int = np.random.randint(low=-8, high=8, size=shape_b).astype(np.int32) + # "TN" + c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T) + a_np = np.zeros(shape=(N, int(L/8)), dtype = np.int32) + b_np = np.zeros(shape=(M, int(L/8)), dtype = np.int32) + # a_np --> col_major + for i in range(N): + for j in range(int(L/8)): + for k in range(8): + a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4)) + + # b_np --> row_major + for i in range(M): + for j in range(int(L/8)): + for k in range(8): + b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 8 + k] & 0xf) << ((7 - k) * 4)) + elif dtype == 'int1': + c_np_type = np.int32 + a_np_int = np.random.randint(low=0, high=1, size=shape_a).astype(np.int32) + b_np_int = np.random.randint(low=0, high=1, size=shape_b).astype(np.int32) + # "TN" + c_np = np.dot(a_np_int.astype(np.int32), b_np_int.astype(np.int32).T) + a_np = np.zeros(shape=(N, int(L/32)), dtype = np.int32) + b_np = np.zeros(shape=(M, 
int(L/32)), dtype = np.int32) + for i in range(N): + for j in range(int(L/32)): + for k in range(32): + a_np[i, j] = a_np[i, j] | ((a_np_int[i, j * 32 + k] & 0x1) << (31 - k)) + + for i in range(M): + for j in range(int(L/32)): + for k in range(32): + b_np[i, j] = b_np[i, j] | ((b_np_int[i, j * 32 + k] & 0x1) << (31 - k)) c_tvm = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np_type), ctx=ctx) a_tvm = tvm.nd.array(a_np, ctx=ctx)
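For reference, a standalone NumPy sketch (hedged; not part of the patch) of the packing the loops above perform for int4: eight 4-bit lanes per int32 word, lane k occupying bits (31-4k) down to (28-4k). The int1 path is the same idea with 32 one-bit lanes per word, which is why it masks with 0x1.

import numpy as np

def pack_int4_rows(x):
    """Pack an (R, C) array of int4 values (C divisible by 8) into
    (R, C // 8) int32 words, lane 0 in the top nibble, like the loops above."""
    r, c = x.shape
    assert c % 8 == 0
    lanes = (x.astype(np.int64) & 0xF).reshape(r, c // 8, 8)
    shifts = np.arange(7, -1, -1, dtype=np.int64) * 4  # (7 - k) * 4 for k = 0..7
    packed = (lanes << shifts).sum(axis=2)
    return packed.astype(np.uint32).view(np.int32)

# Quick self-check against the loop formulation:
a = np.random.randint(-8, 8, size=(4, 16)).astype(np.int32)
ref = np.zeros((4, 2), dtype=np.int32)
for i in range(4):
    for j in range(2):
        for k in range(8):
            ref[i, j] = ref[i, j] | ((a[i, j * 8 + k] & 0xF) << ((7 - k) * 4))
assert (pack_int4_rows(a) == ref).all()

From 0e189f01146c004f8a55840bb6c36d4449a25c34 Mon Sep 17 00:00:00 2001 From: Ina Dobreva <55383260+inadob@users.noreply.github.com> Date: Fri, 21 Feb 2020 04:10:45 +0000 Subject: [PATCH 27/73] Fix tests for tflite unary elemwise operations (#4913) * add TFLite version check for 'ceil' and 'cos' * fix name check of test_op for positive inputs * add error message for operator not found in the installed fbs schema --- python/tvm/relay/frontend/tflite.py | 6 +++++- tests/python/frontend/tflite/test_forward.py | 15 ++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index e92e4cef205d..dd3587125aec 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -166,7 +166,11 @@ def get_op_code_str(self, op): op_code_list_idx = op.OpcodeIndex() op_code_id = self.model.OperatorCodes(op_code_list_idx).BuiltinCode() - op_code_str = self.builtin_op_code[op_code_id] + try: + op_code_str = self.builtin_op_code[op_code_id] + except KeyError: + raise NotImplementedError('TFLite operator with code ' + str(op_code_id) + \ + ' is not supported by this version of the fbs schema.') if op_code_id == BuiltinOperator.CUSTOM: # Custom operator custom_op_code_str = self.model.OperatorCodes(op_code_list_idx).CustomCode() diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index e88226c1b125..427d4bfe2810 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -669,7 +669,7 @@ def _test_unary_elemwise(math_op, data): with tf.Graph().as_default(): in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name='in') out = math_op(in_data) - compare_tflite_with_tvm(data, ['in:0'], in_data, [out]) + compare_tflite_with_tvm(data, ['in:0'], [in_data], [out]) ####################################################################### # Abs @@ -745,23 +745,24 @@ def _test_neg(data): def _test_forward_unary_elemwise(test_op): # functions that need positive input - if test_op in {'_test_log', '_test_sqrt', '_test_rsqrt'}: - test_op(np.arange(6.0, dtype=np.float32).reshape((2, 1, 3))) - test_op(np.arange(6.0, dtype=np.int32).reshape((2, 1, 3))) + if test_op.__name__ in {'_test_log', '_test_sqrt', '_test_rsqrt'}: + test_op(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3))) else: - np.array(np.random.uniform(-5, 5, (3, 1)), dtype=np.int32) + test_op(np.random.uniform(-10, 10, (3, 2)).astype(np.float32)) def test_all_unary_elemwise(): _test_forward_unary_elemwise(_test_abs) - _test_forward_unary_elemwise(_test_ceil) _test_forward_unary_elemwise(_test_floor) _test_forward_unary_elemwise(_test_exp) _test_forward_unary_elemwise(_test_log) _test_forward_unary_elemwise(_test_sin) - _test_forward_unary_elemwise(_test_cos) _test_forward_unary_elemwise(_test_sqrt) _test_forward_unary_elemwise(_test_rsqrt) _test_forward_unary_elemwise(_test_neg) + # ceil and cos come with TFLite 1.14.0.post1 fbs schema + if package_version.parse(tf.VERSION) >= 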
package_version.parse('1.14.0'): + _test_forward_unary_elemwise(_test_ceil) + _test_forward_unary_elemwise(_test_cos) ####################################################################### # Element-wise From f47c38db8392378fc863ed4621f9ec632668ff61 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 20 Feb 2020 21:56:26 -0800 Subject: [PATCH 28/73] [COMMUNITY] @anijain2305 -> Committer (#4921) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 11e3cda7bf05..1f7ec33f1b0a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -45,6 +45,7 @@ We do encourage everyone to work on anything they are interested in. - [Zhi Chen](https://github.com/zhiics): @zhiics - relay, quantization, pass manager - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends - [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust +- [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay - [Ziheng Jiang](https://github.com/ZihengJiang) (PPMC): @ZihengJiang - relay, compiler - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay - [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi From 8290eabafb22a8a54249165ce09dd31ff1f31c7c Mon Sep 17 00:00:00 2001 From: Neo Chien Date: Sat, 22 Feb 2020 05:03:41 +0800 Subject: [PATCH 29/73] [TEST][FLAKY] topi/tests/python/test_topi_sort.py::test_argsort (#4891) * [TEST][FLAKY] topi/tests/python/test_topi_sort.py::test_argsort * update test function of argsort like topk * Shuffle index and get data from shuffled index * Replace the random.uniform with np.arange --- topi/tests/python/test_topi_sort.py | 38 +++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 7 deletions(-)
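The root cause, illustrated standalone (plain NumPy, independent of the patch): with duplicated keys, any tie order is a correct sort, so comparing raw argsort indices between np.argsort and a device kernel can legitimately differ. A shuffled arange has no ties, so there is exactly one correct answer.

import numpy as np

x = np.zeros(8, dtype="float32")        # all keys equal: every permutation is
print(np.argsort(x))                    # a valid argsort, so index-level
print(np.argsort(x, kind="heapsort"))   # comparisons across sorters can flake

perm = np.arange(8, dtype="float32")    # distinct keys: the argsort result is
np.random.shuffle(perm)                 # unique, so exact index comparison is
a = np.argsort(perm)                    # deterministic
b = np.argsort(perm, kind="heapsort")
assert np.array_equal(a, b)

diff --git a/topi/tests/python/test_topi_sort.py b/topi/tests/python/test_topi_sort.py index c084a7c431b6..0ad4e987d17d 100644 --- a/topi/tests/python/test_topi_sort.py +++ b/topi/tests/python/test_topi_sort.py @@ -21,11 +21,26 @@ import topi import topi.testing -def test_argsort(): + +def verify_argsort(axis, is_ascend): dshape = (20, 100) - data = tvm.placeholder(dshape, name="data", dtype="float32") - np_data = np.random.rand(dshape[0], dshape[1]).astype(data.dtype) - np_result = np.argsort(-np_data) + data_dtype = "float32" + data = tvm.placeholder(dshape, name="data", dtype=data_dtype) + + perm = np.arange(dshape[0] * dshape[1], dtype=data_dtype) + np.random.shuffle(perm) + np_data = perm.reshape(dshape) + + if is_ascend: + np_indices = np.argsort(np_data, axis=axis) + else: + np_indices = np.argsort(-np_data, axis=axis) + + if axis == 0: + np_indices = np_indices[:dshape[axis], :] + else: + np_indices = np_indices[:, :dshape[axis]] + def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: @@ -33,18 +48,19 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.argsort(data, axis=-1, is_ascend=False) + out = topi.argsort(data, axis=axis, is_ascend=is_ascend) s = topi.generic.schedule_argsort(out) tvm_data = tvm.nd.array(np_data, ctx) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype="float32"), ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx) f = tvm.build(s, [data, out], device) f(tvm_data, tvm_out) - tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result.astype("float32"), rtol=1e0) + tvm.testing.assert_allclose(tvm_out.asnumpy(), np_indices.astype(data_dtype), rtol=1e0) for device in ['llvm', 'cuda', 'opencl']: check_device(device) + def verify_topk(k, 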
axis, ret_type, is_ascend, dtype): shape = (20, 100) data_dtype = "float32" @@ -95,6 +111,14 @@ def check_device(device): for device in ['llvm', 'cuda', 'opencl']: check_device(device) + +def test_argsort(): + np.random.seed(0) + for axis in [0, -1, 1]: + verify_argsort(axis, True) + verify_argsort(axis, False) + + def test_topk(): np.random.seed(0) for k in [0, 1, 5]: From c4c61cb766608fb2f0fd8c9facc480a43afed3f5 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Fri, 21 Feb 2020 14:31:04 -0800 Subject: [PATCH 30/73] [Fix] Fix get_valid_count flaky test for cuda (#4901) * get_valid_count accuracy issue fixed for individual tests but not for all tests running together * minor fix * initialize valid_count and PrefixSum buffers * test updated * update relay test as well * update document * fix lint * address comment * fix lint * correct atomicAdd identifier name --- tests/python/relay/test_op_level5.py | 2 - topi/python/topi/cuda/nms.py | 415 ++++++++++---------------- topi/tests/python/test_topi_vision.py | 4 +- 3 files changed, 166 insertions(+), 255 deletions(-)
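Stated outside TVM IR, a rough NumPy model (hedged; defaults and names illustrative, not the kernel itself) of what the rewritten kernels below compute: flag valid boxes and bump a per-batch counter (the atomicAdd step), turn flags into exclusive prefix sums that give each valid box its destination slot, then scatter valid boxes to the top and pad the rest with -1.

import numpy as np

def get_valid_counts_ref(data, score_threshold, id_index=0, score_index=1):
    """NumPy model of the flag -> atomic count -> prefix sum -> rewrite pipeline."""
    batch_size, num_anchors, elem_length = data.shape
    flag = data[:, :, score_index] > score_threshold
    if id_index >= 0:
        flag &= data[:, :, id_index] >= 0
    valid_count = flag.sum(axis=1).astype("int32")  # total per batch (atomicAdd)
    prefix_sum = np.cumsum(flag, axis=1) - flag     # exclusive scan: dest slots
    out = np.full_like(data, -1.0)
    for i in range(batch_size):
        out[i, prefix_sum[i, flag[i]], :] = data[i, flag[i], :]
    return valid_count, out

data = np.random.uniform(-1, 1, (2, 6, 6)).astype("float32")
vc, out = get_valid_counts_ref(data, 0.0, id_index=-1)
assert (vc == (data[:, :, 1] > 0.0).sum(axis=1)).all()

diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 03e700b3df83..e622a8ae01ab 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -221,8 +221,6 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): func = relay.Function([x], z.astuple()) func = run_infer_type(func) for target, ctx in ctx_list(): - if target == 'cuda': - return intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 38f87a9523c8..5485859de01f 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -21,29 +21,46 @@ import tvm from tvm import api -from tvm.generic import cast -from tvm.intrin import if_then_else, log, power +from tvm.intrin import if_then_else from topi.vision import non_max_suppression, get_valid_counts from .sort import argsort from .. import tag -def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index): - """Low level IR to Prepare get valid count of bounding boxes - given a score threshold. Also moves valid boxes to the +def cuda_atomic_add_rule(op): + if op.dtype == "float32": + return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) + if op.dtype == "float64": + return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) + if op.dtype == "int32": + return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) + raise RuntimeError("only support int32, float32 and float64") + + +tvm.target.intrin.register_intrin_rule( + "cuda", "atomic_add", cuda_atomic_add_rule, override=True) + + +def atomic_add(x, y): + return tvm.call_pure_intrin(y.dtype, "atomic_add", x, y) + + +def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, score_index): + """Low level IR to get valid count of bounding boxes + given a score threshold. Also prepares to move valid boxes to the top of input data. Parameters ---------- data : Buffer Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length]. + + valid_count : Buffer + 1D buffer for valid number of boxes with shape [batch_size, ]. 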
flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - score_threshold : float32 Lower limit of score for valid bounding boxes. @@ -60,18 +77,24 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index """ batch_size = data.shape[0] num_anchors = data.shape[1] - box_data_length = data.shape[2] + elem_length = data.shape[2] ib = tvm.ir_builder.create() data = ib.buffer_ptr(data) + + valid_count = ib.buffer_ptr(valid_count) flag = ib.buffer_ptr(flag) - idx = ib.buffer_ptr(idx) - score_threshold = tvm.make.node("FloatImm", dtype="float32", value=score_threshold) + atomic_add_return = ib.allocate( + valid_count.dtype, (1,), name='atomic_add_return', scope='local') + one_count = tvm.const(1, dtype=valid_count.dtype) + score_threshold = tvm.make.node( + "FloatImm", dtype="float32", value=score_threshold) id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = int(tvm.target.Target.current( + allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") @@ -79,163 +102,52 @@ def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx + idxd = tvm.indexdiv + # initialize valid_count + with ib.if_scope(tid < batch_size): + valid_count[tid] = 0 + # initialize flag with ib.if_scope(tid < batch_size * num_anchors): - with ib.if_scope(tvm.all(data[tid * box_data_length + score_index] > score_threshold, \ - tvm.any(id_index < 0, data[tid * box_data_length + id_index] >= 0))): + flag[tid] = 0 + with ib.if_scope(tid < batch_size * num_anchors): + i = idxd(tid, num_anchors) + with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > score_threshold, + tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): flag[tid] = 1 - idx[tid] = 1 - with ib.else_scope(): - flag[tid] = 0 - idx[tid] = 0 + atomic_add_return[0] = atomic_add(tvm.call_pure_intrin("handle", "tvm_address_of", + valid_count[i]), one_count) return ib.get() -def get_valid_counts_upsweep(data, idx_in, idx, partial): - """Low level IR of first step of scan: unsweep. - - Parameters - ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. - - idx_in : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - partial : Buffer - 2D Buffer of valid data indices with shape [batch_size, new_range]. - - Returns - ------- - stmt : Stmt - The result IR statement. 
- """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - ib = tvm.ir_builder.create() - data = ib.buffer_ptr(data) - idx_in = ib.buffer_ptr(idx_in) - idx = ib.buffer_ptr(idx) - partial = ib.buffer_ptr(partial) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 - nthread_tx = max_threads - nthread_bx = batch_size - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - new_range = num_anchors // elem_per_thread + 1 - # Scan: Upsweep: - with ib.if_scope(tvm.all(bx < batch_size, tx < new_range)): - with ib.for_range(0, elem_per_thread) as i: - with ib.if_scope(bx * num_anchors + \ - tx * elem_per_thread + i < batch_size * num_anchors): - with ib.if_scope(i == 0): - partial[bx * new_range + tx] = idx_in[bx * num_anchors + tx * elem_per_thread] - idx[bx * num_anchors + tx * elem_per_thread] = \ - idx_in[bx * num_anchors + tx * elem_per_thread] - with ib.else_scope(): - partial[bx * new_range + tx] += \ - idx_in[bx * num_anchors + tx * elem_per_thread + i] - idx[bx * num_anchors + tx * elem_per_thread + i] = \ - idx[bx * num_anchors + tx * elem_per_thread + i - 1] + \ - idx_in[bx * num_anchors + tx * elem_per_thread + i] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) - return ib.get() -def get_valid_counts_scan(data, partial_in, partial): - """Low level IR to do scan. +def flag_scan(flag, prefix_sum): + """Low level IR to calculate correct positions for valid boxes. Parameters ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. - - idx_in : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. - - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + flag : Buffer + 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - partial : Buffer - 2D Buffer of valid data indices with shape [batch_size, new_range]. + prefix_sum : Buffer + 2D Buffer of prefix sum of flags indicating new locations of valid boxes + with same shape as flag. Returns ------- stmt : Stmt The result IR statement. 
""" - batch_size = data.shape[0] - num_anchors = data.shape[1] - ib = tvm.ir_builder.create() - partial_in = ib.buffer_ptr(partial_in) - partial = ib.buffer_ptr(partial) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 - nthread_tx = max_threads - nthread_bx = batch_size - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - var = tvm.make.node("FloatImm", dtype="float32", value=2) - new_range = num_anchors // elem_per_thread + 1 - iteration = cast(log(cast(new_range, "float32")) / math.log(2), "int32") - # Scan: Kogge-Stone adder - with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))): - with ib.for_range(0, iteration) as k: - with ib.if_scope(k == 0): - with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))): - partial[bx * new_range + tx] = \ - partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1] - with ib.else_scope(): - partial[bx * new_range] = partial_in[bx * new_range] - with ib.else_scope(): - with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \ - tx < tvm.min(new_range, num_anchors))): - partial[bx * new_range + tx] += \ - partial[bx * new_range + tx - cast(power(var, k), "int32")] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) - return ib.get() - -def get_valid_counts_downsweep(data, idx_in, partial, idx): - """Low level IR to do downsweep of scan. - - Parameters - ---------- - data: Buffer - 3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms. - - idx_in : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + batch_size = flag.shape[0] + num_anchors = flag.shape[1] - partial : Buffer - 2D Buffer of valid data indices with shape [batch_size, new_range]. + ib = tvm.ir_builder.create() - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + flag = ib.buffer_ptr(flag) + prefix_sum = ib.buffer_ptr(prefix_sum) - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - ib = tvm.ir_builder.create() - idx_in = ib.buffer_ptr(idx_in) - idx = ib.buffer_ptr(idx) - partial = ib.buffer_ptr(partial) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 + max_threads = int(tvm.target.Target.current( + allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") @@ -243,23 +155,23 @@ def get_valid_counts_downsweep(data, idx_in, partial, idx): ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - new_range = num_anchors // elem_per_thread + 1 idxd = tvm.indexdiv idxm = tvm.indexmod - # Scan: Downsweep: - with ib. 
if_scope(tid < batch_size * num_anchors): - i = idxd(tid, num_anchors) # number of batches - j = idxm(tid, num_anchors) # number of anchors - with ib.if_scope(j < elem_per_thread): - idx[tid] = idx_in[tid] - with ib.else_scope(): - idx[tid] = idx_in[tid] + partial[i * new_range + idxd(j, elem_per_thread) - 1] + + # initialize prefix_sum + with ib.if_scope(tid < batch_size * num_anchors): + prefix_sum[tid] = 0 + with ib.if_scope(tid < batch_size * num_anchors): + i = idxd(tid, num_anchors) + j = idxm(tid, num_anchors) + with ib.for_range(0, j) as r: + prefix_sum[tid] += flag[i * num_anchors + r] return ib.get() -def get_valid_counts_ir(data, flag, idx, valid_count, out): - """Low level IR to get valid count of bounding boxes - given a score threshold. Also moves valid boxes to the + +def out_rewrite(data, flag, prefix_sum, valid_count, out): + """Low level IR to move valid boxes to the top of input data. Parameters @@ -270,11 +182,12 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out): flag : Buffer 2D Buffer of flag indicating valid data with shape [batch_size, num_anchors]. - idx : Buffer - 2D Buffer of valid data indices with shape [batch_size, num_anchors]. + prefix_sum : Buffer + 2D Buffer of prefix sum of flags indicating new locations of valid boxes + with same shape as flag. valid_count : Buffer - 1-D buffer for valid number of boxes. + 1D buffer for valid number of boxes with shape [batch_size, ]. out : Buffer Rearranged data buffer. @@ -284,28 +197,28 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out): stmt : Stmt The result IR statement. """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - elem_length = data.shape[2] - size = batch_size * num_anchors * elem_length + batch_size = out.shape[0] + num_anchors = out.shape[1] + elem_length = out.shape[2] ib = tvm.ir_builder.create() + one = tvm.const(1, dtype=out.dtype) data = ib.buffer_ptr(data) flag = ib.buffer_ptr(flag) - idx = ib.buffer_ptr(idx) valid_count = ib.buffer_ptr(valid_count) + prefix_sum = ib.buffer_ptr(prefix_sum) out = ib.buffer_ptr(out) - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + max_threads = int(tvm.target.Target.current( + allow_none=False).max_num_threads) nthread_tx = max_threads - nthread_bx = batch_size * num_anchors * elem_length // max_threads + 1 + nthread_bx = batch_size * num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv idxm = tvm.indexmod @@ -313,17 +226,15 @@ def get_valid_counts_ir(data, flag, idx, valid_count, out): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length - with ib.if_scope(flag[tid] > 0): + with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, + prefix_sum[tid] < num_anchors)): + with ib.for_range(0, elem_length) as k: + out[base_idx + prefix_sum[tid] * elem_length + + k] = data[tid * elem_length + k] + with ib.if_scope(j >= valid_count[i]): with ib.for_range(0, elem_length) as k: - with ib.if_scope(base_idx + (idx[tid] - 1) * elem_length + k < size): - out[base_idx + (idx[tid] - 1) * elem_length + k] =\ - data[base_idx + j * elem_length + k] - with ib.if_scope(j == 0): - valid_count[i] = idx[tid + num_anchors - 1] - with ib.if_scope(j >= idx[i * num_anchors + num_anchors - 1]): - with ib.for_range(0, elem_length) as l: - with ib.if_scope(tid * elem_length + l < size): - 
out[tid * elem_length + l] = -1.0 + out[tid * elem_length + k] = -one + return ib.get() @@ -356,56 +267,47 @@ def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1): """ batch_size = data.shape[0] num_anchors = data.shape[1] - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - elem_per_thread = num_anchors // max_threads + 1 - new_range = num_anchors // elem_per_thread + 1 + data_buf = api.decl_buffer( + data.shape, data.dtype, "data_buf", data_alignment=8) + valid_count_buf = api.decl_buffer( + (batch_size,), "int32", "valid_count_buf", data_alignment=8) temp_flag_buf = api.decl_buffer( (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8) - temp_idx_buf = api.decl_buffer( - (batch_size, num_anchors,), "int32", "temp_idx", data_alignment=8) temp_partial_buf = api.decl_buffer( - (batch_size, new_range), "int32", "temp_partial", data_alignment=8) - data_buf = api.decl_buffer( - data.shape, data.dtype, "data_buf", data_alignment=8) + (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8) + out_buf = api.decl_buffer( + data.shape, data.dtype, "out_buf", data_alignment=8) - temp_flag, temp_idx = \ - tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)], [data], - lambda ins, outs: get_valid_counts_pre( - ins[0], outs[0], outs[1], score_threshold, id_index, score_index), - dtype=["int32", "int32"], - out_buffers=[temp_flag_buf, temp_idx_buf], - name="get_valid_counts_phase_one") - temp_idx_new, temp_partial = \ - tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)], [data, temp_idx], - lambda ins, outs: get_valid_counts_upsweep( - ins[0], ins[1], outs[0], outs[1]), - dtype=["int32", "int32"], - out_buffers=[temp_idx_buf, temp_partial_buf], - name="get_valid_counts_phase_two") - temp_partial_new = \ - tvm.extern([(batch_size, new_range)], [data, temp_partial], - lambda ins, outs: get_valid_counts_scan( - ins[0], ins[1], outs[0]), - dtype=["int32"], - out_buffers=[temp_partial_buf], - name="get_valid_counts_phase_three") - temp_idx_final = \ - tvm.extern([(batch_size, num_anchors)], [data, temp_idx_new, temp_partial_new], - lambda ins, outs: get_valid_counts_downsweep( - ins[0], ins[1], ins[2], outs[0]), - dtype=["int32"], - out_buffers=[temp_idx_buf], - name="get_valid_counts_phase_four") - valid_count, out_tensor = \ - tvm.extern([(batch_size,), data.shape], [data, temp_flag, temp_idx_final], - lambda ins, outs: get_valid_counts_ir( - ins[0], ins[1], ins[2], outs[0], outs[1]), - dtype=["int32", data.dtype], - in_buffers=[data_buf, temp_flag_buf, temp_idx_buf], - name="get_valid_counts_phase_five", + valid_count, temp_flag = \ + tvm.extern([(batch_size,), (batch_size, num_anchors)], [data], + lambda ins, outs: get_valid_counts_ir( + ins[0], outs[0], outs[1], score_threshold, id_index, score_index), + dtype=["int32", "int32"], + in_buffers=[data_buf], + out_buffers=[valid_count_buf, temp_flag_buf], + name="get_valid_counts", tag="get_valid_counts_gpu") - return [valid_count, out_tensor] + temp_partial = \ + tvm.extern([(batch_size, num_anchors)], [temp_flag], + lambda ins, outs: flag_scan( + ins[0], outs[0]), + dtype=["int32"], + in_buffers=[temp_flag_buf], + out_buffers=[temp_partial_buf], + name="flag_scan") + + out = \ + tvm.extern([data.shape], [data, temp_flag, temp_partial, valid_count], + lambda ins, outs: out_rewrite( + ins[0], ins[1], ins[2], ins[3], outs[0]), + dtype=[data.dtype], + in_buffers=[data_buf, temp_flag_buf, + temp_partial_buf, valid_count_buf], + out_buffers=[out_buf], 
+ name="out_rewrite") + + return [valid_count, out] def nms_ir(data, sorted_index, valid_count, out, box_indices, @@ -479,7 +381,8 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) - num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local") + num_valid_boxes = ib.allocate( + "int32", (1,), name="num_valid_boxes", scope="local") max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) @@ -491,26 +394,29 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): ib.scope_attr(bx, "thread_extent", nthread_bx) j = bx * max_threads + tx - iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold) + iou_threshold = tvm.make.node( + "FloatImm", dtype="float32", value=iou_threshold) top_k = tvm.make.node("IntImm", dtype="int32", value=top_k) coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start) id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) - force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) + force_suppress = tvm.make.node( + "IntImm", dtype="int32", value=1 if force_suppress else 0) with ib.for_range(0, batch_size, for_type="unroll") as i: base_idx = i * num_anchors * box_data_length with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output - nkeep = if_then_else( \ - tvm.all(top_k > 0, top_k < valid_count[i]), - top_k, valid_count[i]) + nkeep = if_then_else( + tvm.all(top_k > 0, top_k < valid_count[i]), + top_k, valid_count[i]) with ib.if_scope(j < nkeep): with ib.for_range(0, box_data_length) as k: out[(base_idx + j * box_data_length + k)] = \ - data[(base_idx + sorted_index[i * num_anchors + j] \ - * box_data_length + k)] - box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j] + data[(base_idx + sorted_index[i * num_anchors + j] + * box_data_length + k)] + box_indices[i * num_anchors + + j] = sorted_index[i * num_anchors + j] with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])): with ib.if_scope(j < valid_count[i] - nkeep): with ib.for_range(0, box_data_length) as k: @@ -519,16 +425,18 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): # Apply nms with ib.for_range(0, valid_count[i]) as k: offset_k = k * box_data_length - with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, \ - tvm.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0))): + with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, + tvm.any(id_index < 0, out[base_idx + + offset_k + id_index] >= 0))): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length - with ib.if_scope(tvm.all(j > k, \ - out[base_idx + offset_j + score_index] > 0, \ - tvm.any(id_index < 0, \ - out[base_idx + offset_j + id_index] >= 0), \ - tvm.any(force_suppress > 0, id_index < 0, \ - out[base_idx + offset_k + id_index] == \ + with ib.if_scope(tvm.all(j > k, + out[base_idx + offset_j + + score_index] > 0, + tvm.any(id_index < 0, + out[base_idx + offset_j + id_index] >= 0), + tvm.any(force_suppress > 0, id_index < 0, + out[base_idx + offset_k + id_index] == out[base_idx + offset_j + id_index]))): iou = calculate_overlap(out, base_idx + offset_j + coord_start, base_idx + offset_k + coord_start) @@ -541,12 +449,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.if_scope(j < valid_count[i]): offset_j = j * 
box_data_length with ib.for_range(0, box_data_length) as k: - out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k] + out[(base_idx + offset_j + k) + ] = data[base_idx + offset_j + k] box_indices[i * num_anchors + j] = j # Set invalid entry to be -1 with ib.if_scope(j < num_anchors - valid_count[i]): with ib.for_range(0, box_data_length) as k: - out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0 + out[base_idx + (j + valid_count[i]) * + box_data_length + k] = -1.0 box_indices[i * num_anchors + j + valid_count[i]] = -1 # Only return max_output_size number of valid boxes num_valid_boxes[0] = 0 @@ -671,7 +581,7 @@ def invalid_to_bottom_ir(data, flag, idx, out): with ib.if_scope(flag[i * num_anchors + j] > 0): with ib.for_range(0, elem_length) as k: out[base_idx + (idx[i * num_anchors + j] - 1) * elem_length + k] \ - = data[base_idx + j * elem_length + k] + = data[base_idx + j * elem_length + k] return ib.get() @@ -756,8 +666,10 @@ def non_max_suppression_gpu(data, valid_count, max_output_size=-1, "valid_count_buf", data_alignment=4) score_axis = score_index score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE) - sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) + score_tensor = tvm.compute( + score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE) + sort_tensor = argsort( + score_tensor, valid_count=valid_count, axis=1, is_ascend=False) sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8) @@ -795,7 +707,8 @@ def non_max_suppression_gpu(data, valid_count, max_output_size=-1, ins[0], outs[0], outs[1]), dtype=["int32", "int32"], in_buffers=[out_buf], - out_buffers=[temp_flag_buf, temp_idx_buf], + out_buffers=[ + temp_flag_buf, temp_idx_buf], name="invalid_to_bottom_phase_one") output = tvm.extern([data.shape], [out, temp_flag, temp_idx], diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index a081f0797dad..85e4180a0892 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -67,8 +67,8 @@ def check_device(device): tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) for device in ['llvm', 'cuda', 'opencl']: - # Disable gpu test for now - if device != "llvm": + # Disable opencl test for now + if device != "llvm" and device != "cuda": continue check_device(device)
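With the kernels above rewritten, the cuda path is re-enabled in the relay test. As a rough usage sketch (hedged: the relay API calls mirror the test code earlier in this patch, but the shapes and threshold here are illustrative), the op can be exercised end to end like this:

import numpy as np
import tvm
from tvm import relay

# Hedged sketch: run relay.vision.get_valid_counts on cuda now that the
# kernel result no longer depends on test-run ordering.
np_data = np.random.uniform(-1, 1, size=(1, 5, 6)).astype("float32")
x = relay.var("x", shape=(1, 5, 6))
z = relay.vision.get_valid_counts(x, score_threshold=0.0)
func = relay.Function([x], z.astuple())
intrp = relay.create_executor("debug", ctx=tvm.gpu(0), target="cuda")
out = intrp.evaluate(func)(np_data)
# out[0]: per-batch valid counts; out[1]: boxes moved to the top, -1 padded

From 623dd2087839b76bf7950f0759d5d8746497f2b7 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 24 Feb 2020 13:12:03 -0800 Subject: [PATCH 31/73] [Relay][AutoTVM] Relay op strategy (#4644) * relay op strategy fix lint bitpack strategy bitserial_dense (#6) * update strategy * address comments fix a few topi test Dense strategy (#5) * dense * add bifrost; remove comments * address comment Refactor x86 conv2d_NCHWc (#4) * Refactor x86 conv2d * Add x86 depthwise_conv2d_NCHWc * Add back topi x86 conv2d_nchw * Merge x86 conv2d_nchw and conv2d_NCHWc * Minor fix for x86 conv2d fix more strategy Add x86 conv2d_NCHWc_int8 strategy (#8) * Add x86 conv2d_NCHWc_int8 strategy * Remove contrib_conv2d_nchwc_int8 * Fix generic conv2d_NCHWc for int8 * Fix topi arm_cpu conv2d_NCHWc_int8 update x86 conv2d enable specifying relay ops to be tuned for autotvm add cuda conv2d strategy add conv2d strategy for rocm add conv2d strategy for hls add conv2d strategy for arm cpu add conv2d strategy for mali add conv2d strategy for 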
intel graphics clean up and fix lint remove template keys from autotvm remove 2 in the func name address comments fix * fix bugs * lint * address comments * add name to op implement * Modify topi tests (#9) * Add pooling, reorg, softmax and vision * Add lrn * fix topi test * fix more topi test * lint * address comments * x * fix more tests & bugs * Modify more tests (#10) * Modify tests for bitserial_conv2d, bitserial_dense, bitserial_conv2d_rasp and bnn * Minor fix * More minor fix * fix more test * try to update vta using strategy * fix cpptest * x * fix rebase err * Fix two tests (#11) * change autotvm log format * lint * minor fix * try fix vta test * fix rebase err * tweak * tmp hack for vta pass * fix tutorial * fix * fix more tutorials * fix vta tutorial * minor * address comments * fix * address comments * fix cpptest * fix docs * change data structure name and api * address comments * lint * fix rebase err * updates * fix winograd test * fix doc * rebase * upgrade tophub version number * fix bug * re-enable vta tsim test after tophub is upgraded * fix vta test to use the correct args so the config can be found in tophub Co-authored-by: Yao Wang --- include/tvm/relay/op_attr_types.h | 31 +- include/tvm/relay/op_strategy.h | 164 ++++ include/tvm/te/schedule.h | 49 ++ python/tvm/autotvm/__init__.py | 4 +- python/tvm/autotvm/database.py | 5 +- python/tvm/autotvm/feature.py | 5 +- .../autotvm/graph_tuner/base_graph_tuner.py | 44 +- .../graph_tuner/utils/traverse_graph.py | 41 +- python/tvm/autotvm/graph_tuner/utils/utils.py | 13 +- python/tvm/autotvm/record.py | 76 +- python/tvm/autotvm/task/__init__.py | 7 +- python/tvm/autotvm/task/dispatcher.py | 90 +- python/tvm/autotvm/task/relay_integration.py | 65 +- python/tvm/autotvm/task/space.py | 30 +- python/tvm/autotvm/task/task.py | 331 +++++--- python/tvm/autotvm/task/topi_integration.py | 497 +++-------- python/tvm/autotvm/tophub.py | 25 +- .../tvm/autotvm/tuner/xgboost_cost_model.py | 3 +- python/tvm/relay/backend/compile_engine.py | 203 ++++- python/tvm/relay/expr_functor.py | 20 +- python/tvm/relay/frontend/tensorflow.py | 4 +- python/tvm/relay/frontend/tflite.py | 2 +- python/tvm/relay/memory_alloc.py | 4 +- python/tvm/relay/op/__init__.py | 5 +- python/tvm/relay/op/_algorithm.py | 48 +- python/tvm/relay/op/_reduce.py | 32 +- python/tvm/relay/op/_tensor.py | 119 ++- python/tvm/relay/op/_transform.py | 121 ++- python/tvm/relay/op/annotation/annotation.py | 4 +- python/tvm/relay/op/contrib/_contrib.py | 20 +- python/tvm/relay/op/image/_image.py | 13 +- python/tvm/relay/op/nn/_nn.py | 784 +++--------------- python/tvm/relay/op/nn/nn.py | 137 --- python/tvm/relay/op/op.py | 212 ++++- python/tvm/relay/op/strategy/__init__.py | 31 + python/tvm/relay/op/strategy/arm_cpu.py | 231 ++++++ python/tvm/relay/op/strategy/bifrost.py | 104 +++ python/tvm/relay/op/strategy/cuda.py | 398 +++++++++ python/tvm/relay/op/strategy/generic.py | 749 +++++++++++++++++ python/tvm/relay/op/strategy/hls.py | 158 ++++ .../tvm/relay/op/strategy/intel_graphics.py | 74 ++ python/tvm/relay/op/strategy/mali.py | 106 +++ python/tvm/relay/op/strategy/opengl.py | 73 ++ python/tvm/relay/op/strategy/rocm.py | 136 +++ python/tvm/relay/op/strategy/x86.py | 302 +++++++ python/tvm/relay/op/vision/_rcnn.py | 56 +- python/tvm/relay/op/vision/_vision.py | 91 +- python/tvm/relay/op/vision/_yolo.py | 6 +- python/tvm/relay/quantize/_annotate.py | 7 +- python/tvm/relay/testing/mobilenet.py | 30 +- python/tvm/target/generic_func.py | 2 + python/tvm/te/__init__.py | 4 +- 
python/tvm/te/schedule.py | 35 + python/tvm/tir/expr.py | 8 + src/relay/backend/compile_engine.cc | 78 +- src/relay/backend/compile_engine.h | 26 + src/relay/ir/op_strategy.cc | 114 +++ src/relay/op/annotation/annotation.cc | 14 +- src/relay/op/debug.cc | 5 +- src/relay/op/memory/memory.cc | 10 +- src/relay/op/nn/convolution.cc | 101 --- src/relay/op/nn/convolution.h | 14 +- src/relay/op/nn/nn.cc | 23 +- src/relay/op/nn/pad.cc | 5 +- src/relay/op/nn/pooling.cc | 30 +- src/relay/op/tensor/binary.cc | 5 +- src/relay/op/tensor/reduce.cc | 71 +- src/relay/op/tensor/transform.cc | 135 ++- src/relay/op/tensor/unary.cc | 15 +- src/relay/op/vision/yolo.cc | 3 +- src/relay/pass/alter_op_layout.cc | 5 +- src/te/schedule/schedule_lang.cc | 75 +- tests/cpp/relay_build_module_test.cc | 57 +- tests/python/frontend/mxnet/test_forward.py | 15 +- tests/python/integration/test_tuning.py | 4 +- tests/python/relay/test_any.py | 3 + .../relay/test_autotvm_task_extraction.py | 70 +- .../relay/test_backend_compile_engine.py | 129 ++- tests/python/relay/test_op_level2.py | 65 +- tests/python/relay/test_op_qnn_conv2d.py | 25 +- .../python/relay/test_pass_alter_op_layout.py | 51 +- tests/python/relay/test_pass_auto_quantize.py | 5 +- .../python/relay/test_pass_fold_scale_axis.py | 9 +- tests/python/unittest/test_autotvm_common.py | 6 +- .../unittest/test_autotvm_dispatch_context.py | 34 +- tests/python/unittest/test_autotvm_measure.py | 2 +- tests/python/unittest/test_codegen_blob.py | 2 +- tests/python/unittest/test_codegen_cuda.py | 2 +- .../python/unittest/test_graph_tuner_core.py | 416 ++++------ .../python/unittest/test_graph_tuner_utils.py | 33 +- .../unittest/test_lang_tensor_overload_op.py | 10 +- topi/include/topi/cuda/normalization.h | 5 +- topi/include/topi/rocm/normalization.h | 7 +- topi/python/topi/__init__.py | 1 + topi/python/topi/argwhere.py | 2 - topi/python/topi/arm_cpu/__init__.py | 17 +- topi/python/topi/arm_cpu/bitserial_conv2d.py | 11 +- topi/python/topi/arm_cpu/bitserial_dense.py | 10 +- topi/python/topi/arm_cpu/conv2d.py | 456 ++-------- topi/python/topi/arm_cpu/conv2d_alter_op.py | 171 ++++ topi/python/topi/arm_cpu/conv2d_int8.py | 20 +- .../topi/arm_cpu/conv2d_spatial_pack.py | 6 +- topi/python/topi/arm_cpu/conv2d_transpose.py | 11 +- topi/python/topi/arm_cpu/depthwise_conv2d.py | 71 +- topi/python/topi/arm_cpu/injective.py | 4 - topi/python/topi/bifrost/conv2d.py | 146 ++-- topi/python/topi/bifrost/dense.py | 37 +- topi/python/topi/bifrost/depthwise_conv2d.py | 2 - topi/python/topi/cuda/__init__.py | 28 +- topi/python/topi/cuda/batch_matmul.py | 49 +- topi/python/topi/cuda/conv1d.py | 81 +- topi/python/topi/cuda/conv1d_transpose_ncw.py | 11 +- topi/python/topi/cuda/conv2d.py | 234 ++---- topi/python/topi/cuda/conv2d_alter_op.py | 136 +++ topi/python/topi/cuda/conv2d_direct.py | 2 +- topi/python/topi/cuda/conv2d_hwcn.py | 12 +- topi/python/topi/cuda/conv2d_int8.py | 18 +- .../python/topi/cuda/conv2d_transpose_nchw.py | 11 +- topi/python/topi/cuda/conv2d_winograd.py | 178 +--- topi/python/topi/cuda/conv3d.py | 207 +++-- topi/python/topi/cuda/conv3d_direct.py | 11 +- topi/python/topi/cuda/deformable_conv2d.py | 20 +- topi/python/topi/cuda/dense.py | 163 ++-- topi/python/topi/cuda/depthwise_conv2d.py | 17 +- topi/python/topi/cuda/group_conv2d_nchw.py | 358 ++++---- topi/python/topi/cuda/injective.py | 7 +- topi/python/topi/cuda/nms.py | 13 +- topi/python/topi/cuda/nn.py | 7 +- topi/python/topi/cuda/pooling.py | 7 +- topi/python/topi/cuda/rcnn/__init__.py | 2 +- 
topi/python/topi/cuda/rcnn/proposal.py | 7 +- topi/python/topi/cuda/reduction.py | 2 - topi/python/topi/cuda/softmax.py | 3 +- topi/python/topi/cuda/sort.py | 14 +- topi/python/topi/cuda/ssd/multibox.py | 18 +- topi/python/topi/cuda/vision.py | 12 +- topi/python/topi/generic/conv2d.py | 82 +- topi/python/topi/generic/extern.py | 1 - topi/python/topi/generic/injective.py | 21 +- topi/python/topi/generic/nn.py | 75 +- topi/python/topi/generic/search.py | 2 - topi/python/topi/generic/sort.py | 3 - topi/python/topi/generic/vision.py | 9 - topi/python/topi/hls/injective.py | 3 - topi/python/topi/hls/nn.py | 14 - topi/python/topi/intel_graphics/__init__.py | 2 + topi/python/topi/intel_graphics/conv2d.py | 380 +++------ .../topi/intel_graphics/conv2d_alter_op.py | 102 +++ .../topi/intel_graphics/depthwise_conv2d.py | 21 +- topi/python/topi/mali/conv2d.py | 152 ++-- topi/python/topi/mali/dense.py | 40 +- topi/python/topi/mali/depthwise_conv2d.py | 15 +- topi/python/topi/nn/batch_matmul.py | 22 +- topi/python/topi/nn/bitserial_conv2d.py | 221 +---- topi/python/topi/nn/bitserial_dense.py | 79 +- topi/python/topi/nn/conv1d.py | 15 +- topi/python/topi/nn/conv1d_transpose.py | 1 - topi/python/topi/nn/conv2d.py | 203 +---- topi/python/topi/nn/conv2d_transpose.py | 1 - topi/python/topi/nn/conv3d.py | 48 +- topi/python/topi/nn/deformable_conv2d.py | 1 - topi/python/topi/nn/dense.py | 28 +- topi/python/topi/nn/depthwise_conv2d.py | 3 - topi/python/topi/nn/local_response_norm.py | 2 - topi/python/topi/nn/sparse.py | 8 +- topi/python/topi/nn/util.py | 2 +- topi/python/topi/opengl/conv2d_nchw.py | 2 - topi/python/topi/opengl/dense.py | 2 - topi/python/topi/opengl/injective.py | 3 - topi/python/topi/opengl/pooling.py | 3 - topi/python/topi/opengl/softmax.py | 2 - topi/python/topi/rocm/conv2d.py | 77 +- topi/python/topi/rocm/dense.py | 101 ++- topi/python/topi/rocm/nn.py | 7 +- topi/python/topi/sort.py | 2 - topi/python/topi/testing/__init__.py | 2 + topi/python/topi/testing/common.py | 74 ++ topi/python/topi/vision/nms.py | 3 +- topi/python/topi/vision/rcnn/proposal.py | 2 +- topi/python/topi/vision/rcnn/roi_align.py | 1 - topi/python/topi/vision/rcnn/roi_pool.py | 1 - topi/python/topi/vision/reorg.py | 2 - topi/python/topi/vision/ssd/multibox.py | 3 - topi/python/topi/x86/__init__.py | 18 +- topi/python/topi/x86/batch_matmul.py | 53 +- topi/python/topi/x86/binarize_pack.py | 2 - topi/python/topi/x86/binary_dense.py | 2 - topi/python/topi/x86/bitserial_conv2d.py | 235 +++++- topi/python/topi/x86/bitserial_dense.py | 80 +- topi/python/topi/x86/conv1d.py | 4 +- topi/python/topi/x86/conv2d.py | 443 +++------- topi/python/topi/x86/conv2d_alter_op.py | 223 +++-- topi/python/topi/x86/conv2d_avx_1x1.py | 150 ++-- topi/python/topi/x86/conv2d_avx_common.py | 147 ++-- topi/python/topi/x86/conv2d_int8.py | 223 +++-- topi/python/topi/x86/conv2d_transpose.py | 49 +- topi/python/topi/x86/conv3d.py | 27 +- topi/python/topi/x86/dense.py | 243 +++--- topi/python/topi/x86/depthwise_conv2d.py | 203 +++-- topi/python/topi/x86/injective.py | 4 - topi/python/topi/x86/nn.py | 2 - topi/python/topi/x86/pooling.py | 3 - topi/python/topi/x86/reduction.py | 5 +- topi/python/topi/x86/roi_align.py | 4 +- topi/python/topi/x86/sparse.py | 5 +- topi/src/topi.cc | 4 +- topi/tests/python/common.py | 5 +- topi/tests/python/test_fifo_buffer.py | 24 +- topi/tests/python/test_topi_batch_matmul.py | 11 +- .../python/test_topi_bitserial_conv2d.py | 12 +- .../python/test_topi_bitserial_conv2d_rasp.py | 6 +- 
.../tests/python/test_topi_bitserial_dense.py | 52 +- topi/tests/python/test_topi_bnn.py | 6 +- topi/tests/python/test_topi_broadcast.py | 13 +- topi/tests/python/test_topi_clip.py | 3 +- topi/tests/python/test_topi_conv1d.py | 23 +- .../python/test_topi_conv1d_transpose_ncw.py | 12 +- topi/tests/python/test_topi_conv2d_NCHWc.py | 12 +- topi/tests/python/test_topi_conv2d_hwcn.py | 15 +- topi/tests/python/test_topi_conv2d_int8.py | 6 +- topi/tests/python/test_topi_conv2d_nchw.py | 15 +- topi/tests/python/test_topi_conv2d_nhwc.py | 13 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 25 +- .../python/test_topi_conv2d_transpose_nchw.py | 21 +- .../tests/python/test_topi_conv2d_winograd.py | 104 ++- topi/tests/python/test_topi_conv3d_ncdhw.py | 12 +- topi/tests/python/test_topi_conv3d_ndhwc.py | 17 +- .../python/test_topi_deformable_conv2d.py | 12 +- topi/tests/python/test_topi_dense.py | 40 +- topi/tests/python/test_topi_depth_to_space.py | 2 +- .../python/test_topi_depthwise_conv2d.py | 176 ++-- topi/tests/python/test_topi_group_conv2d.py | 15 +- .../test_topi_group_conv2d_NCHWc_int8.py | 12 +- topi/tests/python/test_topi_image.py | 6 +- topi/tests/python/test_topi_lrn.py | 16 +- topi/tests/python/test_topi_math.py | 30 +- topi/tests/python/test_topi_pooling.py | 37 +- topi/tests/python/test_topi_reduce.py | 3 +- topi/tests/python/test_topi_relu.py | 4 +- topi/tests/python/test_topi_reorg.py | 11 +- topi/tests/python/test_topi_softmax.py | 17 +- topi/tests/python/test_topi_sort.py | 19 +- topi/tests/python/test_topi_space_to_depth.py | 2 +- topi/tests/python/test_topi_tensor.py | 3 +- topi/tests/python/test_topi_transform.py | 64 +- topi/tests/python/test_topi_upsampling.py | 4 +- topi/tests/python/test_topi_vision.py | 98 ++- tutorials/autotvm/tune_conv2d_cuda.py | 4 +- tutorials/autotvm/tune_relay_arm.py | 27 +- tutorials/autotvm/tune_relay_cuda.py | 17 +- tutorials/autotvm/tune_relay_mobile_gpu.py | 15 +- tutorials/autotvm/tune_relay_x86.py | 21 +- tutorials/autotvm/tune_simple_template.py | 6 +- tutorials/dev/relay_pass_infra.py | 6 +- .../optimize/opt_matmul_auto_tensorcore.py | 5 +- tutorials/topi/intro_topi.py | 8 +- vta/python/vta/ir_pass.py | 8 +- vta/python/vta/top/__init__.py | 8 +- vta/python/vta/top/bitpack.py | 5 +- vta/python/vta/top/op.py | 217 ++--- vta/python/vta/top/vta_conv2d.py | 16 +- vta/python/vta/top/vta_conv2d_transpose.py | 15 +- vta/python/vta/top/vta_dense.py | 12 +- vta/python/vta/top/vta_group_conv2d.py | 8 +- vta/scripts/tune_resnet.py | 2 +- .../integration/test_benchmark_topi_conv2d.py | 23 +- .../test_benchmark_topi_conv2d_transpose.py | 20 +- .../integration/test_benchmark_topi_dense.py | 10 +- .../test_benchmark_topi_group_conv2d.py | 17 +- vta/tutorials/autotvm/tune_relay_vta.py | 14 +- 270 files changed, 8465 insertions(+), 7050 deletions(-) create mode 100644 include/tvm/relay/op_strategy.h create mode 100644 python/tvm/relay/op/strategy/__init__.py create mode 100644 python/tvm/relay/op/strategy/arm_cpu.py create mode 100644 python/tvm/relay/op/strategy/bifrost.py create mode 100644 python/tvm/relay/op/strategy/cuda.py create mode 100644 python/tvm/relay/op/strategy/generic.py create mode 100644 python/tvm/relay/op/strategy/hls.py create mode 100644 python/tvm/relay/op/strategy/intel_graphics.py create mode 100644 python/tvm/relay/op/strategy/mali.py create mode 100644 python/tvm/relay/op/strategy/opengl.py create mode 100644 python/tvm/relay/op/strategy/rocm.py create mode 100644 python/tvm/relay/op/strategy/x86.py create mode 100644 
src/relay/ir/op_strategy.cc create mode 100644 topi/python/topi/arm_cpu/conv2d_alter_op.py create mode 100644 topi/python/topi/cuda/conv2d_alter_op.py create mode 100644 topi/python/topi/intel_graphics/conv2d_alter_op.py create mode 100644 topi/python/topi/testing/common.py diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 88e948f5d72a..1a2263e3f187 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -105,9 +106,8 @@ using TShapeDataDependant = bool; */ using FTVMCompute = runtime::TypedPackedFunc< Array<te::Tensor>(const Attrs& attrs, - const Array<te::Tensor>& inputs, - const Type& out_type, - const Target& target)>; + const Array<te::Tensor>& inputs, + const Type& out_type)>; /*! * \brief Build the computation schedule for * @@ -120,8 +120,18 @@ using FTVMCompute = runtime::TypedPackedFunc< */ using FTVMSchedule = runtime::TypedPackedFunc< te::Schedule(const Attrs& attrs, - const Array<te::Tensor>& outs, - const Target& target)>; + const Array<te::Tensor>& outs, + const Target& target)>; + +/*! + * \brief Generate the strategy of operators. This function is a generic + * function and can be re-defined for different targets. + * + * The function signature of generic function is: + * OpStrategy(const Attrs& attrs, const Array<te::Tensor>& inputs, + * const Type& out_type, const Target& target) + */ +using FTVMStrategy = GenericFunc; /*! * \brief Alternate the layout of operators or replace the * @@ -136,7 +146,8 @@ using FTVMAlterOpLayout = runtime::TypedPackedFunc< Expr(const Attrs& attrs, const Array<Expr>& args, - const Array<te::Tensor>& tinfos)>; + const Array<te::Tensor>& tinfos, + const Type& out_type)>; /*! * \brief Convert the layout of operators or replace the @@ -191,9 +202,7 @@ using FForwardRewrite = runtime::TypedPackedFunc< * \brief Gradient for a specific op. * * \param orig_call the original Expr. - * * \param output_grad the gradient of the Expr. - * * \return the gradient for each parameter. */ using FPrimalGradient = runtime::TypedPackedFunc<Array<Expr>(const Expr& orig_call, @@ -207,13 +216,13 @@ enum AnyCodegenStrategy { kVariableDimensions }; -/* \brief A runtime representation of shape. */ +/*! \brief A runtime representation of shape. */ using Shape = Array<IndexExpr>; using FShapeFunc = runtime::TypedPackedFunc< Array<te::Tensor>(const Attrs& attrs, - const Array<te::Tensor>& inputs, - const Array<IndexExpr>& out_ndims)>; + const Array<te::Tensor>& inputs, + const Array<IndexExpr>& out_ndims)>; } // namespace relay } // namespace tvm diff --git a/include/tvm/relay/op_strategy.h b/include/tvm/relay/op_strategy.h new file mode 100644 index 000000000000..a4da95a36b07 --- /dev/null +++ b/include/tvm/relay/op_strategy.h @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*!
+ * \file tvm/relay/op_strategy.h + * \brief The Relay operator Strategy and related data structure. + */ + +#ifndef TVM_RELAY_OP_STRATEGY_H_ +#define TVM_RELAY_OP_STRATEGY_H_ + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace relay { + +/*! + * \brief Operator implementation that includes compute and schedule function. + */ +class OpImplementationNode : public Object { + public: + /*! \brief Compute function */ + FTVMCompute fcompute; + /*! \brief Schedule function */ + FTVMSchedule fschedule; + /*! \brief Name of the implementation */ + std::string name; + /*! \brief Priority level */ + int plevel; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("name", &name); + v->Visit("plevel", &plevel); + } + + static constexpr const char* _type_key = "relay.OpImplementation"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpImplementationNode, Object); +}; + +/*! + * \brief Operator implementation class. + */ +class OpImplementation : public ObjectRef { + public: + /*! + * \brief Invoke the operator compute function. + * \param attrs The attribute of the primitive + * \param inputs The input tensors. + * \param out_type The output type information. + * \return The output compute description of the operator. + */ + TVM_DLL Array<te::Tensor> Compute(const Attrs& attrs, + const Array<te::Tensor>& inputs, + const Type& out_type); + /*! + * \brief Build the computation schedule. + * \param attrs The attribute of the node. + * \param outs The output tensors. + * \param target The build target. + * \return The computation schedule. + */ + TVM_DLL te::Schedule Schedule(const Attrs& attrs, + const Array<te::Tensor>& outs, + const Target& target); + + TVM_DEFINE_OBJECT_REF_METHODS(OpImplementation, ObjectRef, OpImplementationNode); +}; + +/*! + * \brief Specialized implementations for operators under certain conditions. + */ +class OpSpecializationNode : public Object { + public: + /*! \brief List of implementations. */ + Array<OpImplementation> implementations; + /*! \brief Condition to enable the specialization. + * Could be undefined to represent the generic case. */ + te::SpecializedCondition condition; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("condition", &condition); + v->Visit("implementations", &implementations); + } + + static constexpr const char* _type_key = "relay.OpSpecialization"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpSpecializationNode, Object); +}; + +/*! + * \brief Operator specialization class. + */ +class OpSpecialization : public ObjectRef { + public: + /*! + * \brief Add an implementation. + * \param fcompute Compute function + * \param fschedule Schedule function + * \param name Name of the implementation + * \param plevel Priority level of the implementation + */ + TVM_DLL void AddImplementation(FTVMCompute fcompute, FTVMSchedule fschedule, + std::string name, int plevel); + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpSpecialization, ObjectRef, OpSpecializationNode); +}; + +/*! + * \brief Operator strategy to choose implementation. + */ +class OpStrategyNode : public Object { + public: + /*! \brief List of operator specializations. */ + Array<OpSpecialization> specializations; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("specializations", &specializations); + } + + static constexpr const char* _type_key = "relay.OpStrategy"; + TVM_DECLARE_FINAL_OBJECT_INFO(OpStrategyNode, Object); +}; + +/*! + * \brief Operator strategy class. + */ +class OpStrategy : public ObjectRef { + public: + /*! + * \brief Add an implementation. 
+   * \param fcompute Compute function
+   * \param fschedule Schedule function
+   * \param name Name of the implementation
+   * \param plevel Priority level of the implementation
+   */
+  TVM_DLL void AddImplementation(FTVMCompute fcompute, FTVMSchedule fschedule,
+                                 std::string name, int plevel);
+
+  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(OpStrategy, ObjectRef, OpStrategyNode);
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_OP_STRATEGY_H_
diff --git a/include/tvm/te/schedule.h b/include/tvm/te/schedule.h
index e99b54a86565..a8a02365fbda 100644
--- a/include/tvm/te/schedule.h
+++ b/include/tvm/te/schedule.h
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -742,6 +743,53 @@ class SingletonNode : public IterVarRelationNode {
   TVM_DECLARE_FINAL_OBJECT_INFO(SingletonNode, IterVarRelationNode);
 };
 
+/*! \brief Container for specialization conditions. */
+class SpecializedConditionNode : public Object {
+ public:
+  /*!
+   * \brief List of conditions in conjunctive normal form (CNF).
+   * Each condition should be a simple expression, e.g., n > 16, m % 8 == 0, etc.,
+   * where n, m are tvm::Var, each representing a dimension in the tensor shape.
+   */
+  Array<PrimExpr> clauses;
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("clauses", &clauses);
+  }
+
+  static constexpr const char* _type_key = "SpecializedCondition";
+  TVM_DECLARE_FINAL_OBJECT_INFO(SpecializedConditionNode, Object);
+};
+
+/*!
+ * \brief Specialized condition to enable op specialization.
+ */
+class SpecializedCondition : public ObjectRef {
+ public:
+  /*!
+   * \brief Construct from conditions.
+   * \param conditions The clauses in the specialized condition.
+   */
+  TVM_DLL SpecializedCondition(Array<PrimExpr> conditions);  // NOLINT(*)
+
+  /*!
+   * \brief Get the current specialized condition.
+   * \return the current specialized condition.
+   */
+  TVM_DLL static SpecializedCondition Current();
+
+  TVM_DEFINE_OBJECT_REF_METHODS(SpecializedCondition, ObjectRef, SpecializedConditionNode);
+  class Internal;
+
+ private:
+  // enable with syntax.
+  friend class Internal;
+  friend class With<SpecializedCondition>;
+  /*! \brief Push a new specialized condition onto the thread local stack. */
+  TVM_DLL void EnterWithScope();
+  /*! \brief Pop a specialized condition off the thread local context stack.
*/ + TVM_DLL void ExitWithScope(); +}; // implementations inline const StageNode* Stage::operator->() const { @@ -765,6 +813,7 @@ inline const IterVarRelationNode* IterVarRelation::operator->() const { inline const IterVarAttrNode* IterVarAttr::operator->() const { return static_cast(get()); } + } // namespace te } // namespace tvm #endif // TVM_TE_SCHEDULE_H_ diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index cf8362ad8368..eab4ddfeaf7d 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -41,8 +41,8 @@ from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, \ LocalBuilder, LocalRunner, RPCRunner from .tuner import callback -from .task import template, get_config, create, ConfigSpace, ConfigEntity, \ - register_topi_compute, register_topi_schedule, \ +from .task import get_config, create, ConfigSpace, ConfigEntity, \ + register_topi_compute, register_topi_schedule, register_customized_task, \ DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best, \ ApplyGraphBest as apply_graph_best from .env import GLOBAL_SCOPE diff --git a/python/tvm/autotvm/database.py b/python/tvm/autotvm/database.py index 55d4180f03be..963f7e54ecaf 100644 --- a/python/tvm/autotvm/database.py +++ b/python/tvm/autotvm/database.py @@ -125,7 +125,7 @@ def load(self, inp, get_all=False): current = self.get(measure_str_key(inp)) if current is not None: records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)] - results = [rec[1] for rec in records] + results = [rec[1] for rec in records if rec is not None] if get_all: return results return max(results, key=lambda result: result.timestamp) @@ -167,9 +167,12 @@ def filter(self, func): current = self.get(key) try: records = [decode(x) for x in current.split(RedisDatabase.MAGIC_SPLIT)] + records = [rec for rec in records if rec is not None] except TypeError: # got a badly formatted/old format record continue + if not records: + continue inps, results = zip(*records) inp = inps[0] if not func(inp, results): diff --git a/python/tvm/autotvm/feature.py b/python/tvm/autotvm/feature.py index b7d1c44117a7..4ff1139d85f1 100644 --- a/python/tvm/autotvm/feature.py +++ b/python/tvm/autotvm/feature.py @@ -153,7 +153,10 @@ def get_flatten_name(fea): from .record import decode # flatten line to feature line = fea - inp, _ = decode(line) + ret = decode(line) + if ret is None: + raise ValueError("Unsupported AutoTVM log format") + inp, _ = ret target = _target.create(inp.target) with target: s, args = inp.template.instantiate(inp.config) diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index b02c289cb10f..3e85e938fa82 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -25,7 +25,6 @@ import tvm from tvm import autotvm, relay from tvm.autotvm.task import get_config -from tvm.autotvm.task.topi_integration import deserialize_args, serialize_args from tvm.autotvm.record import encode, load_from_file from tvm.autotvm.measure import MeasureResult, MeasureInput @@ -35,18 +34,16 @@ from ._base import INVALID_LAYOUT_TIME -# Setup topi_op_name -> layout function -# NOTE: To add more ops, change the following dictionary. 
-OP2LAYOUT = { - "topi_nn_conv2d": topi.nn.conv2d_infer_layout, - "topi_nn_depthwise_conv2d_nchw": topi.nn.depthwise_conv2d_infer_layout, -} +def get_infer_layout(task_name): + if task_name.startswith("conv2d"): + return topi.nn.conv2d_infer_layout + if task_name.startswith("depthwise_conv2d"): + return topi.nn.depthwise_conv2d_infer_layout + raise ValueError("Cannot find infer layout for task %s" % task_name) - -@autotvm.template +@autotvm.register_customized_task("layout_transform") def layout_transform(*args): """Autotvm layout transform template.""" - args = deserialize_args(args) cfg = get_config() cfg.add_flop(-1) data = args[0] @@ -82,7 +79,7 @@ def __init__(self, graph, input_shapes, records, target_ops, Each row of this file is an encoded record pair. Otherwise, it is an iterator. - target_ops : List of str + target_ops : List of relay.op.Op Target tuning operators. target : str or tvm.target @@ -104,7 +101,7 @@ def __init__(self, graph, input_shapes, records, target_ops, self._layout_transform_perf_records = {} self._layout_transform_interlayer_cost = {} self._input_shapes = input_shapes - self._target_ops = [op.__name__ for op in target_ops] + self._target_ops = target_ops self._name = name self._max_sch_num = max_sch_num @@ -179,7 +176,7 @@ def __init__(self, graph, input_shapes, records, target_ops, dtype = first_tensor[-1] new_shape = tuple([val.value for val in node_entry["types"][0].shape]) actual_workload = (input_workload[0],) + \ - ((new_shape + (dtype,)),) + input_workload[2:] + (("TENSOR", new_shape, dtype),) + input_workload[2:] node_entry["workloads"].append(actual_workload) if "record_candidates" not in node_entry: node_entry["record_candidates"] = input_node["record_candidates"] @@ -212,7 +209,7 @@ def _fetch_cfg(self): node_entry["record_candidates"] = cache_dict[workload] continue record_candidates = [] - infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + infer_layout_func = get_infer_layout(node_entry["topi_op"][0]) layout_tracking_dict = {} for record in cfg_dict[workload]: in_measure, out_measure = record @@ -264,7 +261,7 @@ def _iterate_layout_transform(self, callback): if node_entry["op"] in self._target_ops: o_idx = key - o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + o_infer_layout_func = get_infer_layout(node_entry["topi_op"][0]) o_wkl = node_entry["workloads"][0] i_topi_op = in_node_entry["topi_op"][0] i_wkl = in_node_entry["workloads"][0] @@ -273,14 +270,14 @@ def _iterate_layout_transform(self, callback): pivot += 1 i_topi_op = in_node_entry["topi_op"][pivot] i_wkl = in_node_entry["workloads"][pivot] - i_infer_layout_func = OP2LAYOUT[i_topi_op] + i_infer_layout_func = get_infer_layout(i_topi_op) else: o_idx = target_input_idx if i <= target_input_pos: continue - o_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][0]] + o_infer_layout_func = get_infer_layout(node_entry["topi_op"][0]) o_wkl = node_entry["workloads"][target_input_pos] - i_infer_layout_func = OP2LAYOUT[node_entry["topi_op"][i]] + i_infer_layout_func = get_infer_layout(node_entry["topi_op"][i]) i_wkl = node_entry["workloads"][i] if (i_idx, o_idx) in pair_tracker: @@ -314,9 +311,8 @@ def _create_matrix_callback(self, from_node_idx, to_node_idx, from_sch_idx, to_sch_idx, args): """Create dictionary containing matrix format of layout transformation between nodes.""" - sargs = serialize_args(args) in_layout, out_layout = args[1], args[2] - ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(sargs) + ltf_workload = autotvm.task.args_to_workload(args, 
'layout_transform') idx_pair_key = (from_node_idx, to_node_idx) if in_layout == out_layout: @@ -449,9 +445,8 @@ def _callback(_, inputs, results): measure_option = autotvm.measure_option(builder=builder, runner=runner) for args in args_list: data, in_layout, out_layout = args - args = serialize_args(args) - ltf_workload = ('layout_transform',) + autotvm.task.args_to_workload(args) - if ltf_workload in self._layout_transform_perf_records: + ltf_workload = autotvm.task.args_to_workload(args, 'layout_transform') + if ltf_workload in self._layout_transform_perf_records: continue if infer_layout: @@ -478,9 +473,8 @@ def _callback(_, inputs, results): continue records = [] - task = autotvm.task.create(layout_transform, args=args, target=self._target, + task = autotvm.task.create("layout_transform", args=args, target=self._target, target_host=target_host) - task.workload = ltf_workload tuner = autotvm.tuner.GridSearchTuner(task) tuner.tune(n_trial=1, measure_option=measure_option, callbacks=[_log_to_list(records)]) diff --git a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py index 7648322d3b18..17450ca3e7f3 100644 --- a/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py +++ b/python/tvm/autotvm/graph_tuner/utils/traverse_graph.py @@ -18,8 +18,6 @@ """API for graph traversing.""" import threading -import topi - import tvm from tvm import relay, autotvm from tvm.relay import transform @@ -30,13 +28,6 @@ from .utils import has_multiple_inputs, is_boundary_node, is_skipped_node -# Setup relay op base name -> topi compute functions -# NOTE: To add more ops, change the following dictionary. -OP2COMPUTE = { - "conv2d" : [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw], -} - - def expr2graph(expr, target_ops, node_dict, node_list): """Convert relay expr to graph data structure and fetch workloads of target operators. @@ -46,8 +37,8 @@ def expr2graph(expr, target_ops, node_dict, node_list): expr : tvm.relay.Expr.Function Input relay function expression. - target_ops: List of str - List of target relay base op name + target_ops: List of relay.op.Op + List of target relay ops node_dict : dictionary from tvm.relay.Expr to int Dictionary to record node index @@ -58,14 +49,11 @@ def expr2graph(expr, target_ops, node_dict, node_list): {"op": str, "node": tvm.relay.expr, "inputs": [int], "types": [tvm.relay.Type], "name": str, "workloads": [tuple], "topi_op": [function]} """ + # TODO(@kevinthesun, @icemelon9): Currently graph tuning pass relies on the fact + # that # autotvm tasks == # ops. But this won't be true after having relay op + # strategy. We need to find a solution to fix this. 
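# How task extraction works after this change: TaskExtractEnv no longer
# monkey-patches individual TOPI functions. Entering the env just flips its
# `tracing` flag; every compute wrapped by register_topi_compute checks that
# flag and records (task_name, serialized args) via env.add_task(), while
# reset(target_ops) records which relay ops the caller wants extracted.
# allow_duplicate=True keeps one entry per op occurrence, matching the
# one-task-per-op assumption noted in the TODO above.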
env = TaskExtractEnv.get(allow_duplicate=True) - topi_funcs = [] - for op_name in target_ops: - if op_name not in OP2COMPUTE: - raise RuntimeError("Not supported relay op in graph tuner: %s" - % op_name) - topi_funcs += OP2COMPUTE[op_name] - env.reset(topi_funcs) + env.reset(target_ops) # pylint: disable=not-context-manager with env: _expr2graph_impl(expr, target_ops, node_dict, node_list) @@ -75,8 +63,7 @@ def expr2graph(expr, target_ops, node_dict, node_list): task_name, args = env.task_collection[task_pos] task = autotvm.task.create(task_name, args, target="llvm", - target_host=None, - template_key='direct') + target_host=None) node_entry["workloads"] = [task.workload] node_entry["topi_op"] = [task_name] task_pos += 1 @@ -98,11 +85,11 @@ def _traverse_expr(node): return node_index = len(node_list) node_entry = {"node": node, "inputs": [], "types": [], - "op": "null", "name": None} + "op": None, "name": None} if isinstance(node, Call): - op_name = node.op.name.split(".")[-1] - node_entry["op"] = op_name + op = node.op + node_entry["op"] = node.op for arg in node.args: in_node_idx = node_dict[arg] if isinstance(arg, (Tuple, TupleGetItem)): @@ -118,12 +105,12 @@ def _traverse_expr(node): node_entry["types"].append(tupe_type) else: raise RuntimeError("Unsupported output type %s in operator %s" - % (type(out_type), op_name)) + % (type(out_type), op.name)) # Utilize tracing target to fetch workload with topo-order. # Since we only need workload, dummy target can be used to # create task. - if op_name in target_ops: + if op in target_ops: params = [] for i, input_idx in enumerate(node_entry["inputs"]): input_node_entry = node_list[input_idx[0]] @@ -133,7 +120,7 @@ def _traverse_expr(node): "operators with input node of type " "relay.expr.Var/Constant/Call. Now " "find a target op %s with input type %s" - % (op_name, str(type(input_node_entry["node"])))) + % (op, str(type(input_node_entry["node"])))) free_var = relay.Var("var_%d" % i, input_type) params.append(free_var) call = relay.Call(node.op, params, node.attrs) @@ -155,11 +142,9 @@ def _traverse_expr(node): _expr2graph_impl(node, target_ops, node_dict, node_list) return elif isinstance(node, TupleGetItem): - node_entry["op"] = "TupleGetItem" in_node_idx = node_dict[node.tuple_value] node_entry["inputs"].append([in_node_idx, node.index, 0]) elif isinstance(node, Tuple): - node_entry["op"] = "Tuple" for tuple_item in node: in_node_idx = node_dict[tuple_item] if isinstance(tuple_item, TupleGetItem): diff --git a/python/tvm/autotvm/graph_tuner/utils/utils.py b/python/tvm/autotvm/graph_tuner/utils/utils.py index 137ccbed2bbd..2486d0c0bda0 100644 --- a/python/tvm/autotvm/graph_tuner/utils/utils.py +++ b/python/tvm/autotvm/graph_tuner/utils/utils.py @@ -47,7 +47,7 @@ def has_multiple_inputs(node_list, node_idx, input_names): in_idx = in_idx[0] in_node = node_list[in_idx] # Exclude parameter nodes - if in_node["op"] != "null" or \ + if in_node["op"] is not None or \ ("name" in in_node and in_node["name"] in input_names): num_inputs += 1 return num_inputs > 1 @@ -72,9 +72,10 @@ def is_boundary_node(node_entry, input_names): whether node is a boundary node. """ # Operators dependent on original layouts. 
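# (With this change the list holds relay.op.Op objects obtained via
# relay.op.get() rather than base-name strings, so names are fully qualified,
# e.g. "nn.batch_flatten" instead of "batch_flatten", and the membership test
# on node_entry["op"] compares op identity.)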
- _LAYOUT_FIXED_OP = ["batch_flatten", "transpose", "reshape", - "multibox_prior", "multibox_transform_loc", "where", - "non_max_suppression", "strided_slice"] + _LAYOUT_FIXED_OP = [relay.op.get(name) for name in ( + "nn.batch_flatten", "transpose", "reshape", "vision.multibox_prior", + "vision.multibox_transform_loc", "where", "vision.non_max_suppression", + "strided_slice")] out = node_entry["op"] in _LAYOUT_FIXED_OP or \ ("name" in node_entry and node_entry["name"] in input_names) @@ -95,9 +96,7 @@ def is_skipped_node(node_entry): whether node is skipped. """ # Operators not counted in graph tuner. - _SKIPPED_OP = ["Tuple"] - - return node_entry["op"] in _SKIPPED_OP + return isinstance(node_entry["node"], relay.Tuple) def bind_inputs(expr, input_shapes=None, input_dtypes="float32"): diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index fbf4a08f7b0c..416b2cd57eb6 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -28,14 +28,16 @@ import os import itertools from collections import OrderedDict +import numpy as np from .. import build, lower, target as _target - +from .. import __version__ from . import task from .task import ConfigEntity, ApplyHistoryBest from .measure import MeasureInput, MeasureResult -AUTOTVM_LOG_VERSION = 0.1 +AUTOTVM_LOG_VERSION = 0.2 +_old_version_warning = True logger = logging.getLogger('autotvm') try: # convert unicode to str for python2 @@ -88,27 +90,30 @@ def encode(inp, result, protocol='json'): if protocol == 'json': json_dict = { - "i": (str(inp.target), - inp.task.name, inp.task.args, inp.task.kwargs, - inp.task.workload, - inp.config.to_json_dict()), + "input": (str(inp.target), + inp.task.name, inp.task.args, inp.task.kwargs), + + "config": inp.config.to_json_dict(), + + "result": (result.costs if result.error_no == 0 else (1e9,), + result.error_no, + result.all_cost, + result.timestamp), - "r": (result.costs if result.error_no == 0 else (1e9,), - result.error_no, - result.all_cost, - result.timestamp), + "version": AUTOTVM_LOG_VERSION, - "v": AUTOTVM_LOG_VERSION + "tvm_version": __version__ } return json.dumps(json_dict) if protocol == 'pickle': row = (str(inp.target), str(base64.b64encode(pickle.dumps([inp.task.name, inp.task.args, - inp.task.kwargs, - inp.task.workload])).decode()), + inp.task.kwargs])).decode()), str(base64.b64encode(pickle.dumps(inp.config)).decode()), - str(base64.b64encode(pickle.dumps(tuple(result))).decode())) + str(base64.b64encode(pickle.dumps(tuple(result))).decode()), + str(AUTOTVM_LOG_VERSION), + str(__version__)) return '\t'.join(row) raise RuntimeError("Invalid log protocol: " + protocol) @@ -119,20 +124,29 @@ def decode(row, protocol='json'): Parameters ---------- - row: str + row : str a row in the logger file - protocol: str + + protocol : str log protocol, json or pickle Returns ------- - input: autotvm.tuner.MeasureInput - result: autotvm.tuner.MeasureResult + ret : tuple(autotvm.tuner.MeasureInput, autotvm.tuner.MeasureResult), or None + The tuple of input and result, or None if input uses old version log format. 
""" # pylint: disable=unused-variable + global _old_version_warning + if protocol == 'json': row = json.loads(row) - tgt, task_name, task_args, task_kwargs, workload, config = row['i'] + if 'v' in row and row['v'] == 0.1: + if _old_version_warning: + logger.warning("AutoTVM log version 0.1 is no longer supported.") + _old_version_warning = False + return None + + tgt, task_name, task_args, task_kwargs = row["input"] tgt = _target.create(str(tgt)) def clean_json_to_python(x): @@ -148,22 +162,27 @@ def clean_json_to_python(x): return x tsk = task.Task(clean_json_to_python(task_name), clean_json_to_python(task_args)) - tsk.workload = clean_json_to_python(workload) - config = ConfigEntity.from_json_dict(config) + config = ConfigEntity.from_json_dict(row["config"]) inp = MeasureInput(tgt, tsk, config) - result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["r"]]) + result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["result"]]) + config.cost = np.mean(result.costs) return inp, result if protocol == 'pickle': items = row.split("\t") + if len(items) == 4: + if _old_version_warning: + logger.warning("AutoTVM log version 0.1 is no longer supported.") + _old_version_warning = False + return None tgt = _target.create(items[0]) task_tuple = pickle.loads(base64.b64decode(items[1].encode())) config = pickle.loads(base64.b64decode(items[2].encode())) - result = pickle.loads(base64.b64decode(items[3].encode())) + result = MeasureResult(*pickle.loads(base64.b64decode(items[3].encode()))) + config.cost = np.mean(result.costs) tsk = task.Task(task_tuple[0], task_tuple[1]) - tsk.workload = task_tuple[3] - return MeasureInput(tgt, tsk, config), MeasureResult(*result) + return MeasureInput(tgt, tsk, config), result raise RuntimeError("Invalid log protocol: " + protocol) @@ -183,7 +202,10 @@ def load_from_file(filename): """ for row in open(filename): if row and not row.startswith('#'): - inp, res = decode(row) + ret = decode(row) + if ret is None: + continue + inp, res = ret # Avoid loading the record with an empty config. The TOPI schedule with no entities # will result in an empty entity map (e.g., depthwise_conv2d_nchw on x86). # Using an empty config will cause problems when applying alter op like NCHW to NCHWc. @@ -208,7 +230,7 @@ def split_workload(in_file, clean=True): logger.info("start converting...") pool = multiprocessing.Pool() - lines = pool.map(decode, lines) + lines = [rec for rec in pool.map(decode, lines) if rec is not None] logger.info("map done %.2f", time.time() - tic) wkl_dict = OrderedDict() diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py index f249f6bacb90..29313d4b5491 100644 --- a/python/tvm/autotvm/task/__init__.py +++ b/python/tvm/autotvm/task/__init__.py @@ -22,12 +22,13 @@ of typical tasks of interest. 
""" -from .task import Task, create, register, template, get_config, args_to_workload +from .task import Task, create, get_config, args_to_workload, \ + register_customized_task from .space import ConfigSpace, ConfigEntity from .code_hash import attach_code_hash, attach_code_hash_to_arg -from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \ +from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, \ FallbackContext, clear_fallback_cache, ApplyGraphBest from .topi_integration import register_topi_compute, register_topi_schedule, \ - TaskExtractEnv + TaskExtractEnv, get_workload from .relay_integration import extract_from_program, extract_from_multiple_program diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index e7022fad2081..97ee5383d760 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -33,9 +33,6 @@ import logging import numpy as np -from decorator import decorate - -from tvm import target as _target from .space import FallbackConfigEntity @@ -152,79 +149,6 @@ def __exit__(self, ptype, value, trace): DispatchContext.current = self._old_ctx -def dispatcher(fworkload): - """Wrap a workload dispatcher function. - - Parameters - ---------- - fworkload : function - The workload extraction function from arguments. - - Returns - ------- - fdispatcher : function - A wrapped dispatcher function, which will - dispatch based on DispatchContext and - the current workload. - """ - dispatch_dict = {} - func_name = fworkload.__name__ - - def register(key, func=None, override=False): - """Register template function. - - Parameters - ---------- - key : str or List of str - The template key to identify the template - under this dispatcher. - func : function - The function to be registered. - The first argument of the function is always - cfg returned by DispatchContext, - the rest arguments are the same as the fworkload. - override : bool - Whether override existing registration. - - Returns - ------- - The register function if necessary. - """ - if isinstance(key, str): - key = [key] - - def _do_reg(myf): - for x in key: - if x in dispatch_dict and not override: - raise ValueError( - "Key %s is already registered for %s" % (x, func_name)) - dispatch_dict[x] = myf - return myf - - if func: - return _do_reg(func) - return _do_reg - - def dispatch_func(func, *args, **kwargs): - """The wrapped dispatch function""" - tgt = _target.Target.current() - workload = func(*args, **kwargs) - cfg = DispatchContext.current.query(tgt, workload) - if cfg.is_fallback and not cfg.template_key: - # first try 'direct' template - if 'direct' in dispatch_dict: - return dispatch_dict['direct'](cfg, *args, **kwargs) - # otherwise pick a random template - for v in dispatch_dict.values(): - return v(cfg, *args, **kwargs) - else: - return dispatch_dict[cfg.template_key](cfg, *args, **kwargs) - - fdecorate = decorate(fworkload, dispatch_func) - fdecorate.register = register - return fdecorate - - class ApplyConfig(DispatchContext): """Apply a deterministic config entity for all queries. 
@@ -334,7 +258,8 @@ def _query_inside(self, target, workload): if key in self._best_user_defined: return self._best_user_defined[key] if key in self.best_by_model: - return self.best_by_model[key][0].config + inp, _ = self.best_by_model[key] + return inp.config # then try matching by target key for k in target.keys: @@ -342,13 +267,16 @@ def _query_inside(self, target, workload): if key in self._best_user_defined: return self._best_user_defined[key] if key in self.best_by_targetkey: - return self.best_by_targetkey[key][0].config + inp, _ = self.best_by_targetkey[key] + return inp.config return None def update(self, target, workload, cfg): model = target.model key = (model, workload) + # assume user provided config is the best + cfg.cost = 0 self._best_user_defined[key] = cfg for k in target.keys: @@ -481,8 +409,12 @@ def _query_inside(self, target, workload): """ if self._counter < len(self._records): cfg = self._records[self._counter][0].config + wkl = self._records[self._counter][0].task.workload + if workload is not None: + assert wkl == workload self._counter += 1 - self.update(target, workload, cfg) + self.update(target, wkl, cfg) + cfg.workload = wkl return cfg key = (str(target), workload) if key not in self._global_cfg_dict: diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index b39c8d446c7f..cd8d32fb2d68 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -21,10 +21,9 @@ """ import threading -import warnings import logging - +import tvm from .task import create from .topi_integration import TaskExtractEnv @@ -55,8 +54,7 @@ def _lower(mod, compiler.lower(mod, target=target) -def extract_from_program(mod, params, ops, target, target_host=None, - template_keys=None): +def extract_from_program(mod, params, target, target_host=None, ops=None): """ Extract tuning tasks from a relay program. This function is the single program version of extract_from_multiple_program. @@ -67,27 +65,22 @@ def extract_from_program(mod, params, ops, target, target_host=None, The module or function to tune params: dict of str to numpy array The associated parameters of the program - ops: List of relay op - List of relay ops to be tuned target: tvm.target.Target The compilation target target_host: tvm.target.Target The host compilation target - template_keys: dict of topi op to str - The tuning template keys map for schedules, default to None. - Example: {topi.nn.conv2d: 'direct'} + ops: List[relay.op.Op] or None + List of relay ops to be tuned. If not specified, all tunable ops will be extracted. Returns ------- task: Array of autotvm.task.Task collected tasks """ - return extract_from_multiple_program([mod], [params], ops, target, target_host, - template_keys) + return extract_from_multiple_program([mod], [params], target, target_host, ops) -def extract_from_multiple_program(mods, params, ops, target, target_host=None, - template_keys=None): +def extract_from_multiple_program(mods, params, target, target_host=None, ops=None): """ Extract tuning tasks from multiple relay programs. 
This function collects tuning tasks by building a list of programs @@ -99,15 +92,12 @@ def extract_from_multiple_program(mods, params, ops, target, target_host=None, The list of modules or functions to tune params: List of dict of str to numpy array The associated parameters of the programs - ops: List of relay op - List of relay ops to be tuned target: tvm.target.Target The compilation target target_host: tvm.target.Target The host compilation target - template_keys: dict of topi op to str - The tuning template keys map for schedules, default to None. - Example: {topi.nn.conv2d: 'direct'} + ops: List[relay.op.Op] or None + List of relay ops to be tuned. If not specified, all tunable ops will be extracted. Returns ------- @@ -115,36 +105,13 @@ def extract_from_multiple_program(mods, params, ops, target, target_host=None, collected tasks """ # pylint: disable=import-outside-toplevel - import tvm.relay.op from tvm import relay import topi env = TaskExtractEnv.get() - # NOTE: To add more ops, you only need to change the following lists - # relay op -> topi compute - OP2TOPI = { - tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, - topi.nn.group_conv2d_nchw, - topi.nn.conv2d_NCHWc, - topi.nn.conv2d_NCHWc_int8], - tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], - tvm.relay.op.nn.dense: [topi.nn.dense], - tvm.relay.op.nn.batch_matmul: [topi.nn.batch_matmul], - tvm.relay.op.nn.deformable_conv2d: [topi.nn.deformable_conv2d_nchw], - tvm.relay.op.nn.conv1d_transpose: [topi.nn.conv1d_transpose_ncw], - tvm.relay.op.nn.conv3d: [topi.nn.conv3d], - } - - topi_funcs = [] - for op_name in ops: - if op_name in OP2TOPI: - topi_funcs.extend(OP2TOPI[op_name]) - else: - warnings.warn("Op %s is not tunable, ignored" % op_name) - # run compiler to collect all TOPI calls during compilation - env.reset(topi_funcs) + env.reset(ops) with env: # disable logger temporarily old_state = logger.disabled @@ -164,24 +131,12 @@ def extract_from_multiple_program(mods, params, ops, target, target_host=None, logger.disabled = old_state - # convert *topi op to template key* map to *task name to template key* map - task_name_to_keys = {} - if template_keys is not None: - for op in template_keys.keys(): - if op in env.topi_to_task: - task_name_to_keys[env.topi_to_task[op]] = template_keys[op] - else: - logger.warning("Invalid template key, fallback to direct") - task_name_to_keys[env.topi_to_task[op]] = 'direct' - # create tasks for target tasks = [] for task_name, args in env.get_tasks(): try: - key = task_name_to_keys[task_name] if task_name in task_name_to_keys else 'direct' tsk = create(task_name, args, - target=target, target_host=target_host, - template_key=key) + target=target, target_host=target_host) tasks.append(tsk) except topi.InvalidShapeError: logger.warning("Invalid shape during AutoTVM task creation") diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index fbdd34e502ca..47c227073677 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -613,9 +613,9 @@ def __init__(self): self._entity_map = OrderedDict() # name -> entity self._constraints = [] self.errors = [] - self.template_key = None self.code_hash = None self.flop = 0 + self.cost = None self.is_fallback = False @staticmethod @@ -796,7 +796,7 @@ def get(self, index): for name, space in self.space_map.items(): entities[name] = space[t % len(space)] t //= len(space) - ret = ConfigEntity(index, self.code_hash, self.template_key, entities, self._constraints) + 
ret = ConfigEntity(index, self.code_hash, entities, self._constraints) return ret def __iter__(self): @@ -836,17 +836,14 @@ class ConfigEntity(ConfigSpace): index of this config in space code_hash: str hash of schedule code - template_key : str - The specific template key entity_map: dict map name to transform entity constraints : list List of constraints """ - def __init__(self, index, code_hash, template_key, entity_map, constraints): + def __init__(self, index, code_hash, entity_map, constraints): super(ConfigEntity, self).__init__() self.index = index - self.template_key = template_key self._collect = False self._entity_map = entity_map self._space_map = None @@ -896,9 +893,8 @@ def to_json_dict(self): a json serializable dictionary """ ret = {} - ret['i'] = int(self.index) - ret['t'] = self.template_key - ret['c'] = self.code_hash + ret['index'] = int(self.index) + ret['code_hash'] = self.code_hash entity_map = [] for k, v in self._entity_map.items(): if isinstance(v, SplitEntity): @@ -911,7 +907,7 @@ def to_json_dict(self): entity_map.append((k, 'ot', v.val)) else: raise RuntimeError("Invalid entity instance: " + v) - ret['e'] = entity_map + ret['entity'] = entity_map return ret @staticmethod @@ -930,13 +926,12 @@ def from_json_dict(json_dict): The corresponding config object """ - index = json_dict["i"] - code_hash = json_dict["c"] - template_key = json_dict["t"] + index = json_dict["index"] + code_hash = json_dict["code_hash"] constraints = [] entity_map = OrderedDict() - for item in json_dict["e"]: + for item in json_dict["entity"]: key, knob_type, knob_args = item if knob_type == 'sp': entity = SplitEntity(knob_args) @@ -950,11 +945,10 @@ def from_json_dict(json_dict): raise RuntimeError("Invalid config knob type: " + knob_type) entity_map[str(key)] = entity - return ConfigEntity(index, code_hash, template_key, entity_map, constraints) + return ConfigEntity(index, code_hash, entity_map, constraints) def __repr__(self): - return "%s,%s,%s,%d" % (str(self._entity_map)[12:-1], self.template_key, - self.code_hash, self.index) + return "%s,%s,%d" % (str(self._entity_map)[12:-1], self.code_hash, self.index) class FallbackConfigEntity(ConfigSpace): @@ -1068,4 +1062,4 @@ def __setitem__(self, name, entity): self._entity_map[name] = entity def __repr__(self): - return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash) + return "%s,%s" % (str(self._entity_map)[12:-1], self.code_hash) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 9ff8b24fcb5d..ca1ae0eefefd 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=unused-variable +# pylint: disable=unused-variable,not-callable """Definition of task function. Task can be constructed from tuple of func, args, and kwargs. @@ -24,10 +24,10 @@ import numpy as np -from ... import tensor, expr, container, target as _target +from ... 
import tensor, expr, container, placeholder, target as _target -from ..util import get_const_int, get_const_tuple, get_func_name -from .dispatcher import DispatchContext, ApplyConfig, dispatcher +from ..util import get_const_int, get_const_tuple +from .dispatcher import DispatchContext, ApplyConfig from .space import ConfigSpace def _raise_error(*args, **kwargs): # pylint: disable=unused-argument @@ -35,6 +35,70 @@ def _raise_error(*args, **kwargs): # pylint: disable=unused-argument "of this task is registered in another python file " "which is not imported in this run") + +def serialize_args(args): + """serialize arguments of a topi function to a hashable tuple. + + Parameters + ---------- + args: list of hashable or Tensor + """ + def _encode(x): + if isinstance(x, tensor.Tensor): + return ('TENSOR', get_const_tuple(x.shape), x.dtype) + if isinstance(x, (tuple, list, container.Array)): + return tuple([_encode(a) for a in x]) + if isinstance(x, (str, int, float, np.int, np.float, expr.Var)): + return x + if isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)): + return x.value + if x is None: + return None + raise RuntimeError('Do not support type "%s" in argument. Consider to use' + 'primitive types or tvm.tir.Var only' % type(x)) + ret = [] + for t in args: + ret.append(_encode(t)) + return tuple(ret) + + +def deserialize_args(args): + """The inverse function of :code:`serialize_args`. + + Parameters + ---------- + args: list of hashable or Tensor + """ + ret = [] + for t in args: + if isinstance(t, tuple) and t[0] == 'TENSOR': + ret.append(placeholder(shape=t[1], dtype=t[2])) + else: + ret.append(t) + return ret + + +def args_to_workload(args, task_name=None): + """Convert argument list to hashable workload tuple. + This function will convert list to tuple, tvm node to python value and + flatten tvm.tensor.Tensor to a tuple + + Parameters + ---------- + task_name : str + The AutoTVM task name + + args : list of args + The arguments to the function + + Returns + ------- + ret: hashable + The hashable value + """ + return (task_name,) + serialize_args(args) if task_name is not None else serialize_args(args) + + class Task(object): """A Tunable Task @@ -55,11 +119,14 @@ def __init__(self, name, args): self.func = TASK_TABLE.get(name, _raise_error) # auxiliary info, available after `init_space` is called - self.workload = None self.flop = None self.target = None self.target_host = None + @property + def workload(self): + return (self.name,) + serialize_args(self.args) + def instantiate(self, config): """Instantiate this task function (template) with a config. Returns corresponding schedule. 
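Since `workload` is now derived on the fly from the task name and serialized arguments instead of being stored on the task, the helpers above fully determine how tuning records are keyed. A small sketch of what they produce (the shape and the 'dense' task name are arbitrary examples):

    import tvm
    from tvm.autotvm.task.task import serialize_args, args_to_workload

    A = tvm.placeholder((32, 64), name='A', dtype='float32')

    serialize_args((A, 16))
    # -> (('TENSOR', (32, 64), 'float32'), 16)

    args_to_workload((A, 16), task_name='dense')
    # -> ('dense', ('TENSOR', (32, 64), 'float32'), 16)

    # deserialize_args() inverts the tensor encoding, rebuilding placeholders
    # from the recorded shape and dtype.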
@@ -94,7 +161,6 @@ def __getstate__(self): "args": self.args, "kwargs": self.kwargs, "config_space": self.config_space, - "workload": self.workload, "flop": self.flop, "target": self.target, "target_host": self.target_host @@ -106,7 +172,6 @@ def __setstate__(self, state): self.kwargs = state["kwargs"] self.config_space = state["config_space"] self.func = TASK_TABLE.get(state["name"], _raise_error) - self.workload = state["workload"] self.flop = state["flop"] self.target = state["target"] self.target_host = state["target_host"] @@ -116,135 +181,119 @@ def __repr__(self): self.name, self.args, self.kwargs, self.workload ) -TASK_TABLE = { -} +TASK_TABLE = {} + +class TopiTemplate(object): + """Topi template that holds the topi compute and schedule function""" + def __init__(self): + self.compute = None + self.schedule = None + self.customized_func = None + + def __call__(self, *args, **kwargs): + args = deserialize_args(args) + if self.customized_func is None: + return self._default_func(*args, **kwargs) + assert callable(self.customized_func) + return self.customized_func(*args, **kwargs) + + def _default_func(self, *args, **kwargs): + assert callable(self.compute) and callable(self.schedule) + out = self.compute(*args, **kwargs) + arg_bufs = [out] + self.get_inputs(out) + s = self.schedule([out]) + return s, arg_bufs + + def get_inputs(self, out): + inputs = [] + queue = [out] + while queue: + t = queue.pop(0) + if isinstance(t.op, tensor.PlaceholderOp): + inputs.append(t) + else: + queue.extend(t.op.input_tensors) + return inputs -def register(name, func=None, override=False): - """Register a task function. +def register_task_compute(name, func=None): + """Register compute function to autotvm task Parameters ---------- - name : str - The name to identify the task. - func : callable - The function to be registered. - override : bool - Whether override existing registration. + name: str + The task name + + func: None or callable + If it is None, return a decorator. + If is callable, decorate this function. Returns ------- - func: callable - The registered function + decorator: callable + A decorator """ - def _do_reg(myf): - if name in TASK_TABLE and not override: - raise ValueError( - "Key %s is already registered" % name) - TASK_TABLE[name] = myf - return myf + def _do_reg(f): + if name not in TASK_TABLE: + TASK_TABLE[name] = TopiTemplate() + tmpl = TASK_TABLE[name] + if tmpl.compute is not None: + raise ValueError("Compute is already registered in autoTVM task %s" % name) + tmpl.compute = f + return f if func: return _do_reg(func) return _do_reg -def create(func_name, args, target, target_host=None, template_key=None): - """Create a tuning task and initialize its search space +def register_task_schedule(name, func=None): + """Register schedule function to autotvm task Parameters ---------- - func_name : str or callable - The task function - args : List - Positional arguments - target : Target - The compilation target - target_host: Target, optional - The compilation target for host side + name: str + The task name + + func: None or callable + If it is None, return a decorator. + If is callable, decorate this function. Returns ------- - tsk: Task - a task object + decorator: callable + A decorator """ - if callable(func_name): - # register this function if it is not registered before - func = func_name - func_name = func.func_name if hasattr(func, 'func_name') else func.__name__ - if func_name in TASK_TABLE: - assert func == TASK_TABLE[func_name], "Find name conflict in task registration. 
" \ - "Consider to choose another name for this task" - else: - register(func_name, func=func) - - func = TASK_TABLE[func_name] - ret = Task(func_name, args) - - if isinstance(target, str): - target = _target.create(target) - - # init config space - ret.config_space = ConfigSpace() - ret.config_space.template_key = template_key or "" - - ctx = ApplyConfig(ret.config_space) - with ctx: - with target: - sch, _ = func(*args) - ret.config_space.code_hash = getattr(sch, 'code_hash', None) + def _do_reg(f): + if name not in TASK_TABLE: + TASK_TABLE[name] = TopiTemplate() + tmpl = TASK_TABLE[name] + if tmpl.schedule is not None: + raise ValueError("Schedule is already registered in autoTVM task %s" % name) + tmpl.schedule = f + return f + if func: + return _do_reg(func) + return _do_reg - ret.workload = ctx.workload - ret.flop = ret.config_space.flop or compute_flop(sch) - ret.target = target - ret.target_host = target_host +def register_customized_task(name, func=None): + """Register a customized function to AutoTVM task. - return ret - -def args_to_workload(x, topi_compute_func=None): - """Convert argument list to hashable workload tuple. - This function will convert list to tuple, tvm node to python value and - flatten tvm.tensor.Tensor to a tuple + In most cases, you can just use register_topi_compute and register_topi_schedule + with the same task name to define an AutoTVM task. However, you can also + create a customized AutoTVM task that defines a tunable template or performs + extra layout transform before invoking compute/schedule function. Parameters ---------- - x: primitive hashable types or tensor.Tensor - The original value - topi_compute_func: topi compute function - The function name will be added as first element of the workload tuple - - Returns - ------- - ret: hashable - The hashable value - """ - if isinstance(x, tensor.Tensor): - workload = get_const_tuple(x.shape) + (x.dtype, ) - elif isinstance(x, (tuple, list, container.Array)): - workload = tuple([args_to_workload(a) for a in x]) - elif isinstance(x, (str, int, float, np.int, np.float, expr.Var)): - workload = x - elif isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)): - workload = x.value - elif x is None: - workload = 0 - else: - raise RuntimeError('Do not support type "%s" in argument. Consider to use' - 'primitive types or tvm.tir.Var only' % type(x)) - return (get_func_name(topi_compute_func), ) + workload if topi_compute_func else workload - -def template(func): - """ - Decorate a function as a tunable schedule template + name: str + The task name - Parameters - ---------- - func: callable - A callable template function. - Its argument should be hashable values. - Its return value should be a Tuple(Schedule, Array of Tensor) + func: None or callable + If it is None, return a decorator. + If is callable, decorate this function. Returns ------- - func: callable - The decorated function + decorator: callable + A decorator Examples -------- @@ -252,7 +301,7 @@ def template(func): .. 
code-block:: python - @autotvm.template + @autotvm.register_customized_task("matmul") def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -279,24 +328,57 @@ def matmul(N, L, M, dtype): return s, [A, B, C] """ - # pylint: disable=unused-variable + def _do_reg(f): + if name not in TASK_TABLE: + TASK_TABLE[name] = TopiTemplate() + tmpl = TASK_TABLE[name] + if tmpl.customized_func is not None: + raise ValueError("Customized func is already registered in autoTVM task %s" % name) + tmpl.customized_func = f + return f + if func: + return _do_reg(func) + return _do_reg + +def create(task_name, args, target, target_host=None): + """Create a tuning task and initialize its search space + + Parameters + ---------- + task_name : str + The AutoTVM task name + args : List + Positional arguments + target : Target + The compilation target + target_host: Target, optional + The compilation target for host side + + Returns + ------- + tsk: Task + a task object + """ + args = serialize_args(args) + ret = Task(task_name, args) - fname = get_func_name(func) + if isinstance(target, str): + target = _target.create(target) - @register(fname) - @dispatcher - def config_dispatcher(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - return (fname, ) + args_to_workload(args) + # init config space + ret.config_space = ConfigSpace() + + ctx = ApplyConfig(ret.config_space) + with ctx: + with target: + sch, _ = ret.func(*args) + ret.config_space.code_hash = getattr(sch, 'code_hash', None) - @config_dispatcher.register("") - def template_call(cfg, *args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - with ApplyConfig(cfg): - return func(*args, **kwargs) + ret.flop = ret.config_space.flop or compute_flop(sch) + ret.target = target + ret.target_host = target_host - config_dispatcher.func_name = fname - return config_dispatcher + return ret def get_config(): """Get current config object @@ -306,7 +388,8 @@ def get_config(): cfg: ConfigSpace or ConfigEntity The current config """ - return DispatchContext.current.query(None, None) + tgt = _target.Target.current(allow_none=True) + return DispatchContext.current.query(tgt, None) class FlopCalculationError(RuntimeError): """Error happens when estimating FLOP for a compute op""" diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 3d3a1d3d3a4e..45385fbe8f7e 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -27,47 +27,11 @@ See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ import tvm.te._ffi_api +from tvm import target as _target -from ... import tensor, placeholder - -from .task import args_to_workload, dispatcher, register -from ..util import get_const_tuple - -# A table that records all registered dispatcher for all targets -_REGISTERED_DISPATCHER = { -} - - -def serialize_args(args): - """serialize arguments of a topi function to a hashable tuple. - - Parameters - ---------- - args: list of hashable or Tensor - """ - ret = [] - for t in args: - if isinstance(t, tensor.Tensor): - ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype)) - else: - ret.append(t) - return tuple(ret) - - -def deserialize_args(args): - """The inverse function of :code:`serialize_args`. 
- - Parameters - ---------- - args: list of hashable or Tensor - """ - ret = [] - for t in args: - if isinstance(t, tuple) and t[0] == 'TENSOR': - ret.append(placeholder(shape=t[1], dtype=t[2])) - else: - ret.append(t) - return ret +from ... import tensor +from .task import args_to_workload, DispatchContext, \ + register_task_compute, register_task_schedule, serialize_args # Task extractor for relay program @@ -77,250 +41,46 @@ class TaskExtractEnv: registered = None def __init__(self, allow_duplicate=False): - # pylint: disable=import-outside-toplevel - import topi - - # topi compute -> autotvm task name - self.topi_to_task = { - topi.nn.conv2d: "topi_nn_conv2d", - topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw", - topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw", - topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw", - topi.nn.conv2d_NCHWc: "topi_x86_conv2d_NCHWc", - topi.nn.conv2d_NCHWc_int8: "topi_x86_conv2d_NCHWc_int8", - topi.nn.dense: "topi_nn_dense", - topi.nn.batch_matmul: "topi_nn_batch_matmul", - topi.nn.bitserial_conv2d_nchw: "topi_nn_bitserial_conv2d_nchw", - topi.nn.bitserial_conv2d_nhwc: "topi_nn_bitserial_conv2d_nhwc", - topi.nn.bitserial_dense: "topi_nn_bitserial_dense", - topi.nn.deformable_conv2d_nchw: "topi_nn_deformable_conv2d_nchw", - topi.nn.conv1d_transpose_ncw: "topi_nn_conv1d_transpose_ncw", - topi.nn.conv3d: "topi_nn_conv3d", - } - - self.topi_to_schedule = { - topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw, - topi.generic.schedule_conv2d_nhwc], - topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw, - topi.generic.schedule_depthwise_conv2d_nhwc], - topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw], - topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw], - topi.nn.conv2d_NCHWc: [topi.generic.schedule_conv2d_NCHWc], - topi.nn.conv2d_NCHWc_int8: [topi.generic.schedule_conv2d_NCHWc_int8], - topi.nn.dense: [topi.generic.schedule_dense], - topi.nn.batch_matmul: [topi.generic.schedule_batch_matmul], - topi.nn.bitserial_conv2d_nchw: [topi.generic.schedule_bitserial_conv2d_nchw], - topi.nn.bitserial_conv2d_nhwc: [topi.generic.schedule_bitserial_conv2d_nhwc], - topi.nn.bitserial_dense: [topi.generic.schedule_bitserial_dense], - topi.nn.deformable_conv2d_nchw: [topi.generic.schedule_deformable_conv2d_nchw], - topi.nn.conv1d_transpose_ncw: [topi.generic.schedule_conv1d_transpose_ncw], - topi.nn.conv3d: [topi.generic.schedule_conv3d_ndhwc], - } - - # function reflection for tracing - self.func_to_reflection = { - topi.nn.conv2d: lambda x: setattr(topi.nn, 'conv2d', x), - topi.nn.conv2d_NCHWc: lambda x: setattr(topi.nn, 'conv2d_NCHWc', x), - topi.nn.conv2d_NCHWc_int8: lambda x: setattr(topi.nn, 'conv2d_NCHWc_int8', x), - topi.nn.depthwise_conv2d_nchw: lambda x: setattr(topi.nn, 'depthwise_conv2d_nchw', x), - topi.nn.group_conv2d_nchw: lambda x: setattr(topi.nn, 'group_conv2d_nchw', x), - topi.nn.conv2d_transpose_nchw: lambda x: setattr(topi.nn, 'conv2d_transpose_nchw', x), - topi.nn.dense: lambda x: setattr(topi.nn, 'dense', x), - topi.nn.batch_matmul: lambda x: setattr(topi.nn, 'batch_matmul', x), - topi.nn.bitserial_conv2d_nchw: lambda x: setattr(topi.nn, 'bitserial_conv2d_nchw', x), - topi.nn.bitserial_conv2d_nhwc: lambda x: setattr(topi.nn, 'bitserial_conv2d_nhwc', x), - topi.nn.bitserial_dense: lambda x: setattr(topi.nn, 'bitserial_dense', x), - topi.nn.deformable_conv2d_nchw: lambda x: setattr(topi.nn, 'deformable_conv2d_nchw', x), - topi.nn.conv1d_transpose_ncw: lambda 
x: setattr(topi.nn, 'conv1d_transpose_ncw', x), - topi.nn.conv3d: lambda x: setattr(topi.nn, 'conv3d', x), - } - self.allow_duplicate = allow_duplicate - self._register_topi_task() self.task_collection = [] - self.wanted_topi_funcs = list(self.topi_to_task.keys()) + self.wanted_relay_ops = None self.modified_funcs = [] + self.tracing = False def __enter__(self): self.task_collection = [] - self.modified_funcs = [] - - for topi_compute in self.wanted_topi_funcs: - def _local_scope(compute_func): - """start a scope to hold the local function in for loop""" - - def _tracing_wrapper(*args, **kwargs): - assert not kwargs, "Do not support extracting tuning tasks when " \ - "kwargs is used in TOPI function call. " \ - "Please modify it to use only positional args." - key = (self.topi_to_task[compute_func], serialize_args(args)) - if self.allow_duplicate or key not in self.task_collection: - self.task_collection.append(key) - - return compute_func(*args, **kwargs) - - self.func_to_reflection[compute_func](_tracing_wrapper) - self.modified_funcs.append(compute_func) - - _local_scope(topi_compute) + self.tracing = True return self def __exit__(self, exc_type, exc_val, exc_tb): - # revert modification - for func in self.modified_funcs: - self.func_to_reflection[func](func) - - def _register_topi_task(self): - """register tuning wrapper for topi function""" - # pylint: disable=import-outside-toplevel - import topi - - # Avoid double registration for certain targets - if TaskExtractEnv.registered: - return - TaskExtractEnv.registered = True - - # Tuning wrapper for topi functions - @register("topi_nn_conv2d") - def _topi_nn_conv2d(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - layout = args[-2] - C = topi.nn.conv2d(*args, **kwargs) - if layout == 'NCHW': - s = topi.generic.schedule_conv2d_nchw([C]) - elif layout == 'HWCN': - s = topi.generic.schedule_conv2d_hwcn([C]) - elif layout == 'NHWC': - s = topi.generic.schedule_conv2d_nhwc([C]) - else: - raise ValueError("Unsupported layout {}".format(layout)) - return s, [A, W, C] - - @register("topi_nn_depthwise_conv2d_nchw") - def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs) - s = topi.generic.schedule_depthwise_conv2d_nchw([C]) - return s, [A, W, C] - - @register("topi_nn_group_conv2d_nchw") - def _topi_nn_group_conv2d_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.group_conv2d_nchw(*args, **kwargs) - s = topi.generic.schedule_group_conv2d_nchw([C]) - return s, [A, W, C] - - @register("topi_nn_conv2d_transpose_nchw") - def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv2d_transpose_nchw(*args, **kwargs) - s = topi.generic.schedule_conv2d_transpose_nchw([C]) - return s, [A, W, C] - - @register("topi_nn_conv1d_transpose_ncw") - def _topi_nn_conv1d_transpose_ncw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv1d_transpose_ncw(*args, **kwargs) - s = topi.generic.schedule_conv1d_transpose_ncw([C]) - return s, [A, W, C] - - 
@register("topi_nn_conv3d") - def _topi_nn_conv3d(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv3d(*args, **kwargs) - s = topi.generic.schedule_conv3d_ndhwc([C]) - return s, [A, W, C] - - @register("topi_nn_dense") - def _topi_nn_dense(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - if len(args) > 2: - data, weight, bias = args[:3] - else: - data, weight = args - bias = None - C = topi.nn.dense(*args, **kwargs) - s = topi.generic.schedule_dense([C]) - if bias is not None: - return s, [data, weight, bias, C] - return s, [data, weight, C] - - @register("topi_nn_batch_matmul") - def _topi_nn_batch_matmul(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, B = args - C = topi.nn.batch_matmul(A, B) - s = topi.generic.schedule_batch_matmul([C]) - return s, [A, B, C] - - @register("topi_nn_bitserial_conv2d_nhwc") - def _topi_bitserial_conv2d_nhwc(*args, **kwargs): - args = deserialize_args(args) - C = topi.nn.bitserial_conv2d_nhwc(*args, **kwargs) - s = topi.generic.nn.schedule_bitserial_conv2d_nhwc([C]) - A, W = args[:2] - return s, [A, W, C] - - @register("topi_nn_bitserial_conv2d_nchw") - def _topi_bitserial_conv2d_nchw(*args, **kwargs): - args = deserialize_args(args) - C = topi.nn.bitserial_conv2d_nchw(*args, **kwargs) - s = topi.generic.nn.schedule_bitserial_conv2d_nchw([C]) - A, W = args[:2] - return s, [A, W, C] - - @register("topi_nn_bitserial_dense") - def _topi_nn_bitserial_dense(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.bitserial_dense(*args, **kwargs) - s = topi.generic.schedule_bitserial_dense([C]) - return s, [A, W, C] - - @register("topi_nn_deformable_conv2d_nchw") - def _topi_nn_deformable_conv2d_nchw(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, Offset, W = args[:3] - C = topi.nn.deformable_conv2d_nchw(*args, **kwargs) - s = topi.generic.schedule_deformable_conv2d_nchw([C]) - return s, [A, Offset, W, C] - - @register("topi_nn_conv2d_NCHWc") - def _topi_nn_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - A, W = args[:2] - C = topi.nn.conv2d_NCHWc(*args, **kwargs) - s = topi.generic.schedule_conv2d_NCHWc([C]) - return s, [A, W, C] + self.tracing = False - def reset(self, wanted_topi_funcs): + def reset(self, wanted_relay_ops=None): """Reset task collections Parameters ---------- - wanted_topi_funcs: List of function - The topi function to be extracted + wanted_relay_ops: List of relay.op.Op + The relay ops to be extracted """ self.task_collection = [] - self.wanted_topi_funcs = wanted_topi_funcs + self.wanted_relay_ops = wanted_relay_ops + + def add_task(self, task_name, args): + """Add AutoTVM task + + Parameters + ---------- + task_name: str + AutoTVM task name. + + args: tuple + Arguments to the TOPI function. 
+ """ + key = (task_name, serialize_args(args)) + if self.allow_duplicate or key not in self.task_collection: + self.task_collection.append(key) def get_tasks(self): """Get collected tasks @@ -355,26 +115,19 @@ def get(allow_duplicate=False): return TaskExtractEnv.current -def register_topi_compute(topi_compute, target_keys, template_keys, func=None, override=False): +def register_topi_compute(task_name, func=None): """Register a tunable template for a topi compute function. - After the registration, this topi compute will become a configuration dispatcher. It uses - all its argument as workload and dispatches configurations according to the input workload. - - It also stores this "workload" to its final ComputeOp, which can be used to reconstruct + The registration will wrap this topi compute to take `cfg` as the first argument, + followed by the original argument list. It uses all its argument as workload and + stores this "workload" to its final ComputeOp, which can be used to reconstruct "workload" in the following topi_schedule call. Parameters ---------- - topi_compute: GenericFunc - The topi compute function that will be overloaded - target_keys: str or list of str - The compilation target. The same as the argument of GenericFunc.register. - template_keys: str or list of str - The template key. - We might have several strategies for a single operator (e.g. direct, im2col, winograd). - The template key is used to identity the algorithm strategy. - Every operator must have a "direct" template, which is used by default. + task_name: str + The AutoTVM task name + func: None or callable If it is None, return a decorator. If is callable, decorate this function. @@ -388,81 +141,63 @@ def register_topi_compute(topi_compute, target_keys, template_keys, func=None, o -------- See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. 
""" - def _decorator(f): - targets = [target_keys] if isinstance(target_keys, str) else target_keys - for target_key in targets: - if target_key not in _REGISTERED_DISPATCHER: - _REGISTERED_DISPATCHER[target_key] = {} - if topi_compute not in _REGISTERED_DISPATCHER[target_key]: - @topi_compute.register(target_key) - @dispatcher - def config_dispatcher(*args, **kwargs): - """override topi call as a config dispatcher""" - assert not kwargs, "Do not support kwargs in template function call" - return args_to_workload(args, topi_compute) - _REGISTERED_DISPATCHER[target_key][topi_compute] = config_dispatcher - - config_dispatcher = _REGISTERED_DISPATCHER[target_key][topi_compute] - - @config_dispatcher.register(template_keys, override=override) - def template_call(cfg, *args, **kwargs): - """call the topi func and attach workload to compute node""" - assert not kwargs, "Do not support kwargs in template function call" - - if f == topi_compute.fdefault: - node = f(*args, **kwargs) - else: - node = f(cfg, *args, **kwargs) - - # attach workload to return op - op = node.op - attrs = {} - for k, v in node.op.attrs.items(): - attrs[k] = v - attrs['workload'] = args_to_workload(args, topi_compute) - if isinstance(op, tensor.ComputeOp): - op = tvm.te._ffi_api.ComputeOp( - op.name, op.tag, attrs, op.axis, op.body) - elif isinstance(op, tensor.ExternOp): - op = tvm.te._ffi_api.ExternOp( - op.name, op.tag, attrs, - op.inputs, op.input_placeholders, - op.output_placeholders, op.body) - else: - raise RuntimeError("Unsupported op type: " + str(type(op))) - - if isinstance(node, tensor.Tensor): - return op.output(0) - return [op.output(i) for i in range(len(node))] - - return f + def _decorate(topi_compute): + @register_task_compute(task_name) + def wrapper(*args, **kwargs): + """wrapper function for topi compute""" + assert not kwargs, "Do not support kwargs in template function call" + task_env = TaskExtractEnv.current + if task_env is not None and task_env.tracing: + task_env.add_task(task_name, args) + workload = args_to_workload(args, task_name) + tgt = _target.Target.current() + cfg = DispatchContext.current.query(tgt, workload) + node = topi_compute(cfg, *args) + + # attach workload to return op + op = node.op + attrs = {} + for k, v in node.op.attrs.items(): + attrs[k] = v + attrs['workload'] = workload + if isinstance(op, tensor.ComputeOp): + op = tvm.te._ffi_api.ComputeOp( + op.name, op.tag, attrs, op.axis, op.body) + elif isinstance(op, tensor.ExternOp): + op = tvm.te._ffi_api.ExternOp( + op.name, op.tag, attrs, + op.inputs, op.input_placeholders, + op.output_placeholders, op.body) + else: + raise RuntimeError("Unsupported op type: " + str(type(op))) - if func: - _decorator(func) + if isinstance(node, tensor.Tensor): + return op.output(0) + return [op.output(i) for i in range(len(node))] - return _decorator + return wrapper + + if func: + return _decorate(func) + return _decorate -def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None, override=False): +def register_topi_schedule(task_name, func=None): """Register a tunable template for a topi schedule function. - After the registration. This topi schedule will become a configuration dispatcher. It dispatches - configurations according to the input workload. + The registration will wrap this topi schedule to take `cfg` as the first argument, + followed by the original argument list. Note that this function will try to find "workload" from all the ComputeOp in the input. 
You can attach "workload" to your compute op by using :any:`register_topi_compute`. + The task name has to be the same as that of the corresponding topi compute function. + Parameters ---------- - topi_schedule: GenericFunc - The topi schedule function that will be overloaded - target_keys: str or list of str - The compilation target - template_keys: str or list of str - The template key. - We might have several strategies for a single operator (e.g. direct, im2col, winograd). - The template key is used to identity the algorithm strategy. - Every operator must have a "direct" template, which is used by default. + task_name: str + The AutoTVM task name + func: None or callable If it is None, return a decorator. If is callable, decorate this function. @@ -476,49 +211,33 @@ def register_topi_schedule(topi_schedule, target_keys, template_keys, func=None, -------- See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ - def _decorator(f): - targets = [target_keys] if isinstance(target_keys, str) else target_keys - for target_key in targets: - if target_key not in _REGISTERED_DISPATCHER: - _REGISTERED_DISPATCHER[target_key] = {} - if topi_schedule not in _REGISTERED_DISPATCHER[target_key]: - @topi_schedule.register(target_key) - @dispatcher - def config_dispatcher(outs, *args, **kwargs): - """override topi call as a workload dispatcher""" - def traverse(tensors): - """traverse all ops to find attached workload""" - for t in tensors: - op = t.op - if 'workload' in op.attrs: - return op.attrs['workload'] - wkl = traverse(op.input_tensors) - if wkl: - return wkl - return None - - outs = [outs] if isinstance(outs, tensor.Tensor) else outs - workload = traverse(outs) - - if workload is None: - raise RuntimeError("Cannot find workload in attribute of this schedule") - - return args_to_workload(workload) - - _REGISTERED_DISPATCHER[target_key][topi_schedule] = config_dispatcher - - config_dispatcher = _REGISTERED_DISPATCHER[target_key][topi_schedule] - - @config_dispatcher.register(template_keys, override=override) - def template_call(cfg, outs, *args, **kwargs): - """call the schedule func""" - if f == topi_schedule.fdefault: - return f(outs, *args, **kwargs) - return f(cfg, outs, *args, **kwargs) - - return f - + def _decorate(topi_schedule): + @register_task_schedule(task_name) + def wrapper(outs, *args, **kwargs): + """wrapper function for topi schedule""" + workload = get_workload(outs) + if workload is None: + raise RuntimeError("Cannot find workload in attribute of this schedule") + tgt = _target.Target.current() + cfg = DispatchContext.current.query(tgt, workload) + return topi_schedule(cfg, outs, *args, **kwargs) + return wrapper if func: - _decorator(func) - - return _decorator + return _decorate(func) + return _decorate + + +def get_workload(outs): + """Retrieve the workload from outputs""" + def traverse(tensors): + """traverse all ops to find attached workload""" + for t in tensors: + op = t.op + if 'workload' in op.attrs: + return args_to_workload(op.attrs['workload']) + wkl = traverse(op.input_tensors) + if wkl: + return wkl + return None + outs = [outs] if isinstance(outs, tensor.Tensor) else outs + return traverse(outs) diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index e1a7d86695f2..f13ba5289ce5 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -46,16 +46,16 @@ # the version of each package PACKAGE_VERSION = { - 'arm_cpu': "v0.04", - 'llvm': "v0.03", + 'arm_cpu': "v0.06", + 'llvm': "v0.04", - 'cuda': 
"v0.06", - 'rocm': "v0.03", - 'opencl': "v0.03", - 'mali': "v0.05", - 'intel_graphics': "v0.01", + 'cuda': "v0.08", + 'rocm': "v0.04", + 'opencl': "v0.04", + 'mali': "v0.06", + 'intel_graphics': "v0.02", - 'vta': "v0.06", + 'vta': "v0.08", } logger = logging.getLogger('autotvm') @@ -189,7 +189,7 @@ def download_package(tophub_location, package_name): # global cache for load_reference_log REFERENCE_LOG_CACHE = {} -def load_reference_log(backend, model, workload_name, template_key): +def load_reference_log(backend, model, workload_name): """ Load reference log from TopHub to support fallback in template. Template will use these reference logs to choose fallback config. @@ -201,8 +201,6 @@ def load_reference_log(backend, model, workload_name, template_key): The name of the device model workload_name: str The name of the workload. (The first item in the workload tuple) - template_key: str - The template key """ backend = _alias(backend) @@ -211,7 +209,7 @@ def load_reference_log(backend, model, workload_name, template_key): filename = os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name) global REFERENCE_LOG_CACHE - key = (backend, model, workload_name, template_key) + key = (backend, model, workload_name) if key not in REFERENCE_LOG_CACHE: tmp = [] @@ -233,8 +231,7 @@ def load_reference_log(backend, model, workload_name, template_key): model = max(counts.items(), key=lambda k: k[1])[0] for inp, res in load_from_file(filename): - if (model == inp.target.model and inp.task.workload[0] == workload_name and - inp.config.template_key == template_key): + if model == inp.target.model and inp.task.workload[0] == workload_name: tmp.append((inp, res)) REFERENCE_LOG_CACHE[key] = tmp diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py index 882b0ad19dd5..305244808a33 100644 --- a/python/tvm/autotvm/tuner/xgboost_cost_model.py +++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py @@ -219,8 +219,7 @@ def fit_log(self, records, plan_size): # filter data, only pick the data with a same task data = [] for inp, res in records: - if inp.task.name == self.task.name and \ - inp.config.template_key == self.task.config_space.template_key: + if inp.task.name == self.task.name: data.append((inp, res)) logger.debug("XGB load %d entries from history log file", len(data)) diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 4eedd23faa1c..6466dff6c5df 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -14,18 +14,30 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=len-as-condition,no-else-return,invalid-name """Backend code generation engine.""" from __future__ import absolute_import +import logging +import numpy as np +import tvm from ..base import register_relay_node, Object from ... import target as _target +from ... import autotvm from .. import expr as _expr +from .. import op as _op +from .. import ty as _ty from . import _backend +logger = logging.getLogger('compile_engine') + + @register_relay_node -class CachedFunc(Object): - """Low-level tensor function to back a relay primitive function. 
- """ +class LoweredOutput(Object): + """Lowered output""" + def __init__(self, outputs, implement): + self.__init_handle_by_constructor__( + _backend._make_LoweredOutput, outputs, implement) @register_relay_node @@ -63,6 +75,191 @@ def _get_cache_key(source_func, target): return source_func +def get_shape(shape): + """Convert the shape to correct dtype and vars.""" + ret = [] + for dim in shape: + if isinstance(dim, tvm.expr.IntImm): + val = int(dim) + assert val <= np.iinfo(np.int32).max + ret.append(tvm.expr.IntImm("int32", val)) + elif isinstance(dim, tvm.expr.Any): + ret.append(tvm.var("any_dim", "int32")) + else: + ret.append(dim) + return ret + + +def get_valid_implementations(op, attrs, inputs, out_type, target): + """Get all valid implementations from the op strategy. + + Note that this function doesn't support op with symbolic input shapes. + + Parameters + ---------- + op : relay.op.Op + Relay operator. + + attrs : object + The op attribute. + + inputs : List[tvm.Tensor] + Input tensors to the op. + + out_type : relay.Type + The output type. + + target : tvm.target.Target + The target to compile the op. + + Returns + ------- + ret : List[relay.op.OpImplementation] + The list of all valid op implementations. + """ + fstrategy = op.get_attr("FTVMStrategy") + assert fstrategy is not None, "%s doesn't have FTVMStrategy registered" % op.name + with target: + strategy = fstrategy(attrs, inputs, out_type, target) + analyzer = tvm.arith.Analyzer() + ret = [] + for spec in strategy.specializations: + if spec.condition: + # check if all the clauses in the specialized condition are true + flag = True + for clause in spec.condition.clauses: + clause = analyzer.canonical_simplify(clause) + if isinstance(clause, tvm.expr.IntImm) and clause.value: + continue + flag = False + break + if flag: + for impl in spec.implementations: + ret.append(impl) + else: + for impl in spec.implementations: + ret.append(impl) + return ret + + +def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True): + """Select the best implementation from the op strategy. + + If use_autotvm is True, it'll first try to find the best implementation + based on AutoTVM profile results. If no AutoTVM profile result is found, + it'll choose the implementation with highest plevel. + + If use_autotvm is False, it'll directly choose the implementation with + highest plevel. + + Note that this function doesn't support op with symbolic input shapes. + + Parameters + ---------- + op : relay.op.Op + Relay operator. + + attrs : object + The op attribute. + + inputs : List[tvm.Tensor] + Input tensors to the op. + + out_type : relay.Type + The output type. + + target : tvm.target.Target + The target to compile the op. + + use_autotvm : bool + Whether query AutoTVM to pick the best. + + Returns + ------- + ret : tuple(relay.op.OpImplementation, List[tvm.Tensor]) + The best op implementation and the corresponding output tensors. 
+ """ + all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) + + best_plevel_impl = None + for impl in all_impls: + if best_plevel_impl is None or impl.plevel > best_plevel_impl.plevel: + best_plevel_impl = impl + if not use_autotvm: + outs = best_plevel_impl.compute(attrs, inputs, out_type) + return best_plevel_impl, outs + + outputs = {} + best_autotvm_impl = None + best_cfg = None + dispatch_ctx = autotvm.task.DispatchContext.current + for impl in all_impls: + outs = impl.compute(attrs, inputs, out_type) + outputs[impl] = outs + workload = autotvm.task.get_workload(outs) + if workload is None: + continue + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: + # It's a fallback config + continue + if best_cfg is None or best_cfg.cost > cfg.cost: + best_autotvm_impl = impl + best_cfg = cfg + if best_autotvm_impl: + return best_autotvm_impl, outputs[best_autotvm_impl] + return best_plevel_impl, outputs[best_plevel_impl] + + +@tvm._ffi.register_func("relay.backend.lower_call") +def lower_call(call, inputs, target): + """Lower the call expression to op implementation and tensor outputs.""" + assert isinstance(call.op, _op.Op) + op = call.op + + # Prepare the call_node->checked_type(). For the call node inputs, we ensure that + # the shape is Int32. Following code ensures the same for the output as well. + # TODO(@icemelon9): Support recursive tuple + ret_type = call.checked_type + if isinstance(ret_type, _ty.TensorType): + ret_type = _ty.TensorType(get_shape(ret_type.shape), ret_type.dtype) + elif isinstance(ret_type, _ty.TupleType): + new_fields = [] + for field in ret_type.fields: + if isinstance(field, _ty.TensorType): + new_fields.append(_ty.TensorType(get_shape(field.shape), field.dtype)) + else: + new_fields.append(field) + ret_type = _ty.TupleType(new_fields) + + is_dyn = _ty.type_has_any(call.checked_type) + for arg in call.args: + is_dyn = is_dyn or _ty.type_has_any(arg.checked_type) + + # check if in the AutoTVM tracing mode, and disable if op is not in wanted list + env = autotvm.task.TaskExtractEnv.current + reenable_tracing = False + if env is not None and env.tracing: + if env.wanted_relay_ops is not None and op not in env.wanted_relay_ops: + env.tracing = False + reenable_tracing = True + + if not is_dyn: + best_impl, outputs = select_implementation( + op, call.attrs, inputs, ret_type, target) + logger.info("Use implementation %s for op %s", best_impl.name, op.name) + else: + # TODO(@icemelon9): Allow tvm to generate multiple kernels for dynamic shapes. + # Currently, we just use the implementation with highest plevel + best_impl, outputs = select_implementation( + op, call.attrs, inputs, ret_type, target, use_autotvm=False) + + # re-enable AutoTVM tracing + if reenable_tracing: + env.tracing = True + return LoweredOutput(outputs, best_impl) + + @register_relay_node class CompileEngine(Object): """CompileEngine to get lowered code. diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py index f492c743173c..8d6923920979 100644 --- a/python/tvm/relay/expr_functor.py +++ b/python/tvm/relay/expr_functor.py @@ -131,22 +131,22 @@ class ExprVisitor(ExprFunctor): The default behavior recursively traverses the AST. 
""" - def visit_tuple(self, t): - for x in t.fields: + def visit_tuple(self, tup): + for x in tup.fields: self.visit(x) - def visit_call(self, c): - self.visit(c.op) - for a in c.args: + def visit_call(self, call): + self.visit(call.op) + for a in call.args: self.visit(a) - def visit_var(self, v): + def visit_var(self, var): pass - def visit_let(self, l): - self.visit(l.var) - self.visit(l.value) - self.visit(l.body) + def visit_let(self, let): + self.visit(let.var) + self.visit(let.value) + self.visit(let.body) def visit_function(self, f): self.visit(f.body) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 587b07673fbe..5532e3a5c1a4 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -311,6 +311,7 @@ def _impl(inputs, attr, params): flip_layout = True if attr['data_format'] == 'NHWC': + in_channels = input_shape[3] kernel_h, kernel_w, _, depth_mult = weights_shape attr['kernel_shape'] = (weights_shape[0], weights_shape[1]) if opname == 'conv': @@ -324,6 +325,7 @@ def _impl(inputs, attr, params): attr['dilations'] = (attr['dilations'][1], attr['dilations'][2]) attr['strides'] = (attr['strides'][1], attr['strides'][2]) elif attr['data_format'] == 'NCHW': + in_channels = input_shape[1] _, depth_mult, kernel_h, kernel_w = weights_shape attr['kernel_shape'] = (weights_shape[2], weights_shape[3]) if opname == 'conv': @@ -344,7 +346,7 @@ def _impl(inputs, attr, params): raise tvm.error.OpAttributeInvalid(msg.format(attr['data_format'])) if opname == 'depthwise': - attr['groups'] = attr['channels'] + attr['groups'] = in_channels # Fix padding attr['padding'] = attr['padding'].decode("utf-8") diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index dd3587125aec..352bc6302ee0 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -1156,7 +1156,7 @@ def convert_conv(self, op, conv_type): if is_depthwise_conv: params['channels'] = int(in_channels) - params['groups'] = int(in_channels) + params['groups'] = int(input_c) params['kernel_layout'] = 'HWOI' else: params['channels'] = int(output_channels) diff --git a/python/tvm/relay/memory_alloc.py b/python/tvm/relay/memory_alloc.py index d61c6f1d6fba..f8e981121031 100644 --- a/python/tvm/relay/memory_alloc.py +++ b/python/tvm/relay/memory_alloc.py @@ -28,8 +28,8 @@ def is_primitive(call): - return hasattr(call.op, 'attrs') and hasattr(call.op.attrs, 'Primitive') and \ - int(call.op.attrs.Primitive) == 1 + return hasattr(call, 'op') and hasattr(call.op, 'attrs') and \ + hasattr(call.op.attrs, 'Primitive') and int(call.op.attrs.Primitive) == 1 # TODO(@jroesch): port to c++ and unify with existing code class LinearizeRetType: diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index bcd58ba5b1b1..7427c63a14c1 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -17,9 +17,10 @@ #pylint: disable=wildcard-import, redefined-builtin """Relay core operators.""" # operator defs -from .op import get, register, register_schedule, register_compute, register_gradient, \ +from .op import get, register, register_compute, register_gradient, \ register_pattern, register_alter_op_layout, register_legalize, \ - schedule_injective, Op, OpPattern, debug + Op, OpPattern, OpStrategy, debug +from . 
import strategy # Operators from .reduce import * diff --git a/python/tvm/relay/op/_algorithm.py b/python/tvm/relay/op/_algorithm.py index 09746be13e30..e1e6fd3a1139 100644 --- a/python/tvm/relay/op/_algorithm.py +++ b/python/tvm/relay/op/_algorithm.py @@ -18,48 +18,14 @@ # pylint: disable=invalid-name,unused-argument from __future__ import absolute_import -import topi -from topi.util import get_const_int -from ..op import OpPattern, register_compute, register_schedule, register_pattern - - -@register_schedule("argsort") -def schedule_argsort(_, outs, target): - """Schedule definition of argsort""" - with target: - return topi.generic.schedule_argsort(outs) - - -@register_compute("argsort") -def compute_argsort(attrs, inputs, _, target): - """Compute definition of argsort""" - axis = get_const_int(attrs.axis) - is_ascend = bool(get_const_int(attrs.is_ascend)) - dtype = attrs.dtype - return [topi.argsort(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)] - +from . import strategy +from .op import OpPattern, register_pattern +from .op import register_strategy +# argsort +register_strategy("argsort", strategy.argsort_strategy) register_pattern("argsort", OpPattern.OPAQUE) - -@register_schedule("topk") -def schedule_topk(_, outs, target): - """Schedule definition of argsort""" - with target: - return topi.generic.schedule_topk(outs) - - -@register_compute("topk") -def compute_topk(attrs, inputs, _, target): - """Compute definition of argsort""" - k = get_const_int(attrs.k) - axis = get_const_int(attrs.axis) - ret_type = attrs.ret_type - is_ascend = bool(get_const_int(attrs.is_ascend)) - dtype = attrs.dtype - out = topi.topk(inputs[0], k, axis, ret_type, is_ascend, dtype) - out = out if isinstance(out, list) else [out] - return out - - +# topk +register_strategy("topk", strategy.topk_strategy) register_pattern("topk", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index 43f71c0aa679..9d52ed3af777 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -17,33 +17,21 @@ """Backend compiler related feature registration""" from __future__ import absolute_import -import topi - from topi.util import get_const_int, get_const_tuple from . 
import op as _reg from ...api import convert from ...hybrid import script - -def _schedule_reduce(_, outs, target): - """Generic schedule for reduce""" - with target: - return topi.generic.schedule_reduce(outs) - - -_reg.register_schedule("argmax", _schedule_reduce) -_reg.register_schedule("argmin", _schedule_reduce) -_reg.register_schedule("sum", _schedule_reduce) -_reg.register_schedule("all", _schedule_reduce) -_reg.register_schedule("any", _schedule_reduce) -_reg.register_schedule("max", _schedule_reduce) -_reg.register_schedule("min", _schedule_reduce) -_reg.register_schedule("prod", _schedule_reduce) -_reg.register_schedule("mean", _schedule_reduce) -_reg.register_schedule("variance", _schedule_reduce) -_reg.register_schedule("nn.cross_entropy", _schedule_reduce) -_reg.register_schedule("nn.cross_entropy_with_logits", _schedule_reduce) - +_reg.register_reduce_schedule("argmax") +_reg.register_reduce_schedule("argmin") +_reg.register_reduce_schedule("sum") +_reg.register_reduce_schedule("all") +_reg.register_reduce_schedule("any") +_reg.register_reduce_schedule("max") +_reg.register_reduce_schedule("min") +_reg.register_reduce_schedule("prod") +_reg.register_reduce_schedule("mean") +_reg.register_reduce_schedule("variance") def _create_axis_record(attrs, inputs): axes = attrs.axis if attrs.axis is None else list(get_const_tuple(attrs.axis)) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index c1d02bd56d1b..7c8ccb7dd827 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -19,101 +19,99 @@ from __future__ import absolute_import import topi from topi.util import get_const_tuple -from .op import register_compute, register_schedule, register_pattern, register_shape_func -from .op import schedule_injective, OpPattern +from .op import register_compute, register_shape_func +from .op import register_broadcast_schedule, register_injective_schedule +from .op import register_pattern, OpPattern from ...hybrid import script from ...api import convert -schedule_broadcast = schedule_injective -schedule_elemwise = schedule_injective - -register_schedule("log", schedule_broadcast) -register_schedule("cos", schedule_broadcast) -register_schedule("sin", schedule_broadcast) -register_schedule("atan", schedule_broadcast) -register_schedule("exp", schedule_broadcast) -register_schedule("erf", schedule_broadcast) -register_schedule("sqrt", schedule_broadcast) -register_schedule("rsqrt", schedule_broadcast) -register_schedule("sigmoid", schedule_broadcast) -register_schedule("floor", schedule_broadcast) -register_schedule("ceil", schedule_broadcast) -register_schedule("trunc", schedule_broadcast) -register_schedule("round", schedule_broadcast) -register_schedule("sign", schedule_broadcast) -register_schedule("abs", schedule_broadcast) -register_schedule("tanh", schedule_broadcast) -register_schedule("logical_not", schedule_broadcast) -register_schedule("bitwise_not", schedule_broadcast) -register_schedule("negative", schedule_broadcast) -register_schedule("copy", schedule_broadcast) - -register_schedule("add", schedule_broadcast) -register_schedule("subtract", schedule_broadcast) -register_schedule("multiply", schedule_broadcast) -register_schedule("divide", schedule_broadcast) -register_schedule("floor_divide", schedule_broadcast) -register_schedule("power", schedule_injective) -register_schedule("mod", schedule_broadcast) -register_schedule("floor_mod", schedule_broadcast) -register_schedule("logical_and", schedule_broadcast) 
-register_schedule("logical_or", schedule_broadcast) -register_schedule("bitwise_and", schedule_broadcast) -register_schedule("bitwise_or", schedule_broadcast) -register_schedule("bitwise_xor", schedule_broadcast) -register_schedule("equal", schedule_broadcast) -register_schedule("not_equal", schedule_broadcast) -register_schedule("less", schedule_broadcast) -register_schedule("less_equal", schedule_broadcast) -register_schedule("greater", schedule_broadcast) -register_schedule("greater_equal", schedule_broadcast) -register_schedule("maximum", schedule_injective) -register_schedule("minimum", schedule_injective) -register_schedule("right_shift", schedule_injective) -register_schedule("left_shift", schedule_injective) -register_schedule("shape_of", schedule_injective) + +register_broadcast_schedule("log") +register_broadcast_schedule("cos") +register_broadcast_schedule("sin") +register_broadcast_schedule("atan") +register_broadcast_schedule("exp") +register_broadcast_schedule("erf") +register_broadcast_schedule("sqrt") +register_broadcast_schedule("rsqrt") +register_broadcast_schedule("sigmoid") +register_broadcast_schedule("floor") +register_broadcast_schedule("ceil") +register_broadcast_schedule("trunc") +register_broadcast_schedule("round") +register_broadcast_schedule("sign") +register_broadcast_schedule("abs") +register_broadcast_schedule("tanh") +register_broadcast_schedule("add") +register_broadcast_schedule("subtract") +register_broadcast_schedule("multiply") +register_broadcast_schedule("divide") +register_broadcast_schedule("floor_divide") +register_broadcast_schedule("power") +register_broadcast_schedule("copy") +register_broadcast_schedule("logical_not") +register_broadcast_schedule("logical_and") +register_broadcast_schedule("logical_or") +register_broadcast_schedule("bitwise_not") +register_broadcast_schedule("bitwise_and") +register_broadcast_schedule("bitwise_or") +register_broadcast_schedule("bitwise_xor") +register_broadcast_schedule("negative") +register_broadcast_schedule("mod") +register_broadcast_schedule("floor_mod") +register_broadcast_schedule("equal") +register_broadcast_schedule("not_equal") +register_broadcast_schedule("less") +register_broadcast_schedule("less_equal") +register_broadcast_schedule("greater") +register_broadcast_schedule("greater_equal") +register_injective_schedule("maximum") +register_injective_schedule("minimum") +register_injective_schedule("right_shift") +register_injective_schedule("left_shift") +register_injective_schedule("shape_of") # zeros @register_compute("zeros") -def zeros_compute(attrs, inputs, output_type, target): +def zeros_compute(attrs, inputs, output_type): assert not inputs return [topi.full(output_type.shape, output_type.dtype, 0.0)] -register_schedule("zeros", schedule_broadcast) +register_broadcast_schedule("zeros") register_pattern("zeros", OpPattern.ELEMWISE) # zeros_like @register_compute("zeros_like") -def zeros_like_compute(attrs, inputs, output_type, target): +def zeros_like_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.full_like(inputs[0], 0.0)] -register_schedule("zeros_like", schedule_broadcast) +register_broadcast_schedule("zeros_like") # ones @register_compute("ones") -def ones_compute(attrs, inputs, output_type, target): +def ones_compute(attrs, inputs, output_type): assert not inputs return [topi.full(output_type.shape, output_type.dtype, 1.0)] -register_schedule("ones", schedule_broadcast) +register_broadcast_schedule("ones") register_pattern("ones", OpPattern.ELEMWISE) # ones_like 
@register_compute("ones_like") -def ones_like(attrs, inputs, output_type, target): +def ones_like_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.full_like(inputs[0], 1.0)] -register_schedule("ones_like", schedule_broadcast) +register_broadcast_schedule("ones_like") # clip @register_compute("clip") -def clip_compute(attrs, inputs, output_type, target): +def clip_compute(attrs, inputs, output_type): assert len(inputs) == 1 return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)] -register_schedule("clip", schedule_elemwise) +register_injective_schedule("clip") @script def _cast_shape_function(x): @@ -198,6 +196,7 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("floor_mod", False, broadcast_shape_func) register_shape_func("logical_and", False, broadcast_shape_func) register_shape_func("logical_or", False, broadcast_shape_func) +register_shape_func("bitwise_not", False, broadcast_shape_func) register_shape_func("bitwise_and", False, broadcast_shape_func) register_shape_func("bitwise_or", False, broadcast_shape_func) register_shape_func("bitwise_xor", False, broadcast_shape_func) diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index e6053b887d38..42c94349da8c 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -21,52 +21,74 @@ import topi from topi.util import get_const_int, get_const_tuple from . import op as _reg -from ._reduce import _schedule_reduce +from . import strategy from .op import OpPattern from ...hybrid import script from ...api import convert -schedule_injective = _reg.schedule_injective -schedule_broadcast = _reg.schedule_injective -schedule_concatenate = _reg.schedule_concatenate - - -_reg.register_schedule("collapse_sum_like", _schedule_reduce) -_reg.register_schedule("broadcast_to", schedule_broadcast) -_reg.register_schedule("broadcast_to_like", schedule_broadcast) -_reg.register_schedule("expand_dims", schedule_broadcast) -_reg.register_schedule("squeeze", schedule_injective) -_reg.register_schedule("reshape", schedule_injective) -_reg.register_schedule("reshape_like", schedule_injective) -_reg.register_schedule("full", schedule_injective) -_reg.register_schedule("full_like", schedule_injective) -_reg.register_schedule("arange", schedule_injective) -_reg.register_schedule("reverse", schedule_injective) -_reg.register_schedule("repeat", schedule_broadcast) -_reg.register_schedule("tile", schedule_broadcast) -_reg.register_schedule("cast", schedule_injective) -_reg.register_schedule("cast_like", schedule_injective) -_reg.register_schedule("reinterpret", schedule_injective) -_reg.register_schedule("strided_slice", schedule_injective) -_reg.register_schedule("strided_set", schedule_injective) -_reg.register_schedule("slice_like", schedule_injective) -_reg.register_schedule("split", schedule_injective) -_reg.register_schedule("take", schedule_injective) -_reg.register_schedule("transpose", schedule_injective) -_reg.register_schedule("where", schedule_broadcast) -_reg.register_schedule("stack", schedule_injective) -_reg.register_schedule("concatenate", schedule_concatenate) -_reg.register_schedule("_contrib_reverse_reshape", schedule_injective) -_reg.register_schedule("gather_nd", schedule_injective) -_reg.register_schedule("sequence_mask", schedule_injective) -_reg.register_schedule("one_hot", schedule_injective) +_reg.register_broadcast_schedule("broadcast_to") +_reg.register_broadcast_schedule("broadcast_to_like") +_reg.register_broadcast_schedule("expand_dims") 
+_reg.register_broadcast_schedule("repeat") +_reg.register_broadcast_schedule("tile") +_reg.register_broadcast_schedule("where") +_reg.register_injective_schedule("squeeze") +_reg.register_injective_schedule("reshape") +_reg.register_injective_schedule("reshape_like") +_reg.register_injective_schedule("full") +_reg.register_injective_schedule("full_like") +_reg.register_injective_schedule("arange") +_reg.register_injective_schedule("reverse") +_reg.register_injective_schedule("cast") +_reg.register_injective_schedule("cast_like") +_reg.register_injective_schedule("reinterpret") +_reg.register_injective_schedule("strided_slice") +_reg.register_injective_schedule("slice_like") +_reg.register_injective_schedule("split") +_reg.register_injective_schedule("take") +_reg.register_injective_schedule("transpose") +_reg.register_injective_schedule("stack") +_reg.register_injective_schedule("_contrib_reverse_reshape") +_reg.register_injective_schedule("gather_nd") +_reg.register_injective_schedule("sequence_mask") +_reg.register_injective_schedule("one_hot") +_reg.register_reduce_schedule("collapse_sum_like") + +# concatenate +_reg.register_schedule("concatenate", strategy.schedule_concatenate) + +# strided_set +@_reg.register_compute("strided_set") +def compute_strided_set(attrs, inputs, output_type): + """Compute definition of strided_set""" + return [topi.strided_set(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4])] +_reg.register_injective_schedule("strided_set") # layout_transform -_reg.register_schedule("layout_transform", schedule_injective) +_reg.register_injective_schedule("layout_transform") _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) -# shape func +# argwhere +@_reg.register_compute("argwhere") +def compute_argwhere(attrs, inputs, output_type): + """Compute definition of argwhere""" + output_shape = [] + for s in output_type.shape: + if hasattr(s, "value"): + output_shape.append(s) + else: + # see Any, replace it with a var + output_shape.append(tvm.var("any_dim", "int32")) + new_output_type = tvm.relay.ty.TensorType(output_shape, "int32") + return [topi.argwhere(new_output_type, inputs[0])] + +_reg.register_schedule("argwhere", strategy.schedule_argwhere) + +##################### +# Shape functions # +##################### + @script def _arange_shape_func(start, stop, step): out = output_tensor((1,), "int64") @@ -284,31 +306,6 @@ def argwhere_shape_func(attrs, inputs, out_ndims): return [_argwhere_shape_func_5d(inputs[0])] return ValueError("Does not support rank higher than 5 in argwhere") -@_reg.register_schedule("argwhere") -def schedule_argwhere(_, outs, target): - """Schedule definition of argwhere""" - with target: - return topi.generic.schedule_argwhere(outs) - - -@_reg.register_compute("argwhere") -def compute_argwhere(attrs, inputs, output_type, _): - """Compute definition of argwhere""" - output_shape = [] - for s in output_type.shape: - if hasattr(s, "value"): - output_shape.append(s) - else: - # see Any, replace it with a var - output_shape.append(tvm.var("any_dim", "int32")) - new_output_type = tvm.relay.ty.TensorType(output_shape, "int32") - return [topi.argwhere(new_output_type, inputs[0])] - -@_reg.register_compute("strided_set") -def compute_strided_set(attrs, inputs, output_type, _): - """Compute definition of strided_set""" - return [topi.strided_set(inputs[0], inputs[1], inputs[2], inputs[3], inputs[4])] - @script def _layout_transform_shape_func(data_shape, out_layout_len, diff --git a/python/tvm/relay/op/annotation/annotation.py 
b/python/tvm/relay/op/annotation/annotation.py index 586c30085601..7bd5262dc30a 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -19,7 +19,7 @@ from tvm.runtime import TVMContext as _TVMContext from . import _make -from ..op import register_schedule, schedule_injective +from .. import op as reg def on_device(data, device): @@ -79,7 +79,7 @@ def checkpoint(data): """ return _make.checkpoint(data) -register_schedule("annotation.checkpoint", schedule_injective) +reg.register_injective_schedule("annotation.checkpoint") def compiler_begin(data, compiler): diff --git a/python/tvm/relay/op/contrib/_contrib.py b/python/tvm/relay/op/contrib/_contrib.py index 4b5588024411..3927cef69706 100644 --- a/python/tvm/relay/op/contrib/_contrib.py +++ b/python/tvm/relay/op/contrib/_contrib.py @@ -18,29 +18,19 @@ """Backend compiler related feature registration""" from __future__ import absolute_import -import topi from .. import op as reg -from ..op import schedule_injective, OpPattern +from .. import strategy +from ..op import OpPattern # adaptive_max_pool2d -@reg.register_schedule("contrib.adaptive_max_pool2d") -def schedule_adaptive_max_pool2d(_, outs, target): - """Schedule definition of adaptive_max_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - +reg.register_schedule("contrib.adaptive_max_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("contrib.adaptive_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # adaptive_avg_pool2d -@reg.register_schedule("contrib.adaptive_avg_pool2d") -def schedule_adaptive_avg_pool2d(_, outs, target): - """Schedule definition of adaptive_avg_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - +reg.register_schedule("contrib.adaptive_avg_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("contrib.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # relay.contrib.ndarray_size -reg.register_schedule("contrib.ndarray_size", schedule_injective) +reg.register_injective_schedule("contrib.ndarray_size") diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py index 89fde6dc1738..b98b2bc14c28 100644 --- a/python/tvm/relay/op/image/_image.py +++ b/python/tvm/relay/op/image/_image.py @@ -20,13 +20,10 @@ import topi from .. 
import op as reg -from ..op import schedule_injective # resize -reg.register_schedule("image.resize", schedule_injective) - @reg.register_compute("image.resize") -def compute_resize(attrs, inputs, out_type, target): +def compute_resize(attrs, inputs, out_type): size = attrs.size layout = attrs.layout method = attrs.method @@ -34,12 +31,12 @@ def compute_resize(attrs, inputs, out_type, target): out_dtype = attrs.out_dtype return [topi.image.resize(inputs[0], size, layout, method, coord_trans, out_dtype)] +reg.register_injective_schedule("image.resize") -# crop and resize -reg.register_schedule("image.crop_and_resize", schedule_injective) +# crop and resize @reg.register_compute("image.crop_and_resize") -def compute_crop_and_resize(attrs, inputs, out_type, target): +def compute_crop_and_resize(attrs, inputs, out_type): crop_size = attrs.crop_size layout = attrs.layout method = attrs.method @@ -48,3 +45,5 @@ def compute_crop_and_resize(attrs, inputs, out_type, target): return [topi.image.crop_and_resize(inputs[0], inputs[1], inputs[2], crop_size, layout, method, extrapolation_value, out_dtype)] + +reg.register_injective_schedule("image.crop_and_resize") diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 3fdafd5b8628..97a5fa6ec00b 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -21,253 +21,79 @@ import topi from topi.util import get_const_tuple from .. import op as reg -from ..op import OpPattern, schedule_injective +from .. import strategy +from ..op import OpPattern from .._tensor import elemwise_shape_func from ....api import convert from ....hybrid import script # relu -reg.register_schedule("nn.relu", schedule_injective) +reg.register_broadcast_schedule("nn.relu") reg.register_pattern("nn.relu", OpPattern.ELEMWISE) -# softmax -@reg.register_schedule("nn.softmax") -def schedule_softmax(_, outputs, target): - """Schedule definition of softmax""" - with target: - return topi.generic.schedule_softmax(outputs) - +# softmax +reg.register_schedule("nn.softmax", strategy.schedule_softmax) reg.register_pattern("nn.softmax", OpPattern.OPAQUE) -schedule_broadcast = schedule_injective - - -@reg.register_schedule("nn.log_softmax") -def schedule_log_softmax(_, outputs, target): - """Schedule definition of log_softmax""" - with target: - return topi.generic.schedule_softmax(outputs) - +# log_softmax +reg.register_schedule("nn.log_softmax", strategy.schedule_softmax) reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) # dense -@reg.register_compute("nn.dense") -def compute_dense(attrs, inputs, out_type, target): - """Compute definition of dense""" - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] - - -@reg.register_schedule("nn.dense") -def schedule_dense(attrs, outputs, target): - """Schedule definition of dense""" - with target: - return topi.generic.schedule_dense(outputs) - - +reg.register_strategy("nn.dense", strategy.dense_strategy) reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +# fifo_buffer @reg.register_compute('nn.fifo_buffer') -def compute_fifo_buffer(attrs, inputs, out_type, target): +def compute_fifo_buffer(attrs, inputs, out_type): return [topi.nn.fifo_buffer(inputs[0], inputs[1], axis=attrs.get_int('axis'))] - -@reg.register_schedule('nn.fifo_buffer') -def schedule_fifo_buffer(attrs, outputs, target): - with target: - return topi.generic.schedule_injective(outputs) - - 
+reg.register_injective_schedule("nn.fifo_buffer") reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE) # batch_matmul -@reg.register_compute("nn.batch_matmul") -def compute_batch_matmul(attrs, inputs, out_type, target): - """Compute definition of batch_matmul""" - with target: - return [topi.nn.batch_matmul(inputs[0], inputs[1])] - - -@reg.register_schedule("nn.batch_matmul") -def schedule_batch_matmul(attrs, outputs, target): - """Schedule definition of batch_matmul""" - with target: - return topi.generic.schedule_batch_matmul(outputs) - - +reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy) reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + # sparse_dense @reg.register_compute("nn.sparse_dense") -def compute_sparse_dense(attrs, inputs, out_type, target): +def compute_sparse_dense(attrs, inputs, out_type): """Compute definition of sparse_dense""" return [topi.nn.sparse_dense(inputs[0], inputs[1], inputs[2], inputs[3])] -@reg.register_schedule("nn.sparse_dense") -def schedule_sparse_dense(attrs, outputs, target): - """Schedule definition of batch_matmul""" - with target: - return topi.generic.schedule_sparse_dense(outputs) - +reg.register_schedule("nn.sparse_dense", strategy.schedule_sparse_dense) reg.register_pattern("nn.sparse_dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + # sparse_transpose @reg.register_compute("nn.sparse_transpose") -def compute_sparse_transpose(attrs, inputs, out_type, target): +def compute_sparse_transpose(attrs, inputs, out_type): """Compute definition of sparse_transpose""" return topi.nn.sparse_transpose(inputs[0], inputs[1], inputs[2]) -@reg.register_schedule("nn.sparse_transpose") -def schedule_sparse_transpose(attrs, outputs, target): - """Schedule definition of batch_matmul""" - with target: - return topi.generic.schedule_sparse_transpose(outputs) - +reg.register_schedule("nn.sparse_transpose", strategy.schedule_sparse_transpose) reg.register_pattern("nn.sparse_transpose", reg.OpPattern.OUT_ELEMWISE_FUSABLE) -# Conv1D -@reg.register_compute("nn.conv1d") -def compute_conv1d(attrs, inputs, out_type, target): - """Compute definition of conv1d""" - strides = get_const_tuple(attrs.strides) - padding = get_const_tuple(attrs.padding) - dilation = get_const_tuple(attrs.dilation) - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - - assert layout in ["NCW", "NWC"] - if dilation[0] < 1: - raise ValueError("dilation should be a positive value") - - return [topi.nn.conv1d(inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype)] - - -@reg.register_schedule("nn.conv1d") -def schedule_conv1d(attrs, outs, target): - """Schedule definition of conv1d""" - layout = attrs.data_layout - - with target: - if layout == "NCW": - return topi.generic.schedule_conv1d_ncw(outs) - elif layout == "NCW": - return topi.generic.schedule_conv1d_nwc(outs) - raise ValueError("No compatible schedule") - - +# conv1d +reg.register_strategy("nn.conv1d", strategy.conv1d_strategy) reg.register_pattern("nn.conv1d", OpPattern.OUT_ELEMWISE_FUSABLE) # conv2d -def _find_conv2d_op(op): - """Find the op with conv2d in its tag by traversing.""" - if 'conv2d' in op.tag: - return op - for tensor in op.input_tensors: - op_ = _find_conv2d_op(tensor.op) - if op_ is not None: - return op_ - return None - -@reg.register_compute("nn.conv2d") -def compute_conv2d(attrs, inputs, out_type, target): - """Compute definition of conv2d""" - padding = 
get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - kernel_layout = attrs.kernel_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - - assert layout in ["NCHW", "NHWC", "NCHW4c", "HWCN"] - (dilation_h, dilation_w) = dilation - if dilation_h < 1 or dilation_w < 1: - raise ValueError("dilation should be positive value") - - def _get_out_depth(): - weight_shape = get_const_tuple(inputs[1].shape) - # NHWC layout - if kernel_layout.startswith("HW"): - return weight_shape[2] * weight_shape[3] - # NCHW layout. - # in ARM CPU contrib_spatial_pack schedule, we will prepack weight layout - if len(weight_shape) == 4: - return weight_shape[0] * weight_shape[1] - else: - assert len(weight_shape) == 5 - C, M, _, _, VC = weight_shape - return C * VC * M - - if groups == 1: - out = topi.nn.conv2d( - inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype) - elif layout == "NCHW" and _get_out_depth() == groups: - out = topi.nn.depthwise_conv2d_nchw( - inputs[0], inputs[1], strides, padding, dilation, out_dtype) - elif layout == "NHWC" and kernel_layout == "HWOI" and _get_out_depth() == groups: - out = topi.nn.depthwise_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, dilation, out_dtype) - elif layout in ['NCHW', 'NCHW4c']: - out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups, - out_dtype) - else: - raise ValueError("not support arbitrary group number for now") - return [out] - - -@reg.register_schedule("nn.conv2d") -def schedule_conv2d(attrs, outs, target): - """Schedule definition of conv2d""" - groups = attrs.groups - layout = attrs.data_layout - kernel_layout = attrs.kernel_layout - - with target: - if groups == 1 and layout == "NCHW": - return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NCHW4c": - return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NHWC": - return topi.generic.schedule_conv2d_nhwc(outs) - elif groups == 1 and layout == "HWCN": - return topi.generic.schedule_conv2d_hwcn(outs) - elif groups != 1: - # collect in_channels to distinguish depthwise and group conv2d - op = _find_conv2d_op(outs[0].op) - assert op is not None - - is_depthwise = 'depthwise' in op.tag - if is_depthwise: - if layout == "NCHW": - # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d. - return topi.generic.schedule_depthwise_conv2d_nchw(outs) - if layout == "NHWC" and kernel_layout == "HWOI": - return topi.generic.schedule_depthwise_conv2d_nhwc(outs) - else: - if layout in ["NCHW", "NCHW4c"]: - return topi.generic.schedule_group_conv2d_nchw(outs) - raise ValueError("No compatible schedule") - +reg.register_strategy("nn.conv2d", strategy.conv2d_strategy) +reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_alter_op_layout("nn.conv2d") -def alter_op_layout_conv2d(attrs, inputs, tinfos): +def alter_op_layout_conv2d(attrs, inputs, tinfos, out_type): """Alternate the layout of conv2d""" - # pylint: disable=import-outside-toplevel - from ... 
import op - return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, op) + return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) @reg.register_legalize("nn.conv2d") def legalize_conv2d(attrs, inputs, types): @@ -289,7 +115,6 @@ def legalize_conv2d(attrs, inputs, types): """ return topi.nn.conv2d_legalize(attrs, inputs, types) - @reg.register_convert_op_layout("nn.conv2d") def convert_conv2d(attrs, inputs, tinfos, desired_layout): """Convert Layout pass registration for conv2d op. @@ -330,82 +155,10 @@ def convert_conv2d(attrs, inputs, tinfos, desired_layout): return relay.nn.conv2d(data, weight, **new_attrs) return None -reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) - # conv2d_transpose -@reg.register_compute("nn.conv2d_transpose") -def compute_conv2d_transpose(attrs, inputs, out_dtype, target): - """Compute definition of conv2d_transpose""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - assert layout == "NCHW", "only support nchw for now" - assert dilation == (1, 1), "not support dilate now" - assert groups == 1, "only support groups == 1 for now" - out = topi.nn.conv2d_transpose_nchw( - inputs[0], inputs[1], strides, padding, out_dtype) - output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, - [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]]) - return [out] - - -@reg.register_compute("nn.conv3d") -def compute_conv3d(attrs, inputs, out_type, target): - """Compute definition of conv3d""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - - assert layout in ["NCDHW", "NDHWC"] - (dilation_d, dilation_h, dilation_w) = dilation - if dilation_d < 1 or dilation_h < 1 or dilation_w < 1: - raise ValueError("dilation should be positive value") - - if groups == 1: - out = topi.nn.conv3d( - inputs[0], inputs[1], strides, padding, - dilation, layout, out_dtype) - else: - raise ValueError("not support arbitrary group number for now") - return [out] - - -@reg.register_schedule("nn.conv3d") -def schedule_conv3d(attrs, outs, target): - """Schedule definition of conv3d""" - groups = attrs.groups - layout = attrs.data_layout - - with target: - if groups == 1 and layout == "NCDHW": - return topi.generic.schedule_conv3d_ncdhw(outs) - elif groups == 1 and layout == "NDHWC": - return topi.generic.schedule_conv3d_ndhwc(outs) - - raise ValueError("No compatible schedule") - - -reg.register_pattern("nn.conv3d", OpPattern.OUT_ELEMWISE_FUSABLE) - - -@reg.register_schedule("nn.conv2d_transpose") -def schedule_conv2d_transpose(attrs, outs, target): - """Schedule definition of conv2d_transpose""" - with target: - return topi.generic.schedule_conv2d_transpose_nchw(outs) - +reg.register_strategy("nn.conv2d_transpose", strategy.conv2d_transpose_strategy) +reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.conv2d_transpose") def legalize_conv2d_transpose(attrs, inputs, types): @@ -427,202 +180,102 @@ def legalize_conv2d_transpose(attrs, inputs, types): """ return topi.nn.conv2d_transpose_legalize(attrs, inputs, types) 
-reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) - -# conv1d_transpose -@reg.register_compute("nn.conv1d_transpose") -def compute_conv1d_transpose(attrs, inputs, out_dtype, target): - """Compute definition of conv1d_transpose""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - groups = attrs.groups - layout = attrs.data_layout - out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if out_dtype in ("same", "") - else out_dtype) - assert layout == "NCW", "conv1d_transpose ncw only supported" - assert dilation == (1,), "conv1d_transpose dilation is not supported" - assert groups == 1, "conv1d_transpose groups == 1 only supported" - out = topi.nn.conv1d_transpose_ncw( - inputs[0], inputs[1], strides, padding, out_dtype) - output_padding = get_const_tuple(attrs.output_padding) - out = topi.nn.pad(out, - [0, 0, 0], [0, 0, output_padding[0]]) - return [out] +# conv3d +reg.register_strategy("nn.conv3d", strategy.conv3d_strategy) +reg.register_pattern("nn.conv3d", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_schedule("nn.conv1d_transpose") -def schedule_conv1d_transpose(attrs, outs, target): - """Schedule definition of conv1d_transpose""" - with target: - return topi.generic.schedule_conv1d_transpose_ncw(outs) +# conv1d_transpose +reg.register_strategy("nn.conv1d_transpose", strategy.conv1d_transpose_strategy) reg.register_pattern("nn.conv1d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE) + # bias_add -reg.register_schedule("nn.bias_add", schedule_injective) +reg.register_injective_schedule("nn.bias_add") reg.register_pattern("nn.bias_add", OpPattern.BROADCAST) # max_pool1d -@reg.register_schedule("nn.max_pool1d") -def schedule_max_pool1d(attrs, outs, target): - """Schedule definition of max_pool1d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.max_pool1d", strategy.schedule_pool) reg.register_pattern("nn.max_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d -@reg.register_schedule("nn.max_pool2d") -def schedule_max_pool2d(attrs, outs, target): - """Schedule definition of max_pool2d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.max_pool2d", strategy.schedule_pool) reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool3d -@reg.register_schedule("nn.max_pool3d") -def schedule_max_pool3d(attrs, outs, target): - """Schedule definition of max_pool3d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.max_pool3d", strategy.schedule_pool) reg.register_pattern("nn.max_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool1d -@reg.register_schedule("nn.avg_pool1d") -def schedule_avg_pool1d(attrs, outs, target): - """Schedule definition of avg_pool1d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.avg_pool1d", strategy.schedule_pool) reg.register_pattern("nn.avg_pool1d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d -@reg.register_schedule("nn.avg_pool2d") -def schedule_avg_pool2d(attrs, outs, target): - """Schedule definition of avg_pool2d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - +reg.register_schedule("nn.avg_pool2d", strategy.schedule_pool) reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # 
avg_pool3d -@reg.register_schedule("nn.avg_pool3d") -def schedule_avg_pool3d(attrs, outs, target): - """Schedule definition of avg_pool3d""" - layout = attrs.layout - with target: - return topi.generic.schedule_pool(outs, layout) - - +reg.register_schedule("nn.avg_pool3d", strategy.schedule_pool) reg.register_pattern("nn.avg_pool3d", OpPattern.OUT_ELEMWISE_FUSABLE) # max_pool2d_grad -@reg.register_schedule("nn.max_pool2d_grad") -def schedule_max_pool2d_grad(attrs, outs, target): - """Schedule definition of max_pool2d_grad""" - with target: - return topi.generic.schedule_pool_grad(outs) - - +reg.register_schedule("nn.max_pool2d_grad", strategy.schedule_pool_grad) reg.register_pattern("nn.max_pool2d_grad", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d_grad -@reg.register_schedule("nn.avg_pool2d_grad") -def schedule_avg_pool2d_grad(attrs, outs, target): - """Schedule definition of avg_pool2d_grad""" - with target: - return topi.generic.schedule_pool_grad(outs) - - +reg.register_schedule("nn.avg_pool2d_grad", strategy.schedule_pool_grad) reg.register_pattern("nn.avg_pool2d_grad", OpPattern.OUT_ELEMWISE_FUSABLE) # global_max_pool2d -@reg.register_schedule("nn.global_max_pool2d") -def schedule_global_max_pool2d(_, outs, target): - """Schedule definition of global_max_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - - +reg.register_schedule("nn.global_max_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # global_avg_pool2d -@reg.register_schedule("nn.global_avg_pool2d") -def schedule_global_avg_pool2d(_, outs, target): - """Schedule definition of global_avg_pool2d""" - with target: - return topi.generic.schedule_adaptive_pool(outs) - - +reg.register_schedule("nn.global_avg_pool2d", strategy.schedule_adaptive_pool) reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # leaky_relu -reg.register_schedule("nn.leaky_relu", schedule_broadcast) +reg.register_broadcast_schedule("nn.leaky_relu") reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE) + # prelu -reg.register_schedule("nn.prelu", schedule_broadcast) +reg.register_broadcast_schedule("nn.prelu") reg.register_pattern("nn.prelu", OpPattern.BROADCAST) + # flatten -reg.register_schedule("nn.batch_flatten", schedule_broadcast) +reg.register_broadcast_schedule("nn.batch_flatten") reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE) # lrn @reg.register_compute("nn.lrn") -def compute_lrn(attrs, inputs, out_dtype, target): +def compute_lrn(attrs, inputs, out_dtype): """Compute definition of lrn""" assert len(inputs) == 1 return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis, attrs.alpha, attrs.beta, attrs.bias)] - -@reg.register_schedule("nn.lrn") -def schedule_lrn(attrs, outs, target): - """Schedule definition of lrn""" - with target: - return topi.generic.schedule_lrn(outs) - - +reg.register_schedule("nn.lrn", strategy.schedule_lrn) reg.register_pattern("nn.lrn", OpPattern.OPAQUE) # upsampling -reg.register_schedule("nn.upsampling", reg.schedule_injective) - - -def schedule_upsampling(_, outs, target): - """Schedule definition of upsampling""" - with target: - return topi.generic.schedule_injective(outs) - @reg.register_compute("nn.upsampling") -def compute_upsampling(attrs, inputs, out_dtype, target): +def compute_upsampling(attrs, inputs, out_dtype): scale_h = attrs.scale_h scale_w = attrs.scale_w layout = attrs.layout @@ -630,16 +283,12 @@ def compute_upsampling(attrs, inputs, out_dtype, target): align_corners = 
attrs.align_corners return [topi.nn.upsampling(inputs[0], scale_h, scale_w, layout, method, align_corners)] -# upsampling3d -reg.register_schedule("nn.upsampling3d", reg.schedule_injective) +reg.register_injective_schedule("nn.upsampling") -def schedule_upsampling3d(_, outs, target): - """Schedule definition of upsampling3d""" - with target: - return topi.generic.schedule_injective(outs) +# upsampling3d @reg.register_compute("nn.upsampling3d") -def compute_upsampling3d(attrs, inputs, out_dtype, target): +def compute_upsampling3d(attrs, inputs, out_dtype): scale_d = attrs.scale_d scale_h = attrs.scale_h scale_w = attrs.scale_w @@ -649,297 +298,93 @@ def compute_upsampling3d(attrs, inputs, out_dtype, target): return [topi.nn.upsampling3d(inputs[0], scale_d, scale_h, scale_w, layout, method,\ coordinate_transformation_mode)] +reg.register_injective_schedule("nn.upsampling3d") + + # pad -reg.register_schedule("nn.pad", schedule_broadcast) +reg.register_broadcast_schedule("nn.pad") -# mirror_pad -reg.register_schedule("nn.mirror_pad", schedule_broadcast) +# mirror_pad @reg.register_compute("nn.mirror_pad") -def compute_mirror_pad(attrs, inputs, out_dtype, target): +def compute_mirror_pad(attrs, inputs, out_dtype): pad_before, pad_after = list(zip(*attrs.pad_width)) mode = attrs.mode out = topi.nn.mirror_pad(inputs[0], pad_before=pad_before, pad_after=pad_after, mode=mode) return [out] -# winograd related operators -@reg.register_compute("nn.contrib_conv2d_winograd_without_weight_transform") -def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, out_dtype, target): - """Compute definition of conv2d_winograd_without_weight_transform""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - data_layout = attrs.get_str("data_layout") - out_dtype = attrs.get_str("out_dtype") - tile_size = attrs.get_int("tile_size") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - assert dilation == (1, 1), "Do not support dilate now" - assert groups == 1, "Do not supoort arbitrary group number" - - out = topi.nn.conv2d_winograd_without_weight_transform( - inputs[0], inputs[1], strides, padding, dilation, data_layout, - out_dtype, tile_size) - - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_winograd_without_weight_transform") -def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, target): - """Schedule definition of conv2d_winograd_without_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_without_weight_transform(outs) +reg.register_broadcast_schedule("nn.mirror_pad") +# conv2d_winograd related operators +reg.register_strategy("nn.contrib_conv2d_winograd_without_weight_transform", + strategy.conv2d_winograd_without_weight_transfrom_strategy) reg.register_pattern("nn.contrib_conv2d_winograd_without_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_compute("nn.contrib_conv2d_winograd_weight_transform") -def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype, target): +def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, out_dtype): """Compute definition of contrib_conv2d_winograd_weight_transform""" out = topi.nn.conv2d_winograd_weight_transform( inputs[0], attrs.get_int('tile_size')) return [out] - -@reg.register_schedule("nn.contrib_conv2d_winograd_weight_transform") -def 
schedule_contrib_conv2d_winograd_weight_transform(attrs, outs, target): - """Schedule definition of contrib_conv2d_winograd_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_weight_transform(outs) - - +reg.register_schedule("nn.contrib_conv2d_winograd_weight_transform", + strategy.schedule_conv2d_winograd_weight_transform) reg.register_pattern("nn.contrib_conv2d_winograd_weight_transform", OpPattern.OUT_ELEMWISE_FUSABLE) - -# winograd nnpack related operators -@reg.register_compute("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") -def compute_contrib_conv2d_winograd_nnpack_without_weight_transform( - attrs, inputs, out_dtype, target): - """Compute definition of conv2d_winograd_nnpack_without_weight_transform""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int("groups") - data_layout = attrs.get_str("data_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - assert dilation == (1, 1), "Do not support dilate now" - assert groups == 1, "Do not supoort arbitrary group number" - - # No bias - out = topi.nn.conv2d_winograd_nnpack_without_weight_transform( - inputs[0], inputs[1], None, strides, padding, dilation, data_layout, - out_dtype) - - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") -def schedule_contrib_conv2d_winograd_nnpack_without_weight_transform(attrs, outs, target): - """Schedule definition of conv2d_winograd_nnpack_without_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_nnpack_without_weight_transform(outs) - - -reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_without_weight_transform", - OpPattern.OPAQUE) - - @reg.register_compute("nn.contrib_conv2d_winograd_nnpack_weight_transform") -def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype, target): +def compute_contrib_conv2d_winograd_nnpack_weight_transform(attrs, inputs, out_dtype): """Compute definition of contrib_conv2d_winograd_nnpack_weight_transform""" convolution_algorithm = attrs.get_int('convolution_algorithm') out = topi.nn.conv2d_winograd_nnpack_weight_transform( inputs[0], convolution_algorithm, out_dtype) return [out] - -@reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform") -def schedule_contrib_conv2d_winograd_nnpack_weight_transform(attrs, outs, target): - """Schedule definition of contrib_conv2d_winograd_nnpack_weight_transform""" - with target: - return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs) - - +reg.register_schedule("nn.contrib_conv2d_winograd_nnpack_weight_transform", + strategy.schedule_conv2d_winograd_nnpack_weight_transform) reg.register_pattern("nn.contrib_conv2d_winograd_nnpack_weight_transform", OpPattern.OPAQUE) -@reg.register_compute("nn.contrib_conv2d_NCHWc") -def compute_contrib_conv2d_NCHWc(attrs, inputs, out_dtype, target): - """Compute definition of conv2d NCHWc""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs.get_str("data_layout") - out_layout = attrs.get_str("out_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - out = 
topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation, - data_layout, out_layout, out_dtype) - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_NCHWc") -def schedule_contrib_conv2d_NCHWc(attrs, outs, target): - """Schedule definition of contrib_conv2d_NCHWc""" - with target: - return topi.generic.schedule_conv2d_NCHWc(outs) - - +# conv2d_NCHWc +reg.register_strategy("nn.contrib_conv2d_NCHWc", strategy.conv2d_NCHWc_strategy) reg.register_pattern("nn.contrib_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) - -@reg.register_compute("nn.contrib_conv2d_NCHWc_int8") -def compute_contrib_conv2d_NCHWc_int8(attrs, inputs, out_dtype, target): - """Compute definition of conv2d NCHWc""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs.get_str("data_layout") - out_layout = attrs.get_str("out_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - out = topi.nn.conv2d_NCHWc_int8(inputs[0], inputs[1], strides, padding, dilation, - data_layout, out_layout, out_dtype) - return [out] - - -@reg.register_schedule("nn.contrib_conv2d_NCHWc_int8") -def schedule_contrib_conv2d_NCHWc_int8(attrs, outs, target): - """Schedule definition of contrib_conv2d_NCHWc_int8""" - with target: - return topi.generic.schedule_conv2d_NCHWc_int8(outs) - - -reg.register_pattern("nn.contrib_conv2d_NCHWc_int8", - OpPattern.OUT_ELEMWISE_FUSABLE) - - -@reg.register_compute("nn.contrib_depthwise_conv2d_NCHWc") -def compute_contrib_depthwise_conv2d_NCHWc(attrs, inputs, out_dtype, target): - """Compute definition of depthwise conv2d NCHWc""" - # pylint: disable=assignment-from-no-return - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - data_layout = attrs.get_str("data_layout") - out_layout = attrs.get_str("out_layout") - out_dtype = attrs.get_str("out_dtype") - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation, - data_layout, out_layout, out_dtype) - return [out] - - -@reg.register_schedule("nn.contrib_depthwise_conv2d_NCHWc") -def schedule_contrib_depthwise_conv2d_NCHWc(attrs, outs, target): - """Schedule definition of contrib_conv2d_NCHWc""" - with target: - return topi.generic.schedule_depthwise_conv2d_NCHWc(outs) - - +# depthwise_conv2d_NCHWc +reg.register_strategy("nn.contrib_depthwise_conv2d_NCHWc", + strategy.depthwise_conv2d_NCHWc_strategy) reg.register_pattern("nn.contrib_depthwise_conv2d_NCHWc", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_compute("nn.deformable_conv2d") -def compute_deformable_conv2d(attrs, inputs, out_dtype, target): - """Compute definition of deformable_conv2d""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - dilation = get_const_tuple(attrs.dilation) - deformable_groups = attrs.deformable_groups - groups = attrs.groups - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype - with target: - out = topi.nn.deformable_conv2d_nchw(inputs[0], inputs[1], inputs[2], strides, padding, - dilation, deformable_groups, groups, out_dtype) - return [out] - - -@reg.register_schedule("nn.deformable_conv2d") -def schedule_deformable_conv2d(attrs, outs, target): - """Schedule definition of deformable_conv2d""" - with 
target: - return topi.generic.schedule_deformable_conv2d_nchw(outs) - - +# deformable_conv2d +reg.register_strategy("nn.deformable_conv2d", strategy.deformable_conv2d_strategy) reg.register_pattern("nn.deformable_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) +# bitpack @reg.register_compute("nn.bitpack") -def compute_bitpack(attrs, inputs, out_dtype, target): +def compute_bitpack(attrs, inputs, out_dtype): """Compute definition for bitpack""" bits = attrs.bits pack_axis = attrs.pack_axis bit_axis = attrs.bit_axis pack_type = attrs.pack_type name = attrs.name - with target: - out = topi.nn.bitpack(inputs[0], bits, pack_axis, bit_axis, pack_type, - name) + out = topi.nn.bitpack(inputs[0], bits, pack_axis, bit_axis, pack_type, name) return [out] -@reg.register_schedule("nn.bitpack") -def schedule_bitpack(attrs, outs, target): - with target: - return topi.generic.schedule_bitpack(outs) - +reg.register_schedule("nn.bitpack", strategy.schedule_bitpack) reg.register_pattern("nn.bitpack", OpPattern.INJECTIVE) -@reg.register_compute("nn.bitserial_conv2d") -def compute_bitserial_conv2d(attrs, inputs, out_dtype, target): - """Compute definition for bitserial conv2d.""" - padding = get_const_tuple(attrs.padding) - strides = get_const_tuple(attrs.strides) - activation_bits = attrs.activation_bits - weight_bits = attrs.weight_bits - layout = attrs.data_layout - pack_dtype = attrs.pack_dtype - out_dtype = attrs.out_dtype - unipolar = attrs.unipolar - if layout == 'NCHW': - with target: - out = topi.nn.bitserial_conv2d_nchw( - inputs[0], inputs[1], strides, padding, activation_bits, - weight_bits, pack_dtype, out_dtype, unipolar) - elif layout == 'NHWC': - with target: - out = topi.nn.bitserial_conv2d_nhwc( - inputs[0], inputs[1], strides, padding, activation_bits, - weight_bits, pack_dtype, out_dtype, unipolar) - else: - raise ValueError("Data layout not supported.") - - return [out] - - -@reg.register_schedule("nn.bitserial_conv2d") -def schedule_bitserial_conv2d(attrs, outs, target): - """Schedule definition for bitserial conv2d.""" - layout = attrs.data_layout - if layout == 'NCHW': - with target: - return topi.generic.schedule_bitserial_conv2d_nchw(outs) - elif layout == 'NHWC': - with target: - return topi.generic.schedule_bitserial_conv2d_nhwc(outs) - else: - raise ValueError("Data layout not supported.") +# bitserial_conv2d +reg.register_strategy("nn.bitserial_conv2d", strategy.bitserial_conv2d_strategy) +reg.register_pattern("nn.bitserial_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) @reg.register_legalize("nn.bitserial_conv2d") def legalize_bitserial_conv2d(attrs, inputs, types): @@ -962,79 +407,58 @@ def legalize_bitserial_conv2d(attrs, inputs, types): return topi.nn.bitserial_conv2d_legalize(attrs, inputs, types) -reg.register_pattern("nn.bitserial_conv2d", OpPattern.OUT_ELEMWISE_FUSABLE) - - # bitserial_dense -@reg.register_compute("nn.bitserial_dense") -def compute_bitserial_dense(attrs, inputs, out_type, target): - """Compute definition of bitserial_dense""" - data_bits = attrs.data_bits - weight_bits = attrs.weight_bits - pack_dtype = attrs.pack_dtype - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - unipolar = attrs.unipolar - return [ - topi.nn.bitserial_dense( - inputs[0], - inputs[1], - data_bits, - weight_bits, - pack_dtype, - out_dtype, - unipolar) - ] - - -@reg.register_schedule("nn.bitserial_dense") -def schedule_bitserial_dense(attrs, outputs, target): - """Schedule definition of bitserial_dense""" - with target: - return 
topi.generic.schedule_bitserial_dense(outputs) - - +reg.register_strategy("nn.bitserial_dense", strategy.bitserial_dense_strategy) reg.register_pattern("nn.bitserial_dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) -reg.register_pattern("nn.cross_entropy", OpPattern.OPAQUE) - +# cross_entropy @reg.register_compute("nn.cross_entropy") -def compute_cross_entropy(attrs, inputs, out_dtype, target): +def compute_cross_entropy(attrs, inputs, out_dtype): x, y = inputs return [-topi.sum(topi.log(x) * y) / x.shape[0]] +reg.register_reduce_schedule("nn.cross_entropy") +reg.register_pattern("nn.cross_entropy", OpPattern.OPAQUE) -reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) +# cross_entropy_with_logits @reg.register_compute("nn.cross_entropy_with_logits") -def compute_cross_entropy_with_logits(attrs, inputs, out_dtype, target): +def compute_cross_entropy_with_logits(attrs, inputs, out_dtype): x, y = inputs return [-topi.sum(x * y) / x.shape[0]] +reg.register_reduce_schedule("nn.cross_entropy_with_logits") +reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) + +# depth_to_space @reg.register_compute("nn.depth_to_space") -def compute_depth_to_space(attrs, inputs, out_dtype, target): +def compute_depth_to_space(attrs, inputs, out_dtype): block_size = attrs.block_size layout = attrs.layout mode = attrs.mode return [topi.nn.depth_to_space(inputs[0], block_size, layout=layout, mode=mode)] -reg.register_schedule("nn.depth_to_space", schedule_injective) +reg.register_injective_schedule("nn.depth_to_space") reg.register_pattern("nn.depth_to_space", OpPattern.INJECTIVE) +# space_to_depth @reg.register_compute("nn.space_to_depth") -def compute_space_to_depth(attrs, inputs, out_dtype, target): +def compute_space_to_depth(attrs, inputs, out_dtype): block_size = attrs.block_size layout = attrs.layout return [topi.nn.space_to_depth(inputs[0], block_size, layout=layout)] -reg.register_schedule("nn.space_to_depth", schedule_injective) +reg.register_injective_schedule("nn.space_to_depth") reg.register_pattern("nn.space_to_depth", OpPattern.INJECTIVE) -# shape func +##################### +# Shape functions # +##################### + @script def _conv2d_NCHWc_shape_func(dshape, kshape, strides, padding, dilation, oc_bn): out = output_tensor((dshape.shape[0],), "int64") diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 9ee43438f83d..9ecb5af8b551 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -204,7 +204,6 @@ def conv2d(data, # TODO enforce 4-way padding in topi/nn/conv2d after #4644 merged # convert 2-way padding to 4-way padding padding = get_pad_tuple2d(padding) - return _make.conv2d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) @@ -298,7 +297,6 @@ def conv3d(data, dilation = (dilation, dilation, dilation) if isinstance(padding, int): padding = (padding, padding, padding) - return _make.conv3d(data, weight, strides, padding, dilation, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) @@ -1772,74 +1770,6 @@ def contrib_conv2d_winograd_without_weight_transform(data, kernel_layout, out_layout, out_dtype) -def contrib_conv2d_winograd_nnpack_without_weight_transform(data, - weight, - strides=(1, 1), - padding=(0, 0), - dilation=(1, 1), - groups=1, - channels=None, - kernel_size=None, - data_layout="NCHW", - kernel_layout="OIHW", - out_layout="", - out_dtype=""): - r"""2D convolution with the NNPACK implementation of 
winograd algorithm. - - The basic parameters are the same as the ones in vanilla conv2d. - It assumes the weight is pre-transformed by nn.contrib_conv2d_winograd_nnpack_weight_transform - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - weight : tvm.relay.Expr - The weight expressions. - - strides : tuple of int, optional - The strides of convolution. - - padding : tuple of int, optional - The padding of convolution on both sides of inputs before convolution. - - dilation : tuple of int, optional - Specifies the dilation rate to be used for dilated convolution. - - groups : int, optional - Number of groups for grouped convolution. - - channels : int, optional - Number of output channels of this convolution. - - kernel_size : tuple of int, optional - The spatial of the convolution kernel. - - data_layout : str, optional - Layout of the input. - - kernel_layout : str, optional - Layout of the weight. - - out_layout : str, optional - Layout of the output, by default, out_layout is the same as data_layout - - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - # convert 2-way padding to 4-way padding - padding = get_pad_tuple2d(padding) - return _make.contrib_conv2d_winograd_nnpack_without_weight_transform( - data, weight, strides, padding, dilation, - groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype) - - def contrib_conv2d_nchwc(data, kernel, strides=(1, 1), @@ -1974,73 +1904,6 @@ def contrib_depthwise_conv2d_nchwc(data, groups, channels, kernel_size, data_layout, kernel_layout, out_layout, out_dtype) -def contrib_conv2d_nchwc_int8(data, - kernel, - strides=(1, 1), - padding=(0, 0), - dilation=(1, 1), - groups=1, - channels=None, - kernel_size=None, - data_layout="NCHW8c", - kernel_layout="OIHW", - out_layout="", - out_dtype=""): - r"""Variant of 2D convolution. It deals with only int8 inputs. - - This operator takes the weight as the convolution kernel - and convolves it with data to produce an output, following a specialized - NCHWc data layout. - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - kernel : tvm.relay.Expr - The kernel expressions. - - strides : tuple of int, optional - The strides of convolution. - - padding : tuple of int, optional - The padding of convolution on both sides of inputs before convolution. - - dilation : tuple of int, optional - Specifies the dilation rate to be used for dilated convolution. - - groups : int, optional - Number of groups for grouped convolution. - - channels : int, optional - Number of output channels of this convolution. - - kernel_size : tuple of int, optional - The spatial of the convolution kernel. - - data_layout : str, optional - Layout of the input. - - kernel_layout : str, optional - Layout of the weight. - - out_layout : str, optional - Layout of the output, by default, out_layout is the same as data_layout - - out_dtype : str, optional - Specifies the output data type for mixed precision conv2d. - - Returns - ------- - result : tvm.relay.Expr - The computed result. 
- """ - # convert 2-way padding to 4-way padding - padding = get_pad_tuple2d(padding) - return _make.contrib_conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, - groups, channels, kernel_size, data_layout, - kernel_layout, out_layout, out_dtype) - def contrib_conv2d_winograd_weight_transform(weight, tile_size): diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index c6d301213e98..4fd88f4383df 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -14,15 +14,16 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -#pylint: disable=unused-argument +#pylint: disable=unused-argument,invalid-name """The base node types for the Relay language.""" -import topi import tvm._ffi from tvm.driver import lower, build from ..base import register_relay_node from ..expr import RelayExpr from ...api import register_func +from ...target import get_native_generic_func, GenericFunc +from ...runtime import Object from . import _make @register_relay_node @@ -143,39 +144,208 @@ class OpPattern(object): OPAQUE = 8 -def register_schedule(op_name, schedule=None, level=10): - """Register schedule function for an op +@tvm._ffi.register_object("relay.OpImplementation") +class OpImplementation(Object): + """Operator implementation""" + def compute(self, attrs, inputs, out_type): + """Call compute function. + + Parameters + ---------- + attrs : Attrs + Op attributes. + + inputs : list[tvm.tensor.Tensor] + The input tensors. + + out_type : relay.Type + The output type. + + Returns + ------- + outs : list[tvm.tensor.Tensor] + The output tensors. + """ + return _OpImplementationCompute(self, attrs, inputs, out_type) + + def schedule(self, attrs, outs, target): + """Call schedule function. + + Parameters + ---------- + attrs : Attrs + Op attributes. + + outs : list[tvm.tensor.Tensor] + The output tensors. + + target : tvm.target.Target + The target to schedule the op. + + Returns + ------- + schedule : tvm.Schedule + The schedule. + """ + return _OpImplementationSchedule(self, attrs, outs, target) + + +@tvm._ffi.register_object("relay.OpSpecialization") +class OpSpecialization(Object): + """Operator specialization""" + + +@tvm._ffi.register_object("relay.OpStrategy") +class OpStrategy(Object): + """Operator strategy""" + def __init__(self): + self.__init_handle_by_constructor__(_make.OpStrategy) + + def add_implementation(self, compute, schedule, name="default", plevel=10): + """Add an implementation to the strategy + + Parameters + ---------- + compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type) + -> List[Tensor] + The compute function. + + schedule : function (attrs: Attrs, outs: List[Tensor], target:Target) -> Schedule + The schedule function. + + name : str + The name of implementation. + + plevel : int + The priority level of implementation. 
+ """ + _OpStrategyAddImplementation(self, compute, schedule, name, plevel) + + +def _wrap_default_fstrategy(compute, schedule, name): + def _fstrategy(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation(compute, schedule, name=name) + return strategy + return _fstrategy + + +def _create_fstrategy_from_schedule(op_name, schedule): + assert hasattr(schedule, "dispatch_dict") + compute = get(op_name).get_attr("FTVMCompute") + assert compute is not None, "FTVMCompute is not registered for op %s" % op_name + fstrategy = get_native_generic_func("{}_strategy".format(op_name)) + name_pfx = schedule.__name__ + name_pfx = name_pfx[name_pfx.index('_')+1:] + fstrategy.set_default( + _wrap_default_fstrategy(compute, schedule.fdefault, "%s.generic" % name_pfx)) + for key, sch in schedule.dispatch_dict.items(): + fstrategy.register( + _wrap_default_fstrategy(compute, sch, "%s.%s" % (name_pfx, key)), [key]) + return fstrategy + + +def register_compute(op_name, compute=None, level=10): + """Register compute function for an op. Parameters ---------- op_name : str The name of the op. - schedule : function (attrs: Attrs, outs: List[Tensor], target: Target) -> sch: Schedule - The schedule function. + compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type) + -> List[Tensor] + The compute function. level : int The priority level """ - return register(op_name, "FTVMSchedule", schedule, level) + return register(op_name, "FTVMCompute", compute, level) -def register_compute(op_name, compute=None, level=10): - """Register compute function for an op. +def register_strategy(op_name, fstrategy=None, level=10): + """Register strategy function for an op. Parameters ---------- op_name : str The name of the op. - compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type, target:Target) - -> List[Tensor] - The compute function. + fstrategy : function (attrs: Attrs, inputs: List[Tensor], out_type: Type, + target:Target) -> OpStrategy + The strategy function. Need to be native GenericFunc. level : int The priority level """ - return register(op_name, "FTVMCompute", compute, level) + if not isinstance(fstrategy, GenericFunc): + assert hasattr(fstrategy, "generic_func_node") + fstrategy = fstrategy.generic_func_node + return register(op_name, "FTVMStrategy", fstrategy, level) + + +def register_schedule(op_name, schedule, level=10): + """Register schedule function for an op. + + This is used when compute function is the same for all targets and only + schedule is different. It requires FTVMCompute is already registered to + the op. + + Parameters + ---------- + op_name : str + The name of the op. + + schedule : function (attrs: Attrs, outs: List[Tensor], target:Target) -> Schedule + The schedule function. Need to be target.generic_func. + + level : int + The priority level + """ + fstrategy = _create_fstrategy_from_schedule(op_name, schedule) + return register_strategy(op_name, fstrategy, level) + + +def register_injective_schedule(op_name, level=10): + """Register injective schedule function for an op. + + Parameters + ---------- + op_name : str + The name of the op. + + level : int + The priority level + """ + return register_schedule(op_name, _schedule_injective, level) + + +def register_broadcast_schedule(op_name, level=10): + """Register broadcast schedule function for an op. + + Parameters + ---------- + op_name : str + The name of the op. 
+
+
+def register_injective_schedule(op_name, level=10):
+    """Register injective schedule function for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    level : int
+        The priority level
+    """
+    return register_schedule(op_name, _schedule_injective, level)
+
+
+def register_broadcast_schedule(op_name, level=10):
+    """Register broadcast schedule function for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    level : int
+        The priority level
+    """
+    return register_schedule(op_name, _schedule_injective, level)
+
+
+def register_reduce_schedule(op_name, level=10):
+    """Register reduce schedule function for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    level : int
+        The priority level
+    """
+    return register_schedule(op_name, _schedule_reduce, level)


 def register_alter_op_layout(op_name, alter_layout=None, level=10):
@@ -245,6 +415,7 @@ def register_pattern(op_name, pattern, level=10):
     """
     return register(op_name, "TOpPattern", pattern, level)

+
 def register_gradient(op_name, fgradient=None, level=10):
     """Register operator pattern for an op.

@@ -261,6 +432,7 @@ def register_gradient(op_name, fgradient=None, level=10):
     """
     return register(op_name, "FPrimalGradient", fgradient, level)

+
 def register_shape_func(op_name, data_dependant, shape_func=None, level=10):
     """Register operator shape function for an op.

@@ -290,18 +462,8 @@ def _lower(name, schedule, inputs, outputs):
 def _build(lowered_funcs):
     return build(lowered_funcs, target="llvm")

-
-def schedule_injective(attrs, outputs, target):
-    """Generic schedule for binary broadcast."""
-    with target:
-        return topi.generic.schedule_injective(outputs)
-
-
-def schedule_concatenate(attrs, outputs, target):
-    """Generic schedule for concatinate."""
-    with target:
-        return topi.generic.schedule_concatenate(outputs)
-
+_schedule_injective = None
+_schedule_reduce = None

 __DEBUG_COUNTER__ = 0

diff --git a/python/tvm/relay/op/strategy/__init__.py b/python/tvm/relay/op/strategy/__init__.py
new file mode 100644
index 000000000000..59adf8262664
--- /dev/null
+++ b/python/tvm/relay/op/strategy/__init__.py
@@ -0,0 +1,31 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=wildcard-import
+"""Relay op strategies."""
+from __future__ import absolute_import as _abs
+
+from .generic import *
+from . import x86
+from . import arm_cpu
+from . import cuda
+from . import hls
+from . import mali
+from . import bifrost
+from . import opengl
+from . import rocm
+from . import intel_graphics
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
new file mode 100644
index 000000000000..0945f517970f
--- /dev/null
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -0,0 +1,231 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of ARM CPU operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import re +import logging + +import topi +from .generic import * +from .. import op as _op + +logger = logging.getLogger('strategy') + +@schedule_injective.register("arm_cpu") +def schedule_injective_arm_cpu(_, outs, target): + """schedule injective ops for arm cpu""" + with target: + return topi.arm_cpu.schedule_injective(outs) + +@schedule_concatenate.register("arm_cpu") +def schedule_concatenate_arm_cpu(_, outs, target): + """schedule concatenate for arm cpu""" + with target: + return topi.arm_cpu.schedule_concatenate(outs) + +@conv2d_strategy.register("arm_cpu") +def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv2d arm cpu strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + stride_h, stride_w = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + if kernel_layout == "OIHW": + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.arm_cpu") + # check if winograd algorithm is applicable + _, _, kh, kw = get_const_tuple(kernel.shape) + pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw)) + if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \ + dilation_h == 1 and dilation_w == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.arm_cpu", + plevel=15) + if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack), + name="conv2d_nchw_winograd_nnpack.arm_cpu", + plevel=13) + elif re.match(r"OIHW\d*o", kernel_layout): + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + name="conv2d_nchw_spatial_pack.arm_cpu") + else: + raise RuntimeError("Unsupported weight layout {} for conv2d NCHW". 
+ format(kernel_layout)) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + logger.warning("conv2d_hwcn is not optimized for arm cpu.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_hwcn), + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu") + else: + raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout) + if kernel_layout == "OIHW": + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.arm_cpu") + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack), + name="depthwise_conv2d_nchw_spatial_pack.arm_cpu", + plevel=15) + elif layout == "NHWC": + assert kernel_layout == "HWOI" + logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {} for arm cpu". + format(layout)) + else: # group_conv2d + if layout == 'NCHW': + assert kernel_layout == "OIHW" + logger.warning("group_conv2d with layout NCHW is not optimized for arm cpu.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") + else: + raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu". 
+                           format(layout))
+    return strategy
+
+def wrap_compute_conv2d_winograd_nnpack(topi_compute):
+    """wrap topi compute for conv2d_winograd NNPack"""
+    def _compute_conv2d_nnpack(attrs, inputs, out_type):
+        padding = attrs.get_int_tuple("padding")
+        strides = attrs.get_int_tuple("strides")
+        dilation = attrs.get_int_tuple("dilation")
+        out_dtype = attrs.get_str("out_dtype")
+        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
+        return [topi_compute(inputs[0], inputs[1], None, strides, padding,
+                             dilation, out_dtype)]
+    return _compute_conv2d_nnpack
+
+@conv2d_winograd_without_weight_transfrom_strategy.register("arm_cpu")
+def conv2d_winograd_without_weight_transfrom_strategy_arm_cpu(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transfrom arm cpu strategy"""
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int("groups")
+    layout = attrs.data_layout
+    strides = attrs.get_int_tuple("strides")
+    kernel = inputs[1]
+    assert dilation == (1, 1), "Do not support dilate now"
+    assert strides == (1, 1), "Do not support strides now"
+    assert groups == 1, "Do not support arbitrary group number"
+    strategy = _op.OpStrategy()
+    if layout == "NCHW":
+        if len(kernel.shape) == 5:
+            pad_kh, pad_kw, _, _, _ = get_const_tuple(inputs[1].shape)
+            tile_size = attrs.get_int("tile_size")
+            kh = pad_kh - tile_size + 1
+            kw = pad_kw - tile_size + 1
+            # strides == (1, 1) is already asserted above
+            assert kh == 3 and kw == 3
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
+                wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
+                name="conv2d_nchw_winograd.arm_cpu")
+        elif len(kernel.shape) == 4:
+            # kernel must be packed by winograd nnpack
+            assert "nnpack" in target.libs
+            strategy.add_implementation(
+                wrap_compute_conv2d_winograd_nnpack(
+                    topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform),
+                wrap_topi_schedule(
+                    topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform),
+                name="conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu",
+                plevel=5)
+        else:
+            raise RuntimeError("Unsupported kernel shape: {}".format(kernel.shape))
+    else:
+        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}".
+                           format(layout))
+    return strategy
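+
+# How the plevel values in this file interact (sketch; the impl names and
+# compute/schedule functions below are placeholders): given one OpStrategy with
+#
+#     strategy.add_implementation(compute_a, schedule_a, name="impl_a", plevel=10)
+#     strategy.add_implementation(compute_b, schedule_b, name="impl_b", plevel=15)
+#
+# relay prefers implementations with applicable AutoTVM tuning records and
+# otherwise falls back to the highest plevel ("impl_b" here). This is why the
+# NNPACK winograd variant above registers with plevel=5, below the default 10.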
+
+@conv2d_transpose_strategy.register("arm_cpu")
+def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
+    """conv2d_transpose arm cpu strategy"""
+    layout = attrs.data_layout
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    assert layout == "NCHW", "only support nchw for now"
+    assert dilation == (1, 1), "not support dilate now"
+    assert groups == 1, "only support groups == 1 for now"
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw),
+        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw),
+        name="conv2d_transpose_nchw.arm_cpu")
+    return strategy
+
+@bitserial_conv2d_strategy.register("arm_cpu")
+def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
+    """bitserial_conv2d arm cpu strategy"""
+    strategy = _op.OpStrategy()
+    layout = attrs.data_layout
+    if layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw),
+            wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw),
+            name="bitserial_conv2d_nchw.arm_cpu")
+    elif layout == "NHWC":
+        strategy.add_implementation(
+            wrap_compute_bitserial_conv2d(topi.arm_cpu.bitserial_conv2d_nhwc),
+            wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_conv2d_nhwc),
+            name="bitserial_conv2d_nhwc.arm_cpu")
+    else:
+        raise ValueError("Data layout {} not supported.".format(layout))
+    return strategy
+
+@bitserial_dense_strategy.register("arm_cpu")
+def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
+    """bitserial_dense arm cpu strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense),
+        wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense),
+        name="bitserial_dense.arm_cpu")
+    return strategy
diff --git a/python/tvm/relay/op/strategy/bifrost.py b/python/tvm/relay/op/strategy/bifrost.py
new file mode 100644
index 000000000000..e8f62980a621
--- /dev/null
+++ b/python/tvm/relay/op/strategy/bifrost.py
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Definition of bifrost operator strategy."""
+# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
+import re
+import topi
+from .generic import *
+from .. import op as _op
+
+
+@conv2d_strategy.register("bifrost")
+def conv2d_strategy_bifrost(attrs, inputs, out_type, target):
+    """conv2d mali(bifrost) strategy"""
+    strategy = _op.OpStrategy()
+    data, kernel = inputs
+    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
+    stride_h, stride_w = attrs.get_int_tuple("strides")
+    groups = attrs.groups
+    layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    if dilation_h < 1 or dilation_w < 1:
+        raise ValueError("dilation should be positive value")
+
+    if groups == 1:
+        if layout == "NCHW":
+            if kernel_layout == "OIHW":
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack),
+                    wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack),
+                    name="conv2d_nchw_spatial_pack.bifrost")
+
+                _, _, kh, kw = get_const_tuple(kernel.shape)
+                if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \
+                        dilation_h == 1 and dilation_w == 1:
+                    strategy.add_implementation(
+                        wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd),
+                        wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd),
+                        name="conv2d_nchw_winograd.bifrost",
+                        plevel=15)
+            elif re.match(r"OIHW\d*o", kernel_layout):
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.bifrost.conv2d_nchw_spatial_pack),
+                    wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_spatial_pack),
+                    name="conv2d_nchw_spatial_pack.bifrost")
+        else:
+            raise RuntimeError("Unsupported conv2d layout {} for Mali(Bifrost)".
+                               format(layout))
+    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
+        if layout == "NCHW":
+            assert kernel_layout == "OIHW"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
+                wrap_topi_schedule(topi.bifrost.schedule_depthwise_conv2d_nchw),
+                name="depthwise_conv2d_nchw.bifrost")
+        else:
+            raise RuntimeError("Unsupported depthwise_conv2d layout {} for Mali(Bifrost)".
+                               format(layout))
+    else: # group_conv2d
+        raise RuntimeError("group_conv2d is not supported for Mali(Bifrost)")
+    return strategy
+
+@conv2d_winograd_without_weight_transfrom_strategy.register("bifrost")
+def conv2d_winograd_without_weight_transfrom_strategy_bifrost(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transfrom mali(bifrost) strategy"""
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int("groups")
+    layout = attrs.data_layout
+    strides = attrs.get_int_tuple("strides")
+    assert dilation == (1, 1), "Do not support dilate now"
+    assert strides == (1, 1), "Do not support strides now"
+    assert groups == 1, "Do not support arbitrary group number"
+    strategy = _op.OpStrategy()
+    if layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.bifrost.conv2d_nchw_winograd),
+            wrap_topi_schedule(topi.bifrost.schedule_conv2d_nchw_winograd),
+            name="conv2d_nchw_winograd.bifrost")
+    else:
+        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}".
+ format(layout)) + return strategy + +@dense_strategy.register("bifrost") +def dense_strategy_bifrost(attrs, inputs, out_type, target): + """dense mali(bifrost) strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_dense(topi.bifrost.dense), + wrap_topi_schedule(topi.bifrost.schedule_dense), + name="dense.bifrost") + return strategy diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py new file mode 100644 index 000000000000..b2f559f12131 --- /dev/null +++ b/python/tvm/relay/op/strategy/cuda.py @@ -0,0 +1,398 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of CUDA/GPU operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import topi +from tvm.te import SpecializedCondition +from .generic import * +from .. import op as _op + +@schedule_injective.register(["cuda", "gpu"]) +def schedule_injective_cuda(attrs, outs, target): + """schedule injective ops for cuda""" + with target: + return topi.cuda.schedule_injective(outs) + +@schedule_reduce.register(["cuda", "gpu"]) +def schedule_reduce_cuda(attrs, outs, target): + """schedule reduction ops for cuda""" + with target: + return topi.cuda.schedule_reduce(outs) + +@schedule_concatenate.register(["cuda", "gpu"]) +def schedule_concatenate_cuda(attrs, outs, target): + """schedule concatenate for cuda""" + with target: + return topi.cuda.schedule_injective(outs) + +@schedule_pool.register(["cuda", "gpu"]) +def schedule_pool_cuda(attrs, outs, target): + """schedule pooling ops for cuda""" + with target: + return topi.cuda.schedule_pool(outs, attrs.layout) + +@schedule_pool_grad.register(["cuda", "gpu"]) +def schedule_pool_grad_cuda(attrs, outs, target): + """schedule pooling gradient ops for cuda""" + with target: + return topi.cuda.schedule_pool_grad(outs) + +@schedule_adaptive_pool.register(["cuda", "gpu"]) +def schedule_adaptive_pool_cuda(attrs, outs, target): + """schedule adaptive pooling ops for cuda""" + with target: + return topi.cuda.schedule_adaptive_pool(outs) + +@schedule_softmax.register(["cuda", "gpu"]) +def schedule_softmax_cuda(attrs, outs, target): + """schedule softmax for cuda""" + with target: + return topi.cuda.schedule_softmax(outs) + +@schedule_lrn.register(["cuda", "gpu"]) +def schedule_lrn_cuda(attrs, outs, target): + """schedule LRN for cuda""" + with target: + return topi.cuda.schedule_lrn(outs) + +@conv2d_strategy.register(["cuda", "gpu"]) +def conv2d_strategy_cuda(attrs, inputs, out_type, target): + """conv2d cuda strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + stride_h, stride_w = attrs.get_int_tuple("strides") + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + padding = attrs.get_int_tuple("padding") + groups = 
attrs.groups
+    layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    if dilation_h < 1 or dilation_w < 1:
+        raise ValueError("dilation should be positive value")
+
+    if groups == 1:
+        if layout == "NCHW":
+            # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8.
+            assert kernel_layout == "OIHW"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.conv2d_nchw),
+                wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
+                name="conv2d_nchw.cuda")
+            _, _, kh, kw = get_const_tuple(kernel.shape)
+            if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \
+                    dilation_h == 1 and dilation_w == 1:
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
+                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
+                    name="conv2d_nchw_winograd.cuda",
+                    plevel=15)
+        elif layout == "HWCN":
+            assert kernel_layout == "HWIO"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
+                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
+                name="conv2d_hwcn.cuda")
+        # TODO(@alexgl-github): Re-enable this after fix the conv2d_nhwc for cuda
+        # elif layout == "NHWC":
+        #     assert kernel_layout == "HWIO"
+        #     strategy.add_implementation(
+        #         wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
+        #         wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
+        #         name="conv2d_nhwc.cuda")
+        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
+            assert kernel_layout == "OIHW4o4i"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
+                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
+                name="conv2d_NCHWc_int8.cuda")
+        else:
+            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))
+        # add cudnn implementation
+        if target.target_name == "cuda" and "cudnn" in target.libs:
+            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
+                    padding[1] == padding[3]:
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn, True),
+                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
+                    name="conv2d_cudnn.cuda",
+                    plevel=5)
+    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
+        if layout == "NCHW":
+            assert kernel_layout == "OIHW"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
+                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
+                name="depthwise_conv2d_nchw.cuda")
+        elif layout == "NHWC":
+            assert kernel_layout == "HWOI"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
+                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
+                name="depthwise_conv2d_nhwc.cuda")
+        else:
+            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
+    else: # group_conv2d
+        if layout == 'NCHW':
+            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
+            assert kernel_layout == "OIHW"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
+                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
+                name="group_conv2d_nchw.cuda")
+        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
+            assert kernel_layout == "OIHW4o4i"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
+                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
+                name="group_conv2d_NCHWc_int8.cuda")
+        else:
+            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
+    return strategy
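+
+# Usage sketch for the cuDNN branch above (the target string is illustrative):
+#
+#     target = tvm.target.create("cuda -libs=cudnn")  # "cudnn" lands in target.libs
+#
+# Compiling a conv2d under such a target additionally considers
+# "conv2d_cudnn.cuda" (plevel=5) for NCHW/NHWC layouts with symmetric padding.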
+
+@conv2d_winograd_without_weight_transfrom_strategy.register(["cuda", "gpu"])
+def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transfrom cuda strategy"""
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int("groups")
+    layout = attrs.data_layout
+    assert dilation == (1, 1), "Do not support dilate now"
+    assert groups == 1, "Do not support arbitrary group number"
+    strategy = _op.OpStrategy()
+    if layout == "NCHW":
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
+            wrap_topi_schedule(
+                topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
+            name="conv2d_nchw_winograd_without_weight_transform.cuda")
+    else:
+        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}".
+                           format(layout))
+    return strategy
+
+@deformable_conv2d_strategy.register(["cuda", "gpu"])
+def deformable_conv2d_strategy_cuda(attrs, inputs, out_type, target):
+    """deformable_conv2d cuda strategy"""
+    layout = attrs.data_layout
+    assert layout == "NCHW"
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_deformable_conv2d(topi.cuda.deformable_conv2d_nchw),
+        wrap_topi_schedule(topi.cuda.schedule_deformable_conv2d_nchw),
+        name="deformable_conv2d_nchw.cuda")
+    return strategy
+
+@conv2d_transpose_strategy.register(["cuda", "gpu"])
+def conv2d_transpose_strategy_cuda(attrs, inputs, out_type, target):
+    """conv2d_transpose cuda strategy"""
+    layout = attrs.data_layout
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    assert layout == "NCHW", "only support nchw for now"
+    assert dilation == (1, 1), "not support dilate now"
+    assert groups == 1, "only support groups == 1 for now"
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_conv2d_transpose(topi.cuda.conv2d_transpose_nchw),
+        wrap_topi_schedule(topi.cuda.schedule_conv2d_transpose_nchw),
+        name="conv2d_transpose_nchw.cuda")
+    return strategy
+
+@conv3d_strategy.register(["cuda", "gpu"])
+def conv3d_strategy_cuda(attrs, inputs, out_type, target):
+    """conv3d cuda strategy"""
+    strategy = _op.OpStrategy()
+    layout = attrs.data_layout
+    assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout)
+    if layout == "NCDHW":
+        strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ncdhw),
+                                    wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw),
+                                    name="conv3d_ncdhw.cuda",
+                                    plevel=10)
+    else:  # layout == "NDHWC":
+        strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_ndhwc),
+                                    wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc),
+                                    name="conv3d_ndhwc.cuda",
+                                    plevel=10)
+    if target.target_name == "cuda" and "cudnn" in target.libs:
+        strategy.add_implementation(wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True),
wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), + name="conv3d_cudnn.cuda", + plevel=15) + return strategy + +@conv1d_strategy.register(["cuda", "gpu"]) +def conv1d_strategy_cuda(attrs, inputs, out_type, target): + """conv1d cuda strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + strategy = _op.OpStrategy() + if layout == "NCW": + strategy.add_implementation(wrap_compute_conv1d(topi.cuda.conv1d_ncw), + wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw), + name="conv1d_ncw.cuda") + elif layout == "NWC": + strategy.add_implementation(wrap_compute_conv1d(topi.cuda.conv1d_nwc), + wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc), + name="conv1d_nwc.cuda") + else: + raise ValueError("Unsupported conv1d layout {}".format(layout)) + return strategy + +@conv1d_transpose_strategy.register(["cuda", "gpu"]) +def conv1d_transpose_strategy_cuda(attrs, inputs, out_type, target): + """conv1d_transpose cuda strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCW", "conv1d_transpose ncw only supported" + assert dilation == (1,), "conv1d_transpose dilation is not supported" + assert groups == 1, "conv1d_transpose groups == 1 only supported" + strategy.add_implementation(wrap_compute_conv1d_transpose(topi.cuda.conv1d_transpose_ncw), + wrap_topi_schedule(topi.cuda.schedule_conv1d_transpose_ncw), + name="conv1d_transpose_ncw.cuda") + return strategy + +@dense_strategy.register(["cuda", "gpu"]) +def dense_strategy_cuda(attrs, inputs, out_type, target): + """dense cuda strategy""" + strategy = _op.OpStrategy() + if out_type.dtype == "int8": + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_int8), + wrap_topi_schedule(topi.cuda.schedule_dense_int8), + name="dense_int8.cuda") + else: + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_small_batch), + wrap_topi_schedule(topi.cuda.schedule_dense_small_batch), + name="dense_small_batch.cuda") + b = inputs[0].shape[0] + with SpecializedCondition(b >= 32): + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_large_batch), + wrap_topi_schedule(topi.cuda.schedule_dense_large_batch), + name="dense_large_batch.cuda", + plevel=15) + if target.target_name == "cuda" and "cublas" in target.libs: + strategy.add_implementation( + wrap_compute_dense(topi.cuda.dense_cublas), + wrap_topi_schedule(topi.cuda.schedule_dense_cublas), + name="dense_cublas.cuda", + plevel=20) + return strategy + +@batch_matmul_strategy.register(["cuda", "gpu"]) +def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): + """batch_matmul cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_batch_matmul(topi.nn.batch_matmul), + wrap_topi_schedule(topi.cuda.schedule_batch_matmul), + name="batch_matmul.cuda", + plevel=10) + if target.target_name == "cuda" and "cublas" in target.libs: + strategy.add_implementation( + wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), + wrap_topi_schedule(topi.generic.schedule_extern), + name="batch_matmul_cublas.cuda", + plevel=15) + return strategy + +@argsort_strategy.register(["cuda", "gpu"]) +def argsort_strategy_cuda(attrs, inputs, out_type, target): + """argsort cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_argsort(topi.cuda.argsort), + wrap_topi_schedule(topi.cuda.schedule_argsort), + 
name="argsort.cuda") + return strategy + +@topk_strategy.register(["cuda", "gpu"]) +def topk_strategy_cuda(attrs, inputs, out_type, target): + """topk cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_topk(topi.cuda.topk), + wrap_topi_schedule(topi.cuda.schedule_topk), + name="topk.cuda") + return strategy + +@multibox_prior_strategy.register(["cuda", "gpu"]) +def multibox_prior_strategy_cuda(attrs, inputs, out_type, target): + """multibox_prior cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_multibox_prior(topi.cuda.multibox_prior), + wrap_topi_schedule(topi.cuda.schedule_multibox_prior), + name="multibox_prior.cuda") + return strategy + +@multibox_transform_loc_strategy.register(["cuda", "gpu"]) +def multibox_transform_loc_strategy_cuda(attrs, inputs, out_type, target): + """multibox_transform_loc cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_multibox_transform_loc(topi.cuda.multibox_transform_loc), + wrap_topi_schedule(topi.cuda.schedule_multibox_transform_loc), + name="multibox_transform_loc.cuda") + return strategy + +@get_valid_counts_strategy.register(["cuda", "gpu"]) +def get_valid_counts_strategy_cuda(attrs, inputs, out_type, target): + """get_valid_counts cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_get_valid_counts(topi.cuda.get_valid_counts), + wrap_topi_schedule(topi.cuda.schedule_get_valid_counts), + name="get_valid_counts.cuda") + return strategy + +@nms_strategy.register(["cuda", "gpu"]) +def nms_strategy_cuda(attrs, inputs, out_type, target): + """nms cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_nms(topi.cuda.non_max_suppression), + wrap_topi_schedule(topi.cuda.schedule_nms), + name="nms.cuda") + return strategy + +@roi_align_strategy.register(["cuda", "gpu"]) +def roi_align_strategy_cuda(attrs, inputs, out_type, target): + """roi_align cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.cuda.schedule_roi_align), + name="roi_align_nchw.cuda") + return strategy + +@schedule_roi_pool.register(["cuda", "gpu"]) +def schedule_roi_pool_cuda(attrs, outs, target): + """schedule roi_pool for cuda""" + with target: + return topi.cuda.schedule_roi_pool(outs) + +@proposal_strategy.register(["cuda", "gpu"]) +def proposal_strategy_cuda(attrs, inputs, out_type, target): + """proposal cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_proposal(topi.cuda.proposal), + wrap_topi_schedule(topi.cuda.schedule_proposal), + name="proposal.cuda") + return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py new file mode 100644 index 000000000000..312ce95b2510 --- /dev/null +++ b/python/tvm/relay/op/strategy/generic.py @@ -0,0 +1,749 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of generic operator strategy.""" +# pylint: disable=invalid-name,unused-argument +import logging + +import re +import topi +from topi.util import get_const_int, get_const_float, get_const_tuple, get_float_tuple +from .. import op as _op +from ....target import generic_func, override_native_generic_func + +logger = logging.getLogger('strategy') + +def wrap_topi_schedule(topi_schedule): + """Wrap TOPI schedule which doesn't use attrs""" + def wrapper(attrs, outs, target): + with target: + return topi_schedule(outs) + return wrapper + +def get_conv2d_in_channels(data_shape, data_layout): + """Get conv2d input channels""" + data_shape = get_const_tuple(data_shape) + if len(data_shape) == 4: + idx = data_layout.find("C") + assert idx >= 0, "Invalid conv2d data layout {}".format(data_layout) + return data_shape[idx] + if re.match(r"NCHW\d*c", data_layout): + # NCHW[8]c + return data_shape[1] * data_shape[4] + raise ValueError("Unknown conv2d data layout {}".format(data_layout)) + +def get_conv2d_out_channels(kernel_shape, kernel_layout): + """Get conv2d output channels""" + kernel_shape = get_const_tuple(kernel_shape) + if len(kernel_shape) == 4: + idx = kernel_layout.find("O") + assert idx >= 0, "Invalid conv2d kernel layout {}".format(kernel_layout) + return kernel_shape[idx] + if re.match(r"OIHW\d*i\d*o", kernel_layout): + return kernel_shape[0] * kernel_shape[5] + if re.match(r"OIHW\d*o", kernel_layout): + return kernel_shape[0] * kernel_shape[4] + raise ValueError("Unknown conv2d kernel layout {}".format(kernel_layout)) + +def is_depthwise_conv2d(data_shape, data_layout, kernel_shape, kernel_layout, groups): + ic = get_conv2d_in_channels(data_shape, data_layout) + oc = get_conv2d_out_channels(kernel_shape, kernel_layout) + return ic == oc == groups + +@generic_func +def schedule_injective(attrs, outs, target): + """Schedule injective ops""" + with target: + return topi.generic.schedule_injective(outs) + +@generic_func +def schedule_reduce(attrs, outs, target): + """Schedule reduction ops""" + with target: + return topi.generic.schedule_reduce(outs) + +_op._schedule_injective = schedule_injective +_op._schedule_reduce = schedule_reduce + +# concatenate +@generic_func +def schedule_concatenate(attrs, outs, target): + """Schedule concatenate op""" + with target: + return topi.generic.schedule_injective(outs) + +# pool +@generic_func +def schedule_pool(attrs, outs, target): + """Schedule pooling ops""" + with target: + return topi.generic.schedule_pool(outs, attrs.layout) + +# pool_grad +@generic_func +def schedule_pool_grad(attrs, outs, target): + """Schedule pooling gradient ops""" + with target: + return topi.generic.schedule_pool_grad(outs) + +# adaptive pool +@generic_func +def schedule_adaptive_pool(attrs, outs, target): + """Schedule adaptive pooling ops""" + with target: + return topi.generic.schedule_adaptive_pool(outs) + +# softmax +@generic_func +def schedule_softmax(attrs, outs, target): + """Schedule softmax""" + with target: + return topi.generic.schedule_softmax(outs) + +# lrn +@generic_func +def schedule_lrn(attrs, outs, target): + """Schedule LRN op""" + 
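+    # Entering the target context lets the TOPI generic function
+    # dispatch to the schedule registered for the active target.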
with target: + return topi.generic.schedule_lrn(outs) + +# bitpack +@generic_func +def schedule_bitpack(attrs, outs, target): + """Schedule bitpack""" + with target: + return topi.generic.schedule_bitpack(outs) + +# conv2d +def wrap_compute_conv2d(topi_compute, need_data_layout=False, need_out_layout=False, + has_groups=False): + """Wrap conv2d topi compute""" + def _compute_conv2d(attrs, inputs, out_type): + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + dilation = get_const_tuple(attrs.dilation) + data_layout = attrs.get_str("data_layout") + out_layout = attrs.get_str("out_layout") + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + args = [inputs[0], inputs[1], strides, padding, dilation] + if has_groups: + args.append(attrs.groups) + if need_data_layout: + args.append(data_layout) + if need_out_layout: + args.append(out_layout) + args.append(out_dtype) + return [topi_compute(*args)] + return _compute_conv2d + +@override_native_generic_func("conv2d_strategy") +def conv2d_strategy(attrs, inputs, out_type, target): + """conv2d generic strategy""" + logger.warning("conv2d is not optimized for this platform.") + strategy = _op.OpStrategy() + data, kernel = inputs + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + (dilation_h, dilation_w) = dilation + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_nchw), + name="conv2d_nchw.generic") + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_conv2d_nhwc), + name="conv2d_nhwc.generic") + elif layout == "HWCN": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_hwcn), + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") + else: + raise RuntimeError("Unsupported conv2d layout {}".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.generic") + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") + else: + raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) + return strategy + +# conv2d_NCHWc +@override_native_generic_func("conv2d_NCHWc_strategy") +def conv2d_NCHWc_strategy(attrs, inputs, out_type, target): + """conv2d_NCHWc generic strategy""" + logger.warning("conv2d_NCHWc is 
not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    if inputs[0].dtype == "int8" or inputs[0].dtype == "uint8":
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.nn.conv2d_NCHWc_int8, True, True),
+            wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc_int8),
+            name="conv2d_NCHWc_int8.generic")
+    else:
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True),
+            wrap_topi_schedule(topi.generic.schedule_conv2d_NCHWc),
+            name="conv2d_NCHWc.generic")
+    return strategy
+
+# depthwise_conv2d_NCHWc
+@override_native_generic_func("depthwise_conv2d_NCHWc_strategy")
+def depthwise_conv2d_NCHWc_strategy(attrs, inputs, out_type, target):
+    """depthwise_conv2d generic strategy"""
+    logger.warning("depthwise_conv2d_NCHWc is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_conv2d(topi.nn.depthwise_conv2d_NCHWc, True, True),
+        wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_NCHWc),
+        name="depthwise_conv2d_NCHWc.generic")
+    return strategy
+
+# conv2d_winograd_without_weight_transform
+@override_native_generic_func("conv2d_winograd_without_weight_transform_strategy")
+def conv2d_winograd_without_weight_transfrom_strategy(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transfrom generic strategy"""
+    raise ValueError("No generic implementation for conv2d_winograd_without_weight_transform")
+
+# conv2d_winograd_weight_transform
+@generic_func
+def schedule_conv2d_winograd_weight_transform(attrs, outs, target):
+    """Schedule conv2d_winograd_weight_transform"""
+    with target:
+        return topi.generic.schedule_conv2d_winograd_weight_transform(outs)
+
+# conv2d_winograd_nnpack_weight_transform
+@generic_func
+def schedule_conv2d_winograd_nnpack_weight_transform(attrs, outs, target):
+    """Schedule conv2d_winograd_nnpack_weight_transform"""
+    with target:
+        return topi.generic.schedule_conv2d_winograd_nnpack_weight_transform(outs)
+
+# deformable_conv2d
+def wrap_compute_deformable_conv2d(topi_compute):
+    """wrap deformable_conv2d topi compute"""
+    def _compute_deformable_conv2d(attrs, inputs, out_dtype):
+        assert attrs.data_layout == "NCHW"
+        padding = get_const_tuple(attrs.padding)
+        strides = get_const_tuple(attrs.strides)
+        dilation = get_const_tuple(attrs.dilation)
+        deformable_groups = attrs.deformable_groups
+        groups = attrs.groups
+        out_dtype = attrs.out_dtype
+        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
+        out = topi_compute(inputs[0], inputs[1], inputs[2], strides, padding,
+                           dilation, deformable_groups, groups, out_dtype)
+        return [out]
+    return _compute_deformable_conv2d
+
+@override_native_generic_func("deformable_conv2d_strategy")
+def deformable_conv2d_strategy(attrs, inputs, out_type, target):
+    """deformable_conv2d generic strategy"""
+    logger.warning("deformable_conv2d is not optimized for this platform.")
+    layout = attrs.data_layout
+    assert layout == "NCHW"
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_deformable_conv2d(topi.nn.deformable_conv2d_nchw),
+        wrap_topi_schedule(topi.generic.schedule_deformable_conv2d_nchw),
+        name="deformable_conv2d.generic")
+    return strategy
+
+# conv2d_transpose
+def wrap_compute_conv2d_transpose(topi_compute):
+    """wrap conv2d_transpose topi compute"""
+    def compute_conv2d_transpose(attrs, inputs, out_dtype):
+        """Compute definition of conv2d_transpose"""
+        padding = get_const_tuple(attrs.padding)
+        strides = get_const_tuple(attrs.strides)
+        out_dtype =
attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + out = topi_compute( + inputs[0], inputs[1], strides, padding, out_dtype) + output_padding = get_const_tuple(attrs.output_padding) + out = topi.nn.pad(out, [0, 0, 0, 0], + [0, 0, output_padding[0], output_padding[1]]) + return [out] + return compute_conv2d_transpose + +@override_native_generic_func("conv2d_transpose_strategy") +def conv2d_transpose_strategy(attrs, inputs, out_type, target): + """conv2d_transpose generic strategy""" + logger.warning("conv2d_transpose is not optimized for this platform.") + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.generic") + return strategy + +# conv3d +def wrap_compute_conv3d(topi_compute, need_layout=False): + """wrap conv3d topi compute""" + def _compute_conv3d(attrs, inputs, out_type): + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + + (dilation_d, dilation_h, dilation_w) = dilation + if dilation_d < 1 or dilation_h < 1 or dilation_w < 1: + raise ValueError("Dilation should be positive value") + if groups != 1: + raise ValueError("Not support arbitrary group number for conv3d") + if need_layout: + out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, + layout, out_dtype) + else: + out = topi_compute(inputs[0], inputs[1], strides, padding, dilation, + out_dtype) + return [out] + return _compute_conv3d + +@override_native_generic_func("conv3d_strategy") +def conv3d_strategy(attrs, inputs, out_type, target): + """conv3d generic strategy""" + logger.warning("conv3d is not optimized for this platform.") + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCDHW": + strategy.add_implementation( + wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.generic") + elif layout == "NDHWC": + strategy.add_implementation( + wrap_compute_conv3d(topi.nn.conv3d_ndhwc), + wrap_topi_schedule(topi.generic.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.generic") + else: + raise ValueError("Not support this layout {} yet".format(layout)) + return strategy + +# conv1d +def wrap_compute_conv1d(topi_compute): + """wrap conv1d topi compute""" + def _compute_conv1d(attrs, inputs, out_type): + """Compute definition of conv1d""" + strides = get_const_tuple(attrs.strides) + padding = get_const_tuple(attrs.padding) + dilation = get_const_tuple(attrs.dilation) + out_dtype = attrs.out_dtype + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") + else out_dtype) + return [topi_compute(inputs[0], inputs[1], strides, padding, dilation, + out_dtype)] + return _compute_conv1d + +@override_native_generic_func("conv1d_strategy") +def conv1d_strategy(attrs, inputs, out_type, target): + """conv1d generic strategy""" + logger.warning("conv1d is not optimized for this platform.") + layout = attrs.data_layout + dilation = 
get_const_tuple(attrs.dilation)
+    if dilation[0] < 1:
+        raise ValueError("dilation should be a positive value")
+    strategy = _op.OpStrategy()
+    if layout == "NCW":
+        strategy.add_implementation(
+            wrap_compute_conv1d(topi.nn.conv1d_ncw),
+            wrap_topi_schedule(topi.generic.schedule_conv1d_ncw),
+            name="conv1d_ncw.generic")
+    elif layout == "NWC":
+        strategy.add_implementation(
+            wrap_compute_conv1d(topi.nn.conv1d_nwc),
+            wrap_topi_schedule(topi.generic.schedule_conv1d_nwc),
+            name="conv1d_nwc.generic")
+    else:
+        raise ValueError("Unsupported conv1d layout {}".format(layout))
+    return strategy
+
+# conv1d_transpose
+def wrap_compute_conv1d_transpose(topi_compute):
+    """wrap conv1d_transpose topi compute"""
+    def _compute_conv1d_transpose(attrs, inputs, out_type):
+        padding = get_const_tuple(attrs.padding)
+        strides = get_const_tuple(attrs.strides)
+        out_dtype = attrs.out_dtype
+        out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype)
+        out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype)
+        output_padding = get_const_tuple(attrs.output_padding)
+        out = topi.nn.pad(out, [0, 0, 0], [0, 0, output_padding[0]])
+        return [out]
+    return _compute_conv1d_transpose
+
+@override_native_generic_func("conv1d_transpose_strategy")
+def conv1d_transpose_strategy(attrs, inputs, out_type, target):
+    """conv1d_transpose generic strategy"""
+    logger.warning("conv1d_transpose is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    layout = attrs.data_layout
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    assert layout == "NCW", "conv1d_transpose ncw only supported"
+    assert dilation == (1,), "conv1d_transpose dilation is not supported"
+    assert groups == 1, "conv1d_transpose groups == 1 only supported"
+    strategy.add_implementation(wrap_compute_conv1d_transpose(topi.nn.conv1d_transpose_ncw),
+                                wrap_topi_schedule(topi.generic.schedule_conv1d_transpose_ncw),
+                                name="conv1d_transpose_ncw.generic")
+    return strategy
+
+# dense
+def wrap_compute_dense(topi_compute):
+    """wrap dense topi compute"""
+    def _compute_dense(attrs, inputs, out_type):
+        """Compute definition of dense"""
+        out_dtype = attrs.out_dtype
+        out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype
+        return [topi_compute(inputs[0], inputs[1], None, out_dtype)]
+    return _compute_dense
+
+@override_native_generic_func("dense_strategy")
+def dense_strategy(attrs, inputs, out_type, target):
+    """dense generic strategy"""
+    logger.warning("dense is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(wrap_compute_dense(topi.nn.dense),
+                                wrap_topi_schedule(topi.generic.schedule_dense),
+                                name="dense.generic")
+    return strategy
+
+# batch_matmul
+def wrap_compute_batch_matmul(topi_compute):
+    """wrap batch_matmul topi compute"""
+    def _compute_batch_matmul(attrs, inputs, out_type):
+        return [topi_compute(inputs[0], inputs[1])]
+    return _compute_batch_matmul
+
+@override_native_generic_func("batch_matmul_strategy")
+def batch_matmul_strategy(attrs, inputs, out_type, target):
+    """batch_matmul generic strategy"""
+    logger.warning("batch_matmul is not optimized for this platform.")
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(wrap_compute_batch_matmul(topi.nn.batch_matmul),
+                                wrap_topi_schedule(topi.generic.schedule_batch_matmul),
+                                name="batch_matmul.generic")
+    return strategy
+
+# sparse_dense
+@generic_func
+def schedule_sparse_dense(attrs, outs, target):
+    """schedule sparse_dense"""
+    with target:
+        return
topi.generic.schedule_sparse_dense(outs) + +# sparse_transpose +@generic_func +def schedule_sparse_transpose(attrs, outs, target): + """schedule sparse_transpose""" + with target: + return topi.generic.schedule_sparse_transpose(outs) + +# argsort +def wrap_compute_argsort(topi_compute): + """Wrap argsort topi compute""" + def _compute_argsort(attrs, inputs, _): + axis = get_const_int(attrs.axis) + is_ascend = bool(get_const_int(attrs.is_ascend)) + dtype = attrs.dtype + return [topi_compute(inputs[0], axis=axis, is_ascend=is_ascend, dtype=dtype)] + return _compute_argsort + +@override_native_generic_func("argsort_strategy") +def argsort_strategy(attrs, inputs, out_type, target): + """argsort generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_argsort(topi.argsort), + wrap_topi_schedule(topi.generic.schedule_argsort), + name="argsort.generic") + return strategy + +# topk +def wrap_compute_topk(topi_compute): + """Wrap topk compute""" + def _compute_topk(attrs, inputs, out_type): + k = get_const_int(attrs.k) + axis = get_const_int(attrs.axis) + ret_type = attrs.ret_type + is_ascend = bool(get_const_int(attrs.is_ascend)) + dtype = attrs.dtype + out = topi_compute(inputs[0], k, axis, ret_type, is_ascend, dtype) + out = out if isinstance(out, list) else [out] + return out + return _compute_topk + +@override_native_generic_func("topk_strategy") +def topk_strategy(attrs, inputs, out_type, target): + """topk generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_topk(topi.topk), + wrap_topi_schedule(topi.generic.schedule_topk), + name="topk.generic") + return strategy + +# multibox_prior +def wrap_compute_multibox_prior(topi_compute): + """Wrap multibox_prior compute""" + def _compute_multibox_prior(attrs, inputs, _): + """Compute definition of multibox_prior""" + sizes = get_float_tuple(attrs.sizes) + ratios = get_float_tuple(attrs.ratios) + steps = get_float_tuple(attrs.steps) + offsets = get_float_tuple(attrs.offsets) + clip = bool(get_const_int(attrs.clip)) + return [topi_compute(inputs[0], sizes, ratios, steps, offsets, clip)] + return _compute_multibox_prior + +@override_native_generic_func("multibox_prior_strategy") +def multibox_prior_strategy(attrs, inputs, out_type, target): + """multibox_prior generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_multibox_prior(topi.vision.ssd.multibox_prior), + wrap_topi_schedule(topi.generic.schedule_multibox_prior), + name="multibox_prior.generic") + return strategy + +# multibox_transform_loc +def wrap_compute_multibox_transform_loc(topi_compute): + """Wrap multibox_transform_loc compute""" + def _compute_multibox_transform_loc(attrs, inputs, _): + """Compute definition of multibox_detection""" + clip = bool(get_const_int(attrs.clip)) + threshold = get_const_float(attrs.threshold) + variances = get_float_tuple(attrs.variances) + return topi_compute( + inputs[0], inputs[1], inputs[2], clip, threshold, variances) + return _compute_multibox_transform_loc + +@override_native_generic_func("multibox_transform_loc_strategy") +def multibox_transform_loc_strategy(attrs, inputs, out_type, target): + """schedule multibox_transform_loc""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_multibox_transform_loc(topi.vision.ssd.multibox_transform_loc), + wrap_topi_schedule(topi.generic.schedule_multibox_transform_loc), + name="multibox_transform_loc.generic") + return strategy + +# get_valid_counts +def 
wrap_compute_get_valid_counts(topi_compute): + """wrap get_valid_counts topi compute""" + def _compute_get_valid_counts(attrs, inputs, out_type): + score_threshold = get_const_float(attrs.score_threshold) + id_index = get_const_int(attrs.id_index) + score_index = get_const_int(attrs.score_index) + return topi_compute(inputs[0], score_threshold, id_index, score_index) + return _compute_get_valid_counts + +@override_native_generic_func("get_valid_counts_strategy") +def get_valid_counts_strategy(attrs, inputs, out_type, target): + """get_valid_counts generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_get_valid_counts(topi.vision.get_valid_counts), + wrap_topi_schedule(topi.generic.schedule_get_valid_counts), + name="get_valid_counts.generic") + return strategy + +# non-maximum suppression +def wrap_compute_nms(topi_compute): + """wrap nms topi compute""" + def _compute_nms(attrs, inputs, out_type): + return_indices = bool(get_const_int(attrs.return_indices)) + max_output_size = get_const_int(attrs.max_output_size) + iou_threshold = get_const_float(attrs.iou_threshold) + force_suppress = bool(get_const_int(attrs.force_suppress)) + top_k = get_const_int(attrs.top_k) + coord_start = get_const_int(attrs.coord_start) + score_index = get_const_int(attrs.score_index) + id_index = get_const_int(attrs.id_index) + invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) + return [topi_compute(inputs[0], inputs[1], max_output_size, iou_threshold, + force_suppress, top_k, coord_start, score_index, + id_index, return_indices, invalid_to_bottom)] + return _compute_nms + +@override_native_generic_func("non_max_suppression_strategy") +def nms_strategy(attrs, inputs, out_type, target): + """nms generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_nms(topi.vision.non_max_suppression), + wrap_topi_schedule(topi.generic.schedule_nms), + name="nms.generic") + return strategy + +# roi_align +def wrap_compute_roi_align(topi_compute): + """wrap roi_align topi compute""" + def _compute_roi_align(attrs, inputs, out_type): + assert attrs.layout == "NCHW" + pooled_size = get_const_tuple(attrs.pooled_size) + return [topi_compute(inputs[0], inputs[1], + pooled_size=pooled_size, + spatial_scale=attrs.spatial_scale, + sample_ratio=attrs.sample_ratio)] + return _compute_roi_align + +@override_native_generic_func("roi_align_strategy") +def roi_align_strategy(attrs, inputs, out_type, target): + """roi_align generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.generic") + return strategy + +# roi_pool +@generic_func +def schedule_roi_pool(attrs, outs, target): + """schedule roi_pool""" + with target: + return topi.generic.schedule_roi_pool(outs) + +# proposal +def wrap_compute_proposal(topi_compute): + """wrap proposal topi compute""" + def _compute_proposal(attrs, inputs, out_type): + scales = get_float_tuple(attrs.scales) + ratios = get_float_tuple(attrs.ratios) + feature_stride = attrs.feature_stride + threshold = attrs.threshold + rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n + rpn_post_nms_top_n = attrs.rpn_post_nms_top_n + rpn_min_size = attrs.rpn_min_size + iou_loss = bool(get_const_int(attrs.iou_loss)) + return [topi_compute(inputs[0], inputs[1], inputs[2], scales, ratios, + feature_stride, threshold, rpn_pre_nms_top_n, + rpn_post_nms_top_n, rpn_min_size, iou_loss)] + return 
_compute_proposal + +@override_native_generic_func("proposal_strategy") +def proposal_strategy(attrs, inputs, out_type, target): + """proposal generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_proposal(topi.vision.rcnn.proposal), + wrap_topi_schedule(topi.generic.schedule_proposal), + name="proposal.generic") + return strategy + +# argwhere +@generic_func +def schedule_argwhere(attrs, outs, target): + """schedule argwhere""" + with target: + return topi.generic.schedule_argwhere(outs) + +# bitserial_conv2d +def wrap_compute_bitserial_conv2d(topi_compute): + """wrap bitserial_conv2d topi compute""" + def compute_bitserial_conv2d(attrs, inputs, out_dtype): + """Compute definition for bitserial conv2d.""" + padding = get_const_tuple(attrs.padding) + strides = get_const_tuple(attrs.strides) + activation_bits = attrs.activation_bits + weight_bits = attrs.weight_bits + pack_dtype = attrs.pack_dtype + out_dtype = attrs.out_dtype + unipolar = attrs.unipolar + return [topi_compute(inputs[0], inputs[1], strides, padding, activation_bits, + weight_bits, pack_dtype, out_dtype, unipolar)] + return compute_bitserial_conv2d + +@override_native_generic_func("bitserial_conv2d_strategy") +def bitserial_conv2d_strategy(attrs, inputs, out_type, target): + """bitserial_conv2d generic strategy""" + logger.warning("bitserial_conv2d is not optimized for this platform.") + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.generic") + elif layout == "NHWC": + strategy.add_implementation( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.generic") + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy + +# bitserial_dense +def wrap_compute_bitserial_dense(topi_compute): + """wrap bitserial_dense topi compute""" + def compute_bitserial_dense(attrs, inputs, out_type): + """Compute definition of bitserial dense""" + data_bits = attrs.data_bits + weight_bits = attrs.weight_bits + pack_dtype = attrs.pack_dtype + out_dtype = attrs.out_dtype + out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype + unipolar = attrs.unipolar + return [topi_compute(inputs[0], inputs[1], data_bits, weight_bits, + pack_dtype, out_dtype, unipolar)] + return compute_bitserial_dense + +@override_native_generic_func("bitserial_dense_strategy") +def bitserial_dense_strategy(attrs, inputs, out_type, target): + """bitserial_dense generic strategy""" + logger.warning("bitserial_dense is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_bitserial_dense(topi.nn.bitserial_dense), + wrap_topi_schedule(topi.generic.schedule_bitserial_dense), + name="bitserial_dense.generic") + return strategy diff --git a/python/tvm/relay/op/strategy/hls.py b/python/tvm/relay/op/strategy/hls.py new file mode 100644 index 000000000000..514902b86833 --- /dev/null +++ b/python/tvm/relay/op/strategy/hls.py @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of HLS operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import topi +from .generic import * +from .. import op as _op + +@schedule_injective.register("hls") +def schedule_injective_hls(attrs, outs, target): + """schedule injective ops for hls""" + with target: + return topi.hls.schedule_injective(outs) + +@schedule_reduce.register("hls") +def schedule_reduce_hls(attrs, outs, target): + """schedule reduction ops for hls""" + with target: + return topi.hls.schedule_reduce(outs) + +@schedule_concatenate.register("hls") +def schedule_concatenate_hls(attrs, outs, target): + """schedule concatenate for hls""" + with target: + return topi.hls.schedule_injective(outs) + +@schedule_pool.register("hls") +def schedule_pool_hls(attrs, outs, target): + """schedule pooling ops for hls""" + with target: + return topi.hls.schedule_pool(outs, attrs.layout) + +@schedule_adaptive_pool.register("hls") +def schedule_adaptive_pool_hls(attrs, outs, target): + """schedule adaptive pooling ops for hls""" + with target: + return topi.hls.schedule_adaptive_pool(outs) + +@schedule_softmax.register("hls") +def schedule_softmax_hls(attrs, outs, target): + """schedule softmax for hls""" + with target: + return topi.hls.schedule_softmax(outs) + +@override_native_generic_func("conv2d_strategy") +def conv2d_strategy_hls(attrs, inputs, out_type, target): + """conv2d hls strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + (dilation_h, dilation_w) = dilation + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nchw), + wrap_topi_schedule(topi.hls.schedule_conv2d_nchw), + name="conv2d_nchw.hls") + elif layout == "NHWC": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.hls.schedule_conv2d_nhwc), + name="conv2d_nhwc.hls") + else: + raise RuntimeError("Unsupported conv2d layout {}".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.hls") + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.hls.schedule_depthwise_conv2d_nhwc), + name="depthwise_nhwc.hls") + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + raise 
RuntimeError("group_conv2d is not supported for hls") + return strategy + +@override_native_generic_func("conv2d_NCHWc_strategy") +def conv2d_NCHWc_strategy_hls(attrs, inputs, out_type, target): + """conv2d_NCHWc hls strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.hls.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.hls") + return strategy + +@conv2d_transpose_strategy.register("hls") +def conv2d_transpose_strategy_hls(attrs, inputs, out_type, target): + """conv2d_transpose hls strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.nn.conv2d_transpose_nchw), + wrap_topi_schedule(topi.hls.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.hls") + return strategy + +@dense_strategy.register("hls") +def dense_strategy_hls(attrs, inputs, out_type, target): + """dense hls strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.hls.schedule_dense), + name="dense.hls") + return strategy + +@bitserial_conv2d_strategy.register("hls") +def bitserial_conv2d_strategy_hls(attrs, inputs, out_type, target): + """bitserial_conv2d hls strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.hls") + elif layout == "NHWC": + strategy.add_implementation( + wrap_compute_bitserial_conv2d(topi.nn.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.hls.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.hls") + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy diff --git a/python/tvm/relay/op/strategy/intel_graphics.py b/python/tvm/relay/op/strategy/intel_graphics.py new file mode 100644 index 000000000000..0ea8d85e3530 --- /dev/null +++ b/python/tvm/relay/op/strategy/intel_graphics.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of x86 operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import topi +from .generic import * +from .. 
import op as _op + + +@conv2d_strategy.register("intel_graphics") +def conv2d_strategy_intel_graphics(attrs, inputs, out_type, target): + """conv2d intel graphics strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.intel_graphics.conv2d_nchw), + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_nchw), + name="conv2d_nchw.intel_graphics") + # conv2d_NCHWc won't work without alter op layout pass + # TODO(@Laurawly): fix this + strategy.add_implementation( + wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.intel_graphics", + plevel=5) + else: + raise RuntimeError("Unsupported conv2d layout {} for intel graphics". + format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.intel_graphics.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.intel_graphics.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.intel_graphics") + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + raise RuntimeError("group_conv2d is not supported for intel graphics") + return strategy + +@conv2d_NCHWc_strategy.register("intel_graphics") +def conv2d_NCHWc_strategy_intel_graphics(attrs, inputs, out_type, target): + """conv2d_NCHWc intel_graphics strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_conv2d(topi.intel_graphics.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.intel_graphics.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.intel_graphics") + return strategy diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py new file mode 100644 index 000000000000..8f1fa291d236 --- /dev/null +++ b/python/tvm/relay/op/strategy/mali.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of mali operator strategy.""" +# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import re +import topi +from .generic import * +from .. 
import op as _op
+
+@conv2d_strategy.register("mali")
+def conv2d_strategy_mali(attrs, inputs, out_type, target):
+    """conv2d mali strategy"""
+    strategy = _op.OpStrategy()
+    data, kernel = inputs
+    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
+    stride_h, stride_w = attrs.get_int_tuple("strides")
+    groups = attrs.groups
+    layout = attrs.data_layout
+    kernel_layout = attrs.kernel_layout
+    if dilation_h < 1 or dilation_w < 1:
+        raise ValueError("dilation should be positive value")
+
+    if groups == 1:
+        if layout == "NCHW":
+            if kernel_layout == "OIHW":
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
+                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
+                    name="conv2d_nchw_spatial_pack.mali")
+                # check if winograd algorithm is applicable
+                _, _, kh, kw = get_const_tuple(kernel.shape)
+                if kh == 3 and kw == 3 and stride_h == 1 and stride_w == 1 and \
+                    dilation_h == 1 and dilation_w == 1:
+                    strategy.add_implementation(
+                        wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
+                        wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
+                        name="conv2d_nchw_winograd.mali",
+                        plevel=15)
+            elif re.match(r"OIHW\d*o", kernel_layout):
+                strategy.add_implementation(
+                    wrap_compute_conv2d(topi.mali.conv2d_nchw_spatial_pack),
+                    wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_spatial_pack),
+                    name="conv2d_nchw_spatial_pack.mali")
+            else:
+                raise RuntimeError("Unsupported weight layout {} for conv2d NCHW".
+                                   format(kernel_layout))
+        else:
+            raise RuntimeError("Unsupported conv2d layout {} for mali".format(layout))
+    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
+        if layout == "NCHW":
+            assert kernel_layout == "OIHW"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.mali.depthwise_conv2d_nchw),
+                wrap_topi_schedule(topi.mali.schedule_depthwise_conv2d_nchw),
+                name="depthwise_conv2d_nchw.mali")
+        else:
+            raise RuntimeError("Unsupported depthwise_conv2d layout {} for mali".format(layout))
+    else: # group_conv2d
+        raise RuntimeError("group_conv2d is not supported for mali")
+    return strategy
+
+@conv2d_winograd_without_weight_transfrom_strategy.register("mali")
+def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_type, target):
+    """conv2d_winograd_without_weight_transfrom mali strategy"""
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int("groups")
+    layout = attrs.data_layout
+    strides = attrs.get_int_tuple("strides")
+    kernel = inputs[1]
+    assert dilation == (1, 1), "Do not support dilate now"
+    assert strides == (1, 1), "Do not support strides now"
+    assert groups == 1, "Do not support arbitrary group number"
+    strategy = _op.OpStrategy()
+    if layout == "NCHW":
+        assert len(kernel.shape) == 5, "Kernel must be packed into 5-dim"
+        strategy.add_implementation(
+            wrap_compute_conv2d(topi.mali.conv2d_nchw_winograd),
+            wrap_topi_schedule(topi.mali.schedule_conv2d_nchw_winograd),
+            name="conv2d_nchw_winograd.mali")
+    else:
+        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}".
+                           format(layout))
+    return strategy
+
+@dense_strategy.register("mali")
+def dense_strategy_mali(attrs, inputs, out_type, target):
+    """dense mali strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(wrap_compute_dense(topi.mali.dense),
+                                wrap_topi_schedule(topi.mali.schedule_dense),
+                                name="dense.mali")
+    return strategy
diff --git a/python/tvm/relay/op/strategy/opengl.py b/python/tvm/relay/op/strategy/opengl.py
new file mode 100644
index 000000000000..45e290c50e0f
--- /dev/null
+++ b/python/tvm/relay/op/strategy/opengl.py
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Definition of OpenGL operator strategy."""
+# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
+import topi
+from .generic import *
+from .. import op as _op
+
+@schedule_injective.register("opengl")
+def schedule_injective_opengl(attrs, outs, target):
+    """schedule injective ops for opengl"""
+    with target:
+        return topi.opengl.schedule_injective(outs)
+
+@schedule_concatenate.register("opengl")
+def schedule_concatenate_opengl(attrs, outs, target):
+    """schedule concatenate for opengl"""
+    with target:
+        return topi.opengl.schedule_injective(outs)
+
+@schedule_pool.register("opengl")
+def schedule_pool_opengl(attrs, outs, target):
+    """schedule pooling ops for opengl"""
+    with target:
+        return topi.opengl.schedule_pool(outs, attrs.layout)
+
+@schedule_adaptive_pool.register("opengl")
+def schedule_adaptive_pool_opengl(attrs, outs, target):
+    """schedule adaptive pooling ops for opengl"""
+    with target:
+        return topi.opengl.schedule_adaptive_pool(outs)
+
+@schedule_softmax.register("opengl")
+def schedule_softmax_opengl(attrs, outs, target):
+    """schedule softmax for opengl"""
+    with target:
+        return topi.opengl.schedule_softmax(outs)
+
+@conv2d_strategy.register("opengl")
+def conv2d_strategy_opengl(attrs, inputs, out_type, target):
+    """conv2d opengl strategy"""
+    strategy = _op.OpStrategy()
+    groups = attrs.groups
+    layout = attrs.data_layout
+    assert groups == 1, "Don't support group conv2d on OpenGL"
+    assert layout == "NCHW", "Only support conv2d layout NCHW for OpenGL"
+    strategy.add_implementation(wrap_compute_conv2d(topi.nn.conv2d),
+                                wrap_topi_schedule(topi.opengl.schedule_conv2d_nchw),
+                                name="conv2d_nchw.opengl")
+    return strategy
+
+@dense_strategy.register("opengl")
+def dense_strategy_opengl(attrs, inputs, out_type, target):
+    """dense opengl strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(wrap_compute_dense(topi.nn.dense),
+                                wrap_topi_schedule(topi.opengl.schedule_dense),
+                                name="dense.opengl")
+    return strategy
diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py
new file mode 100644
index 000000000000..e11a688c1398
--- /dev/null
+++
b/python/tvm/relay/op/strategy/rocm.py @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Definition of ROCm operator strategy.""" +# pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import +import topi +from .generic import * +from .. import op as _op + +@schedule_lrn.register("rocm") +def schedule_lrn_rocm(attrs, outs, target): + """schedule LRN for rocm""" + with target: + return topi.rocm.schedule_lrn(outs) + +@conv2d_strategy.register("rocm") +def conv2d_strategy_rocm(attrs, inputs, out_type, target): + """conv2d rocm strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = attrs.get_int_tuple("dilation") + groups = attrs.groups + layout = attrs.data_layout + stride_h, stride_w = attrs.get_int_tuple("strides") + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + # TODO(@vinx13, @icemelon9): Use conv2d_NCHWc_int8 when dtype is int8/uint8. 
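+            # ROCm has no dedicated NCHW conv2d here; the CUDA TOPI compute
+            # and schedule are reused, with MIOpen added further below.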
+ assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.cuda.conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), + name="conv2d_nchw.cuda") + _, _, kh, kw = get_const_tuple(kernel.shape) + if kh <= 7 and kw <= 7 and kh == kw and stride_h == 1 and stride_w == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), + wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), + name="conv2d_nchw_winograd.cuda", + plevel=15) + elif layout == "HWCN": + assert kernel_layout == "HWIO" + strategy.add_implementation( + wrap_compute_conv2d(topi.cuda.conv2d_hwcn), + wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn), + name="conv2d_hwcn.cuda") + # TODO(@alexgl-github): Re-enable this after fix the conv2d_nhwc for cuda + # elif layout == "NHWC": + # assert kernel_layout == "HWIO" + # strategy.add_implementation( + # wrap_compute_conv2d(topi.cuda.conv2d_nhwc), + # wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc), + # name="conv2d_nhwc.cuda") + elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]: + assert kernel_layout == "OIHW4o4i" + strategy.add_implementation( + wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True), + wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8), + name="conv2d_NCHWc_int8.cuda") + else: + raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout)) + # add miopen implementation + if "miopen" in target.libs: + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_conv2d(topi.rocm.conv2d_nchw_miopen, True), + wrap_topi_schedule(topi.rocm.schedule_conv2d_nchw_miopen), + name="conv2d_nchw_miopen.rocm", + plevel=15) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + strategy.add_implementation( + wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.cuda") + elif layout == "NHWC": + assert kernel_layout == "HWOI" + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.cuda") + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8. 
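+            # Group conv2d on ROCm likewise reuses the CUDA kernels.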
+            assert kernel_layout == "OIHW"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
+                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
+                name="group_conv2d_nchw.cuda")
+        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
+            assert kernel_layout == "OIHW4o4i"
+            strategy.add_implementation(
+                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
+                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
+                name="group_conv2d_NCHWc_int8.cuda")
+        else:
+            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
+    return strategy
+
+@dense_strategy.register("rocm")
+def dense_strategy_rocm(attrs, inputs, out_type, target):
+    """Dense strategy for ROCm"""
+    strategy = _op.OpStrategy()
+    assert len(inputs[0].shape) == 2 and len(inputs[1].shape) == 2, "Only support 2-dim dense"
+
+    strategy.add_implementation(
+        wrap_compute_dense(topi.rocm.dense),
+        wrap_topi_schedule(topi.rocm.schedule_dense),
+        name="dense.rocm")
+    if target.target_name == "rocm" and "rocblas" in target.libs:
+        assert out_type.dtype == inputs[0].dtype, "Mixed precision not supported."
+        strategy.add_implementation(
+            wrap_compute_dense(topi.rocm.dense_rocblas),
+            wrap_topi_schedule(topi.rocm.schedule_dense_rocblas),
+            name="dense_rocblas.rocm",
+            plevel=5)
+    return strategy
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
new file mode 100644
index 000000000000..2fadb7f08dcd
--- /dev/null
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -0,0 +1,302 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Definition of x86 operator strategy."""
+# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
+import logging
+
+import topi
+from tvm.te import SpecializedCondition
+from .generic import *
+from ..
import op as _op + +logger = logging.getLogger('strategy') + +@schedule_injective.register("cpu") +def schedule_injective_cpu(attrs, outs, target): + """schedule injective ops for x86""" + with target: + return topi.x86.schedule_injective(outs) + +@schedule_reduce.register("cpu") +def schedule_reduce_cpu(attrs, outs, target): + """schedule reduction ops for x86""" + with target: + return topi.x86.schedule_reduce(outs) + +@schedule_concatenate.register("cpu") +def schedule_concatenate_cpu(attrs, outs, target): + """schedule concatenate op for x86""" + with target: + return topi.x86.schedule_concatenate(outs) + +@schedule_pool.register("cpu") +def schedule_pool_cpu(attrs, outs, target): + """schedule pooling ops for x86""" + with target: + return topi.x86.schedule_pool(outs, attrs.layout) + +@schedule_adaptive_pool.register("cpu") +def schedule_adaptive_pool_cpu(attrs, outs, target): + """schedule adaptive pooling ops for x86""" + with target: + return topi.x86.schedule_adaptive_pool(outs) + +@schedule_softmax.register("cpu") +def schedule_softmax_cpu(attrs, outs, target): + """schedule softmax for x86""" + with target: + return topi.x86.schedule_softmax(outs) + +@conv2d_strategy.register("cpu") +def conv2d_strategy_cpu(attrs, inputs, out_type, target): + """conv2d x86 strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + dilation_h, dilation_w = get_const_tuple(attrs.dilation) + groups = attrs.groups + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + if dilation_h < 1 or dilation_w < 1: + raise ValueError("dilation should be positive value") + + if groups == 1: + if layout == "NCHW": + assert kernel_layout == "OIHW" + if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_nchw_int8), + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8), + name="conv2d_nchw_int8.x86") + else: + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw), + name="conv2d_nchw.x86") + elif layout == "NHWC": + assert kernel_layout == "HWIO" + logger.warning("For x86 target, NCHW layout is recommended for conv2d.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc), + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), + name="conv2d_nhwc.x86") + elif layout == "HWCN": + assert kernel_layout == "HWIO" + logger.warning("conv2d HWCN layout is not optimized for x86.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_hwcn), + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") + else: + raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout)) + elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): + if layout == "NCHW": + assert kernel_layout == "OIHW" + channel_multiplier = get_const_tuple(inputs[1].shape)[1] + if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1: + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.x86") + else: + logger.warning("For x86 target, depthwise_conv2d with channel " + "multiplier greater than 1 is not optimized") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nchw), + name="depthwise_conv2d_nchw.generic") + elif layout == "NHWC": + assert kernel_layout == "HWOI" + 
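+            # No x86-specific NHWC depthwise schedule exists, so the
+            # generic implementation is registered instead.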
logger.warning("depthwise_conv2d NHWC layout is not optimized for x86.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") + else: + raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) + else: # group_conv2d + if layout == 'NCHW': + assert kernel_layout == "OIHW" + logger.warning("group_conv2d is not optimized for x86.") + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") + else: + raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) + return strategy + +@conv2d_NCHWc_strategy.register("cpu") +def conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): + """conv2d_NCHWc x86 strategy""" + strategy = _op.OpStrategy() + data, kernel = inputs + if topi.x86.is_int8_hw_support(data.dtype, kernel.dtype): + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_NCHWc_int8, True, True), + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc_int8), + name="conv2d_NCHWc_int8.x86") + else: + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc), + name="conv2d_NCHWc.x86") + return strategy + +@depthwise_conv2d_NCHWc_strategy.register("cpu") +def depthwise_conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target): + """depthwise_conv2d x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True), + wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc), + name="depthwise_conv2d_NCHWc.x86") + return strategy + +@conv2d_transpose_strategy.register("cpu") +def conv2d_transpose_strategy_cpu(attrs, inputs, out_type, target): + """conv2d_transpose x86 strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + groups = attrs.groups + assert layout == "NCHW", "only support nchw for now" + assert dilation == (1, 1), "not support dilate now" + assert groups == 1, "only support groups == 1 for now" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_conv2d_transpose(topi.x86.conv2d_transpose_nchw), + wrap_topi_schedule(topi.x86.schedule_conv2d_transpose_nchw), + name="conv2d_transpose_nchw.x86") + return strategy + +@conv3d_strategy.register("cpu") +def conv3d_strategy_cpu(attrs, inputs, out_type, target): + """conv3d generic strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCDHW": + logger.warning("conv3d with layout NCDHW is not optimized for x86.") + strategy.add_implementation(wrap_compute_conv3d(topi.nn.conv3d_ncdhw), + wrap_topi_schedule(topi.generic.schedule_conv3d_ncdhw), + name="conv3d_ncdhw.generic") + elif layout == "NDHWC": + strategy.add_implementation(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), + wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.x86") + else: + raise ValueError("Not support this layout {} yet".format(layout)) + return strategy + +@conv1d_strategy.register("cpu") +def conv1d_strategy_cpu(attrs, inputs, out_type, target): + """conv1d x86 strategy""" + layout = attrs.data_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + strategy = _op.OpStrategy() + if layout == "NCW": + 
strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.x86.schedule_conv1d_ncw), + name="conv1d_ncw.x86") + elif layout == "NWC": + strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.x86.schedule_conv1d_nwc), + name="conv1d_nwc.x86") + else: + raise ValueError("Unsupported conv1d layout {}".format(layout)) + return strategy + +@dense_strategy.register("cpu") +def dense_strategy_cpu(attrs, inputs, out_type, target): + """dense x86 strategy""" + strategy = _op.OpStrategy() + m, _ = inputs[0].shape + strategy.add_implementation(wrap_compute_dense(topi.x86.dense_nopack), + wrap_topi_schedule(topi.x86.schedule_dense_nopack), + name="dense_nopack.x86", + plevel=10) + if "cblas" in target.libs: + strategy.add_implementation(wrap_compute_dense(topi.x86.dense_cblas), + wrap_topi_schedule(topi.x86.schedule_dense_cblas), + name="dense_cblas.x86", + plevel=5) + with SpecializedCondition(m >= 16): + # this implementation may not be well-optimized, so use plevel=8 for now. + strategy.add_implementation(wrap_compute_dense(topi.x86.dense_pack), + wrap_topi_schedule(topi.x86.schedule_dense_pack), + name="dense_pack.x86", + plevel=8) + return strategy + +@batch_matmul_strategy.register("cpu") +def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): + """batch_matmul x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul), + wrap_topi_schedule(topi.x86.schedule_batch_matmul), + name="batch_matmul.x86", + plevel=10) + if "cblas" in target.libs: + strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul_cblas), + wrap_topi_schedule(topi.x86.schedule_batch_matmul_cblas), + name="batch_matmul_cblas.x86", + plevel=5) + return strategy + +@schedule_sparse_dense.register("cpu") +def schedule_sparse_dense_cpu(attrs, outs, target): + """schedule sparse_dense for x86""" + with target: + return topi.x86.schedule_sparse_dense(outs) + +@roi_align_strategy.register("cpu") +def roi_align_strategy_cpu(attrs, inputs, out_type, target): + """roi_align x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation(wrap_compute_roi_align(topi.x86.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.x86") + return strategy + +@bitserial_conv2d_strategy.register("cpu") +def bitserial_conv2d_strategy_cpu(attrs, inputs, out_type, target): + """bitserial_conv2d x86 strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nchw), + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nchw), + name="bitserial_conv2d_nchw.x86") + elif layout == "NHWC": + strategy.add_implementation( + wrap_compute_bitserial_conv2d(topi.x86.bitserial_conv2d_nhwc), + wrap_topi_schedule(topi.x86.schedule_bitserial_conv2d_nhwc), + name="bitserial_conv2d_nhwc.x86") + else: + raise ValueError("Data layout {} not supported.".format(layout)) + return strategy + +@bitserial_dense_strategy.register("cpu") +def bitserial_dense_strategy_cpu(attrs, inputs, out_type, target): + """bitserial_dense x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_bitserial_dense(topi.x86.bitserial_dense), + wrap_topi_schedule(topi.x86.schedule_bitserial_dense), + name="bitserial_dense.x86") + return strategy diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py 
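For readers new to this API: each target registers one strategy function per op; every add_implementation call contributes a (compute, schedule) pair, and among the applicable implementations the highest plevel wins. A minimal sketch of a third-party registration, assuming a hypothetical "mytarget" backend and reusing generic topi kernels (only the helper names come from this patch; everything else is illustrative):

    import topi
    from tvm.relay.op import op as _op
    from tvm.relay.op.strategy.generic import (
        dense_strategy, wrap_compute_dense, wrap_topi_schedule)

    @dense_strategy.register("mytarget")  # hypothetical target name
    def dense_strategy_mytarget(attrs, inputs, out_type, target):
        strategy = _op.OpStrategy()
        strategy.add_implementation(
            wrap_compute_dense(topi.nn.dense),
            wrap_topi_schedule(topi.generic.schedule_dense),
            name="dense.mytarget",
            plevel=10)  # beats any applicable implementation with a lower plevel
        return strategy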
index f35283961b27..16468e5eabc7 100644 --- a/python/tvm/relay/op/vision/_rcnn.py +++ b/python/tvm/relay/op/vision/_rcnn.py @@ -17,65 +17,27 @@ # pylint: disable=invalid-name, unused-argument """Faster R-CNN and Mask R-CNN operations.""" import topi -from topi.util import get_const_tuple, get_float_tuple, get_const_int +from topi.util import get_const_tuple from .. import op as reg +from .. import strategy from ..op import OpPattern - -@reg.register_compute("vision.roi_align") -def compute_roi_align(attrs, inputs, _, target): - """Compute definition of roi_align""" - assert attrs.layout == "NCHW" - return [topi.vision.rcnn.roi_align_nchw( - inputs[0], inputs[1], pooled_size=get_const_tuple(attrs.pooled_size), - spatial_scale=attrs.spatial_scale, sample_ratio=attrs.sample_ratio)] - -@reg.register_schedule("vision.roi_align") -def schedule_roi_align(_, outs, target): - """Schedule definition of roi_align""" - with target: - return topi.generic.vision.schedule_roi_align(outs) - +# roi_align +reg.register_strategy("vision.roi_align", strategy.roi_align_strategy) reg.register_pattern("vision.roi_align", OpPattern.OUT_ELEMWISE_FUSABLE) +# roi_pool @reg.register_compute("vision.roi_pool") -def compute_roi_pool(attrs, inputs, _, target): +def compute_roi_pool(attrs, inputs, _): """Compute definition of roi_pool""" assert attrs.layout == "NCHW" return [topi.vision.rcnn.roi_pool_nchw( inputs[0], inputs[1], pooled_size=get_const_tuple(attrs.pooled_size), spatial_scale=attrs.spatial_scale)] -@reg.register_schedule("vision.roi_pool") -def schedule_roi_pool(_, outs, target): - """Schedule definition of roi_pool""" - with target: - return topi.generic.vision.schedule_roi_pool(outs) - +reg.register_schedule("vision.roi_pool", strategy.schedule_roi_pool) reg.register_pattern("vision.roi_pool", OpPattern.OUT_ELEMWISE_FUSABLE) -@reg.register_compute("vision.proposal") -def compute_proposal(attrs, inputs, _, target): - """Compute definition of proposal""" - scales = get_float_tuple(attrs.scales) - ratios = get_float_tuple(attrs.ratios) - feature_stride = attrs.feature_stride - threshold = attrs.threshold - rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n - rpn_post_nms_top_n = attrs.rpn_post_nms_top_n - rpn_min_size = attrs.rpn_min_size - iou_loss = bool(get_const_int(attrs.iou_loss)) - with target: - return [ - topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2], scales, ratios, - feature_stride, threshold, rpn_pre_nms_top_n, - rpn_post_nms_top_n, rpn_min_size, iou_loss) - ] - -@reg.register_schedule("vision.proposal") -def schedule_proposal(_, outs, target): - """Schedule definition of proposal""" - with target: - return topi.generic.schedule_proposal(outs) - +# proposal +reg.register_strategy("vision.proposal", strategy.proposal_strategy) reg.register_pattern("vision.proposal", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 7de118071aa4..6e2008ad74c0 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -18,104 +18,25 @@ """Definition of vision ops""" from __future__ import absolute_import -import topi -from topi.util import get_const_int, get_const_float, get_float_tuple from .. import op as reg +from .. 
import strategy from ..op import OpPattern - -@reg.register_schedule("vision.multibox_prior") -def schedule_multibox_prior(_, outs, target): - """Schedule definition of multibox_prior""" - with target: - return topi.generic.schedule_multibox_prior(outs) - - -@reg.register_compute("vision.multibox_prior") -def compute_multibox_prior(attrs, inputs, _, target): - """Compute definition of multibox_prior""" - sizes = get_float_tuple(attrs.sizes) - ratios = get_float_tuple(attrs.ratios) - steps = get_float_tuple(attrs.steps) - offsets = get_float_tuple(attrs.offsets) - clip = bool(get_const_int(attrs.clip)) - return [ - topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps, - offsets, clip) - ] - - +# multibox_prior +reg.register_strategy("vision.multibox_prior", strategy.multibox_prior_strategy) reg.register_pattern("vision.multibox_prior", OpPattern.OPAQUE) # multibox_transform_loc -@reg.register_schedule("vision.multibox_transform_loc") -def schedule_multibox_transform_loc(_, outs, target): - """Schedule definition of multibox_detection""" - with target: - return topi.generic.schedule_multibox_transform_loc(outs) - - -@reg.register_compute("vision.multibox_transform_loc") -def compute_multibox_transform_loc(attrs, inputs, _, target): - """Compute definition of multibox_detection""" - clip = bool(get_const_int(attrs.clip)) - threshold = get_const_float(attrs.threshold) - variances = get_float_tuple(attrs.variances) - return topi.vision.ssd.multibox_transform_loc( - inputs[0], inputs[1], inputs[2], clip, threshold, variances) - - +reg.register_strategy("vision.multibox_transform_loc", strategy.multibox_transform_loc_strategy) reg.register_pattern("vision.multibox_transform_loc", OpPattern.OPAQUE) -reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE) # Get counts of valid boxes -@reg.register_schedule("vision.get_valid_counts") -def schedule_get_valid_counts(_, outs, target): - """Schedule definition of get_valid_counts""" - with target: - return topi.generic.schedule_get_valid_counts(outs) - - -@reg.register_compute("vision.get_valid_counts") -def compute_get_valid_counts(attrs, inputs, _, target): - """Compute definition of get_valid_counts""" - score_threshold = get_const_float(attrs.score_threshold) - id_index = get_const_int(attrs.id_index) - score_index = get_const_int(attrs.score_index) - return topi.vision.get_valid_counts(inputs[0], score_threshold, - id_index, score_index) - +reg.register_strategy("vision.get_valid_counts", strategy.get_valid_counts_strategy) reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE) # non-maximum suppression -@reg.register_schedule("vision.non_max_suppression") -def schedule_nms(_, outs, target): - """Schedule definition of nms""" - with target: - return topi.generic.schedule_nms(outs) - - -@reg.register_compute("vision.non_max_suppression") -def compute_nms(attrs, inputs, _, target): - """Compute definition of nms""" - return_indices = bool(get_const_int(attrs.return_indices)) - max_output_size = get_const_int(attrs.max_output_size) - iou_threshold = get_const_float(attrs.iou_threshold) - force_suppress = bool(get_const_int(attrs.force_suppress)) - top_k = get_const_int(attrs.top_k) - coord_start = get_const_int(attrs.coord_start) - score_index = get_const_int(attrs.score_index) - id_index = get_const_int(attrs.id_index) - invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) - return [ - topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, - iou_threshold, force_suppress, top_k, - 
coord_start, score_index, id_index, - return_indices, invalid_to_bottom) - ] - - +reg.register_strategy("vision.non_max_suppression", strategy.nms_strategy) reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/_yolo.py b/python/tvm/relay/op/vision/_yolo.py index 32fc62d5c23a..c58a7a367549 100644 --- a/python/tvm/relay/op/vision/_yolo.py +++ b/python/tvm/relay/op/vision/_yolo.py @@ -17,9 +17,9 @@ #pylint: disable=invalid-name, unused-argument """Backend compiler related feature registration""" from __future__ import absolute_import -from ..op import register_schedule, register_pattern -from ..op import schedule_injective, OpPattern +from ..op import register_pattern, OpPattern +from ..op import register_injective_schedule # reorg register_pattern("vision.yolo_reorg", OpPattern.INJECTIVE) -register_schedule("vision.yolo_reorg", schedule_injective) +register_injective_schedule("vision.yolo_reorg") diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index ba100d8d03e4..b77516de6839 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -31,7 +31,7 @@ @_reg.register_compute("relay.op.annotation.simulated_quantize") -def simulated_quantize_compute(attrs, inputs, out_type, target): +def simulated_quantize_compute(attrs, inputs, out_type): """Compiler for simulated_quantize.""" assert len(inputs) == 4 assert attrs.sign @@ -52,11 +52,10 @@ def simulated_quantize_compute(attrs, inputs, out_type, target): return [rdata] -_reg.register_schedule("relay.op.annotation.simulated_quantize", - _reg.schedule_injective) +_reg.register_injective_schedule("relay.op.annotation.simulated_quantize") _reg.register_pattern("relay.op.annotation.simulated_quantize", _reg.OpPattern.ELEMWISE) -_reg.register_schedule("annotation.cast_hint", _reg.schedule_injective) +_reg.register_injective_schedule("annotation.cast_hint") @register_relay_node diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py index 9aaefdfdb02d..d5a4d5f1e08f 100644 --- a/python/tvm/relay/testing/mobilenet.py +++ b/python/tvm/relay/testing/mobilenet.py @@ -44,15 +44,18 @@ def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1), def separable_conv_block(data, name, depthwise_channels, pointwise_channels, kernel_size=(3, 3), downsample=False, padding=(1, 1), - epsilon=1e-5, layout='NCHW'): + epsilon=1e-5, layout='NCHW', dtype="float32"): """Helper function to get a separable conv block""" if downsample: strides = (2, 2) else: strides = (1, 1) # depthwise convolution + bn + relu + wshape = (depthwise_channels, 1) + kernel_size + weight = relay.var(name + "_weight", shape=wshape, dtype=dtype) conv1 = layers.conv2d( data=data, + weight=weight, channels=depthwise_channels, groups=depthwise_channels, kernel_size=kernel_size, @@ -85,38 +88,41 @@ def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224), body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2), layout=layout) body = separable_conv_block(body, 'separable_conv_block_1', - int(32*alpha), int(64*alpha), layout=layout) + int(32*alpha), int(64*alpha), layout=layout, + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_2', int(64*alpha), int(128*alpha), downsample=True, - layout=layout) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_3', - int(128*alpha), int(128*alpha), layout=layout) + int(128*alpha), int(128*alpha), layout=layout, + 
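The registration hunks above all follow the same migration: ops with real kernels now register a strategy, while purely injective ops just declare that the shared injective schedule applies. A sketch of the new pattern for a hypothetical op "vision.my_op" (my_op_strategy is assumed, not part of this patch):

    from tvm.relay.op import op as reg
    from tvm.relay.op import strategy

    # compute and schedule now travel together in the strategy:
    reg.register_strategy("vision.my_op", strategy.my_op_strategy)  # hypothetical
    reg.register_pattern("vision.my_op", reg.OpPattern.OPAQUE)

    # injective ops no longer need a dedicated schedule function:
    reg.register_injective_schedule("vision.my_injective_op")       # hypothetical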
dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_4', int(128*alpha), int(256*alpha), downsample=True, - layout=layout) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_5', - int(256*alpha), int(256*alpha), layout=layout) + int(256*alpha), int(256*alpha), layout=layout, + dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_6', int(256*alpha), int(512*alpha), downsample=True, - layout=layout) + layout=layout, dtype=dtype) if is_shallow: body = separable_conv_block(body, 'separable_conv_block_7', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_8', int(1024*alpha), int(1024*alpha), - downsample=True, layout=layout) + downsample=True, layout=layout, dtype=dtype) else: for i in range(7, 12): body = separable_conv_block(body, 'separable_conv_block_%d' % i, int(512*alpha), int(512*alpha), - layout=layout) + layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_12', int(512*alpha), int(1024*alpha), - downsample=True, layout=layout) + downsample=True, layout=layout, dtype=dtype) body = separable_conv_block(body, 'separable_conv_block_13', int(1024*alpha), int(1024*alpha), - layout=layout) + layout=layout, dtype=dtype) pool = relay.nn.global_avg_pool2d(data=body, layout=layout) flatten = relay.nn.batch_flatten(data=pool) weight = relay.var('fc_weight') diff --git a/python/tvm/target/generic_func.py b/python/tvm/target/generic_func.py index 13f280a5ab1a..1936ff1511be 100644 --- a/python/tvm/target/generic_func.py +++ b/python/tvm/target/generic_func.py @@ -184,6 +184,7 @@ def dispatch_func(func, *args, **kwargs): fresult = decorate(fdefault, dispatch_func) fresult.fdefault = fdefault fresult.register = register + fresult.generic_func_node = generic_func_node return fresult return fdecorate @@ -268,4 +269,5 @@ def dispatch_func(func, *args, **kwargs): fdecorate = decorate(fdefault, dispatch_func) fdecorate.register = register fdecorate.fdefault = fdefault + fdecorate.dispatch_dict = dispatch_dict return fdecorate diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py index 1580da369c33..5970315e854b 100644 --- a/python/tvm/te/__init__.py +++ b/python/tvm/te/__init__.py @@ -23,8 +23,8 @@ from tvm.tir import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod from tvm.tir import comm_reducer, min, max, sum -from .schedule import Schedule, create_schedule -from .tensor import Tensor +from .schedule import Schedule, create_schedule, SpecializedCondition +from .tensor import TensorSlice, Tensor from .tensor_intrin import decl_tensor_intrin from .tag import tag_scope from .operation import placeholder, compute, scan, extern, var, size_var diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py index d160f78d7c89..f8bbe09725f2 100644 --- a/python/tvm/te/schedule.py +++ b/python/tvm/te/schedule.py @@ -517,4 +517,39 @@ def opengl(self): _ffi_api.StageOpenGL(self) +@tvm._ffi.register_object +class SpecializedCondition(Object): + """Specialized condition to enable op specialization.""" + def __init__(self, conditions): + """Create a specialized condition. + + .. note:: + Conditions are represented in conjunctive joint form (CNF). + Each condition should be a simple expression, e.g., n > 16, + m % 8 == 0, etc., where n, m are tvm.Var that represents a + dimension in the tensor shape. 
+
+        Parameters
+        ----------
+        conditions : List of tvm.Expr
+            List of conditions in conjunctive normal form (CNF).
+        """
+        if not isinstance(conditions, (list, _container.Array)):
+            conditions = [conditions]
+        self.__init_handle_by_constructor__(
+            _ffi_api.CreateSpecializedCondition, conditions)
+
+    @staticmethod
+    def current():
+        """Returns the current specialized condition"""
+        return _ffi_api.GetCurrentSpecialization()
+
+    def __enter__(self):
+        _ffi_api.EnterSpecializationScope(self)
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        _ffi_api.ExitSpecializationScope(self)
+
+
 tvm._ffi._init_api("schedule", __name__)
diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py
index ca11ffc76ea5..acf5f51941dc 100644
--- a/python/tvm/tir/expr.py
+++ b/python/tvm/tir/expr.py
@@ -964,3 +964,11 @@ class Let(PrimExprWithOp):
     def __init__(self, var, value, body):
         self.__init_handle_by_constructor__(
             _ffi_api.Let, var, value, body)
+
+
+@tvm._ffi.register_object
+class Any(PrimExpr):
+    """Any node.
+    """
+    def __init__(self):
+        self.__init_handle_by_constructor__(_ffi_api.Any)
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index e5629e8f3505..d0a7da9f1ba9 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -47,11 +47,19 @@ namespace tvm {
 namespace relay {
 
+TVM_REGISTER_NODE_TYPE(LoweredOutputNode);
 TVM_REGISTER_NODE_TYPE(CachedFuncNode);
 TVM_REGISTER_NODE_TYPE(CCacheKeyNode);
 TVM_REGISTER_NODE_TYPE(CCacheValueNode);
 TVM_REGISTER_OBJECT_TYPE(CompileEngineNode);
 
+LoweredOutput::LoweredOutput(tvm::Array<te::Tensor> outputs, OpImplementation impl) {
+  auto n = make_object<LoweredOutputNode>();
+  n->outputs = std::move(outputs);
+  n->implementation = std::move(impl);
+  data_ = std::move(n);
+}
+
 CCacheKey CCacheKeyNode::make(Function source_func, Target target) {
   auto n = make_object<CCacheKeyNode>();
   n->source_func = std::move(source_func);
@@ -108,9 +116,7 @@ class ScheduleGetter :
   explicit ScheduleGetter(Target target)
       : target_(target), device_copy_op_(Op::Get("device_copy")) {}
 
-  std::pair<te::Schedule, CachedFunc> Create(const Function& prim_func) {
-    static auto fschedule =
-        Op::GetAttr<FTVMSchedule>("FTVMSchedule");
+  CachedFunc Create(const Function& prim_func) {
     auto cache_node = make_object<CachedFuncNode>();
     cache_node->target = target_;
     for (Var param : prim_func->params) {
@@ -147,7 +153,6 @@ class ScheduleGetter :
     }
     cache_node->func_name = candidate_name;
 
-    CachedFunc cfunc(cache_node);
     CHECK(master_op_.defined());
     // Fusion over tupled results may leave identity relationships
     // between inputs and outputs, and those should not be scheduled.
@@ -161,15 +166,16 @@ class ScheduleGetter :
     te::Schedule schedule;
     // No need to register schedule for device copy op.
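Since SpecializedCondition is new in this patch, a short usage sketch may help (variable names illustrative); it mirrors the `with SpecializedCondition(m >= 16)` block in dense_strategy_cpu earlier in this diff:

    import tvm
    from tvm import te

    m = te.var("m")
    with te.SpecializedCondition(m >= 16):
        # Strategy implementations added in this scope are grouped under an
        # OpSpecialization whose condition is `m >= 16`.
        assert te.SpecializedCondition.current() is not None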
if (master_attrs_.as() == nullptr) { - schedule = - fschedule[master_op_](master_attrs_, tensor_outs, target_); + CHECK(master_implementation_.defined()); + schedule = master_implementation_.Schedule(master_attrs_, tensor_outs, target_); for (const auto& scalar : scalars_) { if (schedule->Contain(scalar)) { schedule[scalar].compute_inline(); } } } - return std::make_pair(schedule, cfunc); + cache_node->schedule = std::move(schedule); + return CachedFunc(cache_node); } Array VisitExpr(const Expr& expr) { @@ -208,16 +214,16 @@ class ScheduleGetter : LOG(FATAL) << "not handled"; return tvm::PrimExpr(); } - }, "compile_engine_const", topi::kBroadcast); + }, "compile_engine_const", topi::kBroadcast); scalars_.push_back(value->op); return {value}; } Array VisitExpr_(const CallNode* call_node) final { - static auto fcompute = - Op::GetAttr("FTVMCompute"); static auto fpattern = Op::GetAttr("TOpPattern"); + static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); + CHECK(flower_call) << "relay.backend.lower_call is not registered."; Array inputs; int count_tuple = 0; @@ -231,51 +237,37 @@ class ScheduleGetter : } if (count_tuple) { CHECK_EQ(call_node->args.size(), 1U) - << "Only allow function with a single tuple input"; - } - - // Prepare the call_node->checked_type(). For the call node inputs, we ensure that the shape is - // Int32. Following code ensures the same for the output as well. - // TODO(@icemelon): Support recursive tuple - Type call_node_type = call_node->checked_type(); - if (const auto* tt = call_node->checked_type().as()) { - call_node_type = TensorType(GetShape(tt->shape), tt->dtype); - } else if (const auto* tuple_t = call_node->checked_type().as()) { - std::vector new_fields; - for (auto field : tuple_t->fields) { - if (const auto* tt = field.as()) { - new_fields.push_back(TensorType(GetShape(tt->shape), tt->dtype)); - } else { - new_fields.push_back(field); - } - } - call_node_type = TupleType(new_fields); + << "Only allow function with a single tuple input"; } CHECK(call_node->op.as()) - << "Primitive function only allows call into primitive ops"; + << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); + Array outputs; + OpImplementation impl; // Skip fcompute for device copy operators as it is not registered. 
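The contract introduced in this hunk: rather than looking up FTVMCompute directly, the engine invokes the packed function "relay.backend.lower_call", which must return a LoweredOutput pairing the output tensors with the OpImplementation that produced them. The Python half is not part of this hunk; a sketch of the plumbing, using only globals registered elsewhere in this patch:

    import tvm

    make_lowered_output = tvm.get_global_func("relay.backend._make_LoweredOutput")
    impl_compute = tvm.get_global_func("relay.op._OpImplementationCompute")

    def lower_call_sketch(impl, attrs, inputs, out_type):
        # `impl` would be chosen from the op's strategy; the real logic lives
        # in relay.backend.compile_engine and also handles AutoTVM.
        outputs = impl_compute(impl, attrs, inputs, out_type)
        return make_lowered_output(outputs, impl)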
if (op == device_copy_op_) { const auto* copy_input = inputs[0].operator->(); outputs.push_back(te::TensorNode::make(copy_input->shape, copy_input->dtype, te::Operation(), 0)); } else { - outputs = fcompute[op](call_node->attrs, inputs, - call_node_type, target_); + LoweredOutput lowered_out = (*flower_call)(GetRef(call_node), inputs, target_); + outputs = lowered_out->outputs; + impl = lowered_out->implementation; } int op_pattern = fpattern[op]; if (op_pattern >= kCommReduce) { CHECK(!master_op_.defined() || master_op_pattern_ < kCommReduce) - << "Two complicated op in a primitive function " - << " master=" << master_op_ << " current=" << op; + << "Two complicated op in a primitive function " + << " master=" << master_op_ << " current=" << op; } if (op_pattern >= master_op_pattern_) { master_op_ = op; master_attrs_ = call_node->attrs; master_op_pattern_ = op_pattern; + master_implementation_ = impl; } if (outputs.size() != 1) { const auto* tuple_type = @@ -332,6 +324,7 @@ class ScheduleGetter : Op master_op_; Attrs master_attrs_; int master_op_pattern_{0}; + OpImplementation master_implementation_; std::ostringstream readable_name_stream_; std::unordered_map, ObjectHash, ObjectEqual> memo_; Array scalars_; @@ -677,8 +670,7 @@ class CompileEngineImpl : public CompileEngineNode { * \return Pair of schedule and cache. * The funcs field in cache is not yet populated. */ - std::pair CreateSchedule( - const Function& source_func, const Target& target) { + CachedFunc CreateSchedule(const Function& source_func, const Target& target) { return ScheduleGetter(target).Create(source_func); } @@ -713,9 +705,9 @@ class CompileEngineImpl : public CompileEngineNode { With target_scope(key->target); CHECK(!value->cached_func.defined()); - auto spair = CreateSchedule(key->source_func, key->target); + auto cfunc = CreateSchedule(key->source_func, key->target); auto cache_node = make_object( - *(spair.second.operator->())); + *(cfunc.operator->())); // Skip lowering for device copy node. const Expr body = (key->source_func)->body; @@ -735,11 +727,12 @@ class CompileEngineImpl : public CompileEngineNode { // lower the function if (const auto* f = runtime::Registry::Get("relay.backend.lower")) { cache_node->funcs = (*f)( - spair.first, all_args, cache_node->func_name, key->source_func); + cfunc->schedule, all_args, cache_node->func_name, key->source_func); } else { tvm::BuildConfig bcfg = BuildConfig::Create(); std::unordered_map binds; - cache_node->funcs = tvm::lower(spair.first, all_args, cache_node->func_name, binds, bcfg); + cache_node->funcs = tvm::lower(cfunc->schedule, all_args, cache_node->func_name, + binds, bcfg); } value->cached_func = CachedFunc(cache_node); return value; @@ -820,6 +813,11 @@ const CompileEngine& CompileEngine::Global() { return *inst; } +TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") +.set_body_typed([](tvm::Array outputs, OpImplementation impl) { + return LoweredOutput(outputs, impl); +}); + TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey") .set_body_typed(CCacheKeyNode::make); diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index 15ec2d6bd0f1..2dbacf645482 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,28 @@ enum ShapeFuncParamState { kNeedBoth = 3, }; +struct LoweredOutputNode : public Object { + /*! \brief The outputs to the function */ + tvm::Array outputs; + /*! 
\brief The implementation used to compute the output */ + OpImplementation implementation; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("outputs", &outputs); + v->Visit("implementation", &implementation); + } + + static constexpr const char* _type_key = "relay.LoweredOutput"; + TVM_DECLARE_FINAL_OBJECT_INFO(LoweredOutputNode, Object); +}; + +class LoweredOutput : public ObjectRef { + public: + TVM_DLL LoweredOutput(tvm::Array outputs, OpImplementation impl); + + TVM_DEFINE_OBJECT_REF_METHODS(LoweredOutput, ObjectRef, LoweredOutputNode); +}; + /*! \brief Node container to represent a cached function. */ struct CachedFuncNode : public Object { /* \brief compiled target */ @@ -54,6 +77,8 @@ struct CachedFuncNode : public Object { tvm::Array inputs; /* \brief The outputs to the function */ tvm::Array outputs; + /*! \brief The schedule to the function */ + te::Schedule schedule; /*! \brief The lowered functions to support the function. */ tvm::Array funcs; /*! \brief Parameter usage states in the shape function. */ @@ -64,6 +89,7 @@ struct CachedFuncNode : public Object { v->Visit("func_name", &func_name); v->Visit("inputs", &inputs); v->Visit("outputs", &outputs); + v->Visit("schedule", &schedule); v->Visit("funcs", &funcs); v->Visit("shape_func_param_states", &shape_func_param_states); } diff --git a/src/relay/ir/op_strategy.cc b/src/relay/ir/op_strategy.cc new file mode 100644 index 000000000000..5ce609104395 --- /dev/null +++ b/src/relay/ir/op_strategy.cc @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/tvm/relay/ir/op_strategy.cc + * \brief The Relay operator Strategy and related data structure. 
+ */
+
+#include <tvm/relay/op_strategy.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(OpImplementationNode);
+TVM_REGISTER_NODE_TYPE(OpSpecializationNode);
+TVM_REGISTER_NODE_TYPE(OpStrategyNode);
+
+Array<te::Tensor> OpImplementation::Compute(const Attrs& attrs,
+                                            const Array<te::Tensor>& inputs,
+                                            const Type& out_type) {
+  return (*this)->fcompute(attrs, inputs, out_type);
+}
+
+te::Schedule OpImplementation::Schedule(const Attrs& attrs,
+                                        const Array<te::Tensor>& outs,
+                                        const Target& target) {
+  return (*this)->fschedule(attrs, outs, target);
+}
+
+void OpSpecialization::AddImplementation(tvm::relay::FTVMCompute fcompute,
+                                         tvm::relay::FTVMSchedule fschedule,
+                                         std::string name,
+                                         int plevel) {
+  auto n = make_object<OpImplementationNode>();
+  n->fcompute = fcompute;
+  n->fschedule = fschedule;
+  n->name = std::move(name);
+  n->plevel = plevel;
+  (*this)->implementations.push_back(OpImplementation(n));
+}
+
+void OpStrategy::AddImplementation(FTVMCompute fcompute,
+                                   FTVMSchedule fschedule,
+                                   std::string name,
+                                   int plevel) {
+  auto curr_cond = te::SpecializedCondition::Current();
+  auto self = this->operator->();
+  Array<OpSpecialization> specializations = self->specializations;
+  for (OpSpecialization op_spec : specializations) {
+    if (op_spec->condition == curr_cond) {
+      op_spec.AddImplementation(fcompute, fschedule, std::move(name), plevel);
+      return;
+    }
+  }
+  ObjectPtr<OpSpecializationNode> n = make_object<OpSpecializationNode>();
+  n->condition = curr_cond;
+  OpSpecialization op_spec(n);
+  op_spec.AddImplementation(fcompute, fschedule, std::move(name), plevel);
+  self->specializations.push_back(op_spec);
+}
+
+TVM_REGISTER_GLOBAL("relay.op._OpImplementationCompute")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  OpImplementation imp = args[0];
+  Attrs attrs = args[1];
+  Array<te::Tensor> inputs = args[2];
+  Type out_type = args[3];
+  *rv = imp.Compute(attrs, inputs, out_type);
+});
+
+TVM_REGISTER_GLOBAL("relay.op._OpImplementationSchedule")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  OpImplementation imp = args[0];
+  Attrs attrs = args[1];
+  Array<te::Tensor> outs = args[2];
+  Target target = args[3];
+  *rv = imp.Schedule(attrs, outs, target);
+});
+
+TVM_REGISTER_GLOBAL("relay.op._make.OpStrategy")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  ObjectPtr<OpStrategyNode> n = make_object<OpStrategyNode>();
+  *rv = OpStrategy(n);
+});
+
+TVM_REGISTER_GLOBAL("relay.op._OpStrategyAddImplementation")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  OpStrategy strategy = args[0];
+  FTVMCompute compute = args[1];
+  FTVMSchedule schedule = args[2];
+  std::string name = args[3];
+  int plevel = args[4];
+  strategy.AddImplementation(compute, schedule, name, plevel);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc
index 6106b07f543b..36f592355a2c 100644
--- a/src/relay/op/annotation/annotation.cc
+++ b/src/relay/op/annotation/annotation.cc
@@ -79,7 +79,7 @@ TVM_ADD_FILELINE)
 .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .set_attr<FTVMCompute>("FTVMCompute",
                        [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                          const Type& out_dtype, const Target& target) -> Array<te::Tensor> {
+                          const Type& out_dtype) -> Array<te::Tensor> {
                          return {topi::identity(inputs[0])};
                        });
 
@@ -105,7 +105,7 @@ TVM_ADD_FILELINE)
 .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
 .set_attr<FTVMCompute>("FTVMCompute",
                        [](const Attrs& attrs, const Array<te::Tensor>& inputs,
-                          const Type& out_dtype, const Target& target) -> Array<te::Tensor> {
+                          const Type& out_dtype) -> Array<te::Tensor> {
                          return {topi::identity(inputs[0])};
                        });
 
@@ -123,7 +123,7 @@ Mark the start of bitpacking.
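To summarize the data model implemented above: an OpStrategy owns OpSpecializations, one per specialized condition (plus one with a null condition), and each specialization owns OpImplementations carrying fcompute, fschedule, name and plevel. A naive selection over the FFI-exposed fields might look like this (sketch only; it ignores whether a specialized condition actually holds):

    def pick_best_impl(strategy):
        best = None
        for spec in strategy.specializations:
            for impl in spec.implementations:
                if best is None or impl.plevel > best.plevel:
                    best = impl
        return best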
ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -140,7 +140,7 @@ Mark the end of bitpacking. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -163,7 +163,7 @@ Mark a checkpoint for checkpointing memory optimization. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { Array outputs; for (size_t i = 0; i < inputs.size(); ++i) { outputs.push_back(topi::identity(inputs[i])); @@ -184,7 +184,7 @@ Beginning of a region that is handled by a given compiler. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -209,7 +209,7 @@ End of a region that is handled by a given compiler. ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc index 14c0a01576d5..a0f7fbf4cfeb 100644 --- a/src/relay/op/debug.cc +++ b/src/relay/op/debug.cc @@ -36,9 +36,8 @@ namespace relay { TVM_REGISTER_NODE_TYPE(DebugAttrs); Array DebugCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return Array{ topi::identity(inputs[0]) }; } diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index 076e3fcb0dbb..d15099b6b451 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -83,7 +83,7 @@ RELAY_REGISTER_OP("memory.alloc_storage") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -179,7 +179,7 @@ RELAY_REGISTER_OP("memory.alloc_tensor") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -228,7 +228,7 @@ RELAY_REGISTER_OP("memory.invoke_tvm_op") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -252,7 +252,7 @@ RELAY_REGISTER_OP("memory.kill") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); @@ -340,7 +340,7 @@ RELAY_REGISTER_OP("memory.shape_func") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const 
Array& inputs, - const Type& out_dtype, const Target& target) -> Array { + const Type& out_dtype) -> Array { return {topi::identity(inputs[0])}; }); diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index 6977ac9b8575..cd9b5ddc7fbf 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -735,58 +735,6 @@ weight transformation in advance. .add_type_rel("Conv2DWinogradWeightTransform", Conv2DWinogradWeightTransformRel); -// Positional relay function to create conv2d winograd nnpack operator -// used by frontend FFI. -Expr MakeConv2DWinogradNNPACK(Expr data, - Expr weight, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_conv2d_winograd_nnpack_without_weight_transform"); - return CallNode::make(op, {data, weight}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_winograd_nnpack_without_weight_transform") -.set_body_typed(MakeConv2DWinogradNNPACK); - -RELAY_REGISTER_OP("nn.contrib_conv2d_winograd_nnpack_without_weight_transform") -.describe(R"code(Compute conv2d with winograd nnpack. Only supports NCHW layout. - This operator assumes the weight tensor is already pre-transformed by - nn.contrib_conv2d_winograd_nnpack_weight_transform. - -- **data**: Input is 4D array of shape (batch_size, in_channels, height, width) -- **weight**: Any shape - We do not check the shape for this input tensor. Since different backend - has different layout strategy. - -- **out**: Output is 4D array of shape (batch_size, channels, out_height, out_width) -)code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv2DWinogradNNPACKRel", Conv2DWinogradRel) -.set_attr("FInferCorrectLayout", ConvInferCorrectLayout); - // relay.nn.contrib_conv2d_winograd_nnpack_weight_transform TVM_REGISTER_NODE_TYPE(Conv2DWinogradNNPACKWeightTransformAttrs); @@ -848,55 +796,6 @@ weight transformation in advance. .set_support_level(10) .add_type_rel("Conv2DWinogradNNPACKWeightTransform", Conv2DWinogradNNPACKWeightTransformRel); -// Positional relay function to create conv2d NCHWc operator -// used by frontend FFI. 
-Expr MakeConv2DNCHWcInt8(Expr data, - Expr kernel, - Array strides, - Array padding, - Array dilation, - int groups, - IndexExpr channels, - Array kernel_size, - std::string data_layout, - std::string kernel_layout, - std::string out_layout, - DataType out_dtype) { - auto attrs = make_object(); - attrs->strides = std::move(strides); - attrs->padding = std::move(padding); - attrs->dilation = std::move(dilation); - attrs->groups = groups; - attrs->channels = channels; - attrs->kernel_size = std::move(kernel_size); - attrs->data_layout = std::move(data_layout); - attrs->kernel_layout = std::move(kernel_layout); - attrs->out_layout = std::move(out_layout); - attrs->out_dtype = std::move(out_dtype); - static const Op& op = Op::Get("nn.contrib_conv2d_NCHWc_int8"); - return CallNode::make(op, {data, kernel}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_conv2d_NCHWc_int8") -.set_body_typed(MakeConv2DNCHWcInt8); - - -RELAY_REGISTER_OP("nn.contrib_conv2d_NCHWc_int8") -.describe(R"code(Compute conv2d with NCHWc data layout with int8 inputs. -- **data**: Input is 5D packed tensor. -- **weight**: 7D packed tensor. - -- **out**: Output is 5D packed tensor -)code" TVM_ADD_FILELINE) -.set_attrs_type() -.set_num_inputs(2) -.add_argument("data", "Tensor", "The input tensor.") -.add_argument("weight", "Tensor", "The weight tensor.") -.set_support_level(10) -.add_type_rel("Conv2DNCHWcInt8", Conv2DWinogradRel) -.set_attr("FInferCorrectLayout", - ConvInferCorrectLayout); - // Positional relay function to create conv2d NCHWc operator // used by frontend FFI. Expr MakeConv2DNCHWc(Expr data, diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index 40619091656f..9ee84a0332bb 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -153,6 +153,16 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, << " But got " << out_layout; Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); + bool is_depthwise = false; + if (param->groups > 1) { + CHECK(weight && weight->shape.defined()) << + "Weight shape must be specified when groups is greater than 1."; + Array wshape_oihw = trans_kernel_layout.ForwardShape(weight->shape); + if (tvm::tir::Equal(param->groups, dshape_nchw[1]) && + tvm::tir::Equal(param->groups, wshape_oihw[0])) { + is_depthwise = true; + } + } IndexExpr channels, dilated_ksize_y, dilated_ksize_x; // infer weight if the kernel_size and channels are defined @@ -161,9 +171,9 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, CHECK_EQ(param->dilation.size(), 2); Array wshape; - if (tvm::tir::Equal(param->channels, param->groups) && !tvm::tir::Equal(param->channels, 1)) { + if (is_depthwise) { // infer weight's shape for depthwise convolution - wshape = {{dshape_nchw[1], indexdiv(param->groups, dshape_nchw[1]), param->kernel_size[0], + wshape = {{dshape_nchw[1], indexdiv(param->channels, dshape_nchw[1]), param->kernel_size[0], param->kernel_size[1]}}; } else { wshape = {{param->channels, indexdiv(dshape_nchw[1], param->groups), param->kernel_size[0], diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index ee4471a85c17..10fd4d975ce4 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -93,8 +93,9 @@ RELAY_REGISTER_OP("nn.bias_add") .add_argument("bias", "1D Tensor", "Bias.") .set_support_level(1) .add_type_rel("BiasAdd", BiasAddRel) -.set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, const Target& target) { 
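The convolution.h hunk above tightens depthwise detection: a conv2d is treated as depthwise only when groups equals both the input channel count and the kernel's O dimension, not merely when channels == groups. Concretely, groups=8 over 16 input channels with 8 output channels used to take the depthwise weight-inference path and produced a nonsensical kernel shape; it now infers an ordinary grouped kernel (API spellings as of this revision are an assumption):

    import tvm
    from tvm import relay

    data = relay.var("data", shape=(1, 16, 32, 32))  # NCHW
    conv = relay.nn.conv2d(data, relay.var("weight"),
                           kernel_size=(3, 3), padding=(1, 1),
                           groups=8, channels=8)
    func = relay.Function(relay.analysis.free_vars(conv), conv)
    mod = relay.transform.InferType()(tvm.IRModule.from_expr(func))
    # weight is inferred as (8, 2, 3, 3) in OIHW, not as a depthwise kernel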
+.set_attr("FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); return tvm::Array{topi::nn::bias_add(inputs[0], inputs[1], param->axis)}; }); @@ -234,8 +235,7 @@ RELAY_REGISTER_OP("nn.leaky_relu") .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); return Array{ topi::leaky_relu(inputs[0], param->alpha) }; }); @@ -315,8 +315,7 @@ where :math:`*` is an channelwise multiplication for each sample in the batch. .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); return Array{ topi::prelu(inputs[0], inputs[1], param->axis)}; }); @@ -351,8 +350,7 @@ RELAY_REGISTER_OP("nn.softmax") .add_type_rel("Identity", IdentityRel) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ topi::nn::softmax(inputs[0], param->axis) }; @@ -385,8 +383,7 @@ RELAY_REGISTER_OP("nn.log_softmax") .add_type_rel("Identity", IdentityRel) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); CHECK(param->axis == -1 || param->axis == static_cast(inputs[0].ndim()) - 1) @@ -462,8 +459,7 @@ Example:: .set_attr( "FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { return Array{ topi::nn::flatten(inputs[0]) }; }); @@ -489,8 +485,7 @@ RELAY_REGISTER_OP("nn.relu") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { return Array{ topi::relu(inputs[0], 0.0f) }; }); diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index 94602ec9a61a..84a49403e837 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -161,9 +161,8 @@ bool PadRel(const Array& types, } Array PadCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 6775b09e8aa9..77baae567ab6 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -164,9 +164,8 @@ bool Pool2DRel(const Array& types, template Array Pool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -331,9 +330,8 @@ bool GlobalPool2DRel(const Array& types, template Array GlobalPool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -465,9 +463,8 @@ bool AdaptivePool2DRel(const Array& types, template Array AdaptivePool2DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const 
Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -593,8 +590,9 @@ bool Pool2DGradRel(const Array& types, int num_inputs, const Attrs& attrs, } template -Array Pool2DGradCompute(const Attrs& attrs, const Array& inputs, - const Type& out_type, const Target& target) { +Array Pool2DGradCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type) { static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -793,9 +791,8 @@ bool Pool1DRel(const Array& types, template Array Pool1DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCW("NCW"); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -985,9 +982,8 @@ bool Pool3DRel(const Array& types, template Array Pool3DCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { static const Layout kNCDHW("NCDHW"); const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc index d1b915cfa142..58221ae66f6e 100644 --- a/src/relay/op/tensor/binary.cc +++ b/src/relay/op/tensor/binary.cc @@ -32,9 +32,8 @@ namespace relay { #define RELAY_BINARY_COMPUTE(FTOPI) \ [] (const Attrs& attrs, \ - const Array& inputs, \ - const Type& out_type, \ - const Target& target) -> Array { \ + const Array& inputs, \ + const Type& out_type) -> Array { \ CHECK_EQ(inputs.size(), 2U); \ return {FTOPI(inputs[0], inputs[1])}; \ } \ diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index acbde0d6e28b..5e0795eaa60b 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -176,7 +176,6 @@ template Array ReduceCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, - const Target& target, F f) { const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); @@ -321,10 +320,9 @@ bool ReduceRel(const Array& types, Array ArgMaxCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::argmax); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::argmax); } @@ -341,10 +339,9 @@ values over a given axis. Array ArgMinCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::argmin); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::argmin); } RELAY_REGISTER_REDUCE_OP("argmin") @@ -359,10 +356,9 @@ values over a given axis. 
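Every FTVMCompute body in these files moves to the same three-argument form; the Target parameter is dropped because per-target dispatch now happens in the strategy layer. Registering a compute from Python changes accordingly, matching the simulated_quantize change earlier in this patch (hypothetical op name; topi.identity stands in for a real kernel):

    import topi
    from tvm.relay.op import op as reg

    @reg.register_compute("my.custom_op")  # hypothetical op
    def my_custom_op_compute(attrs, inputs, out_type):  # no `target` parameter
        return [topi.identity(inputs[0])]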
.set_attr("TOpPattern", kCommReduce); Array SumCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::sum); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::sum); } @@ -393,10 +389,9 @@ Example:: Array AllCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::all); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::all); } @@ -430,10 +425,9 @@ Example:: Array AnyCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::any); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::any); } @@ -467,10 +461,9 @@ Example:: Array MaxCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::max); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::max); } RELAY_REGISTER_REDUCE_OP("max") @@ -485,10 +478,9 @@ RELAY_REGISTER_REDUCE_OP("max") Array MinCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::min); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::min); } @@ -504,10 +496,9 @@ RELAY_REGISTER_REDUCE_OP("min") Array ProdCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { - return ReduceCompute(attrs, inputs, out_type, target, topi::prod); + const Array& inputs, + const Type& out_type) { + return ReduceCompute(attrs, inputs, out_type, topi::prod); } RELAY_REGISTER_REDUCE_OP("prod") @@ -534,9 +525,8 @@ Example:: Array MeanCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { IndexExpr count = tir::make_const(inputs[0]->dtype, 1); const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); @@ -546,7 +536,7 @@ Array MeanCompute(const Attrs& attrs, param->exclude)) { count *= inputs[0]->shape[i]; } - auto res = ReduceCompute(attrs, inputs, out_type, target, topi::sum); + auto res = ReduceCompute(attrs, inputs, out_type, topi::sum); return {topi::divide(res[0], count)}; } @@ -599,9 +589,8 @@ bool VarianceRel(const Array& types, } Array VarianceCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { IndexExpr count = tir::make_const(inputs[0]->dtype, 1); const ReduceAttrs* param = attrs.as(); CHECK(param != nullptr); @@ -615,7 +604,7 @@ Array VarianceCompute(const Attrs& attrs, } std::vector expand_shape; auto sq_diff = topi::power(topi::subtract(data, mean), 2); - auto var = topi::divide(ReduceCompute(attrs, {sq_diff}, out_type, target, topi::sum)[0], count); + auto var = topi::divide(ReduceCompute(attrs, {sq_diff}, out_type, topi::sum)[0], count); return {var}; } diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 969912f4de8b..fa96d7fcbee7 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -66,9 +66,8 @@ bool 
CastRel(const Array& types, } Array CastCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const CastAttrs *param = attrs.as(); CHECK(param != nullptr); DataType dtype = param->dtype; @@ -126,9 +125,8 @@ bool CastLikeRel(const Array& types, Array CastLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::cast(inputs[0], inputs[1]->dtype) }; } @@ -156,8 +154,9 @@ RELAY_REGISTER_OP("cast_like") .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); -Array ReinterpretCompute(const Attrs& attrs, const Array& inputs, - const Type& out_type, const Target& target) { +Array ReinterpretCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type) { const CastAttrs* param = attrs.as(); CHECK(param != nullptr); DataType dtype = param->dtype; @@ -231,9 +230,8 @@ bool ExpandDimsRel(const Array& types, } Array ExpandDimsCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ExpandDimsAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::expand_dims(inputs[0], param->axis, param->num_newaxis) }; @@ -270,9 +268,8 @@ RELAY_REGISTER_OP("expand_dims") TVM_REGISTER_NODE_TYPE(ConcatenateAttrs); Array ConcatenateCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ConcatenateAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::concatenate(inputs, param->axis) }; @@ -413,9 +410,8 @@ bool StackRel(const Array& types, } Array StackCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const StackAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::stack(inputs, param->axis) }; @@ -505,9 +501,8 @@ bool TransposeRel(const Array& types, } Array TransposeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ topi::transpose(inputs[0], param->axes) }; @@ -688,9 +683,8 @@ bool ReshapeRel(const Array& types, } Array ReshapeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); CHECK(out_ttype != nullptr); Array newshape; @@ -923,9 +917,8 @@ bool TakeRel(const Array& types, } Array TakeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); if (!param->axis.defined()) { @@ -1010,9 +1003,8 @@ bool FullRel(const Array& types, } Array FullCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); return { topi::full(out_ttype->shape, out_ttype->dtype, inputs[0]()) }; } @@ -1118,9 +1110,8 @@ bool FullLikeRel(const Array& types, } Array FullLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return 
{ topi::full_like(inputs[0], inputs[1]()) }; } @@ -1230,9 +1221,8 @@ inline te::Tensor DynamicArange(const te::Tensor& start, } Array ArangeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ArangeAttrs* param = attrs.as(); te::Tensor start = inputs[0]; te::Tensor stop = inputs[1]; @@ -1325,9 +1315,8 @@ bool RepeatRel(const Array& types, } Array RepeatCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const RepeatAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::repeat(inputs[0], param->repeats, param->axis) }; @@ -1436,9 +1425,8 @@ bool TileRel(const Array& types, } Array TileCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const TileAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::tile(inputs[0], param->reps) }; @@ -1497,9 +1485,8 @@ bool ReverseRel(const Array& types, } Array ReverseCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const ReverseAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::flip(inputs[0], param->axis) }; @@ -1571,9 +1558,8 @@ Expr MakeWhere(const Expr& condition, const Expr& x, const Expr& y) { } Array WhereCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::where(inputs[0], inputs[1], inputs[2]) }; } @@ -1688,9 +1674,8 @@ bool SqueezeRel(const Array& types, } Array SqueezeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const SqueezeAttrs *param = attrs.as(); CHECK(param != nullptr); return { topi::squeeze(inputs[0], param->axis) }; @@ -1729,9 +1714,8 @@ Expr MakeCollapseSumLike(Expr data, } Array CollapseSumLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); CHECK(out_ttype != nullptr); return { topi::collapse_sum(inputs[0], out_ttype->shape) }; @@ -1774,9 +1758,8 @@ Expr MakeBroadCastTo(Expr data, Array shape) { } Array BroadCastToCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { auto ioattrs = attrs.as(); CHECK(ioattrs != nullptr); return { topi::broadcast_to(inputs[0], ioattrs->shape) }; @@ -1812,9 +1795,8 @@ Expr MakeBroadCastToLike(Expr data, } Array BroadCastToLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* out_ttype = out_type.as(); CHECK(out_ttype != nullptr); return { topi::broadcast_to(inputs[0], out_ttype->shape) }; @@ -2019,9 +2001,8 @@ Expr MakeStridedSlice(Expr data, } Array StridedSliceCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const StridedSliceAttrs *param = attrs.as(); CHECK(param != nullptr); return Array{ @@ -2176,9 +2157,8 @@ bool SplitRel(const Array& types, } Array SplitCompute(const Attrs& attrs, - const Array& inputs, - 
const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto param = attrs.as(); CHECK(param != nullptr); @@ -2305,9 +2285,8 @@ Expr MakeSliceLike(Expr data, } Array SliceLikeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); Array src_shape = inputs[0]->shape; @@ -2371,9 +2350,8 @@ RELAY_REGISTER_OP("slice_like") TVM_REGISTER_NODE_TYPE(LayoutTransformAttrs); Array LayoutTransformCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ @@ -2504,9 +2482,8 @@ bool GatherNDRel(const Array& types, } Array GatherNDCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { return { topi::gather_nd(inputs[0], inputs[1]) }; } @@ -2558,9 +2535,8 @@ bool SequenceMaskRel(const Array& types, } Array SequenceMaskCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array{ @@ -2671,9 +2647,8 @@ bool OneHotRel(const Array& types, } Array OneHotCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { const auto* param = attrs.as(); CHECK(param != nullptr); return Array { diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index 7f6db50bf702..caa6451542c9 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -34,9 +34,8 @@ namespace relay { #define RELAY_UNARY_COMPUTE(FTOPI) \ [] (const Attrs& attrs, \ - const Array& inputs, \ - const Type& out_type, \ - const Target& target) -> Array { \ + const Array& inputs, \ + const Type& out_type) -> Array { \ return {FTOPI(inputs[0])}; \ } \ @@ -302,9 +301,8 @@ bool ShapeOfRel(const Array& types, } Array ShapeOfCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { CHECK_EQ(inputs.size(), 1); const auto* param = attrs.as(); CHECK(param != nullptr); @@ -353,9 +351,8 @@ bool NdarraySizeRel(const Array& types, } Array NdarraySizeCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type, - const Target& target) { + const Array& inputs, + const Type& out_type) { CHECK_EQ(inputs.size(), 1); const auto* param = attrs.as(); CHECK(param != nullptr); diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc index 9c4a2850903b..7d152718f3a0 100644 --- a/src/relay/op/vision/yolo.cc +++ b/src/relay/op/vision/yolo.cc @@ -83,8 +83,7 @@ Its function is mostly shape transform.")doc" TVM_ADD_FILELINE) .add_type_rel("YoloReorg", YoloReorgRel) .set_attr("FTVMCompute", [](const Attrs& attrs, const Array& inputs, - const Type& out_type, - const Target& target) { + const Type& out_type) { const auto* params = attrs.as(); CHECK(params != nullptr); return Array{ topi::vision::reorg(inputs[0], params->stride) }; diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc index 0cc3ff090dd8..fe8862523dda 100644 --- a/src/relay/pass/alter_op_layout.cc +++ b/src/relay/pass/alter_op_layout.cc @@ -83,7 +83,10 
@@ class AlterTransformMemorizer : public TransformMemorizer { auto ttype = expr->type_as(); tinfos.push_back(tvm::te::placeholder(ttype->shape, ttype->dtype)); } - Expr altered_value = falter_layout[op](ref_call->attrs, new_args, tinfos); + // TODO(@kevinthesun, @icemelon9): This won't work if inputs/outputs are dynamic shapes. + // Probably we need to disable the AlterOpLayout when compiling dynamic models. + Expr altered_value = falter_layout[op](ref_call->attrs, new_args, tinfos, + ref_call->checked_type()); if (altered_value.defined()) { new_e = altered_value; modified = true; diff --git a/src/te/schedule/schedule_lang.cc b/src/te/schedule/schedule_lang.cc index d3b448d37790..7a2e05a56fd3 100644 --- a/src/te/schedule/schedule_lang.cc +++ b/src/te/schedule/schedule_lang.cc @@ -20,9 +20,11 @@ /*! * \file schedule_lang.cc */ +#include #include #include #include +#include #include #include "graph.h" @@ -787,6 +789,53 @@ IterVarRelation SingletonNode::make(IterVar iter) { return IterVarRelation(n); } +SpecializedCondition::SpecializedCondition(Array conditions) { + ObjectPtr n = make_object(); + n->clauses = std::move(conditions); + data_ = std::move(n); +} + +/*! \brief Entry to hold the SpecializedCondition context stack. */ +struct TVMSpecializationThreadLocalEntry { + /*! \brief The current specialized condition */ + std::stack condition_stack; +}; + +/*! \brief Thread local store to hold the SpecializedCondition context stack. */ +typedef dmlc::ThreadLocalStore TVMSpecializationThreadLocalStore; + +void SpecializedCondition::EnterWithScope() { + TVMSpecializationThreadLocalEntry *entry = TVMSpecializationThreadLocalStore::Get(); + entry->condition_stack.push(*this); +} + +void SpecializedCondition::ExitWithScope() { + TVMSpecializationThreadLocalEntry *entry = TVMSpecializationThreadLocalStore::Get(); + CHECK(!entry->condition_stack.empty()); + CHECK(entry->condition_stack.top().same_as(*this)); + entry->condition_stack.pop(); +} + +SpecializedCondition SpecializedCondition::Current() { + TVMSpecializationThreadLocalEntry *entry = TVMSpecializationThreadLocalStore::Get(); + SpecializedCondition cond; + if (entry->condition_stack.size() > 0) { + cond = entry->condition_stack.top(); + } + return cond; +} + +class SpecializedCondition::Internal { + public: + static void EnterScope(SpecializedCondition cond) { + cond.EnterWithScope(); + } + + static void ExitScope(SpecializedCondition cond) { + cond.ExitWithScope(); + } +}; + TVM_REGISTER_NODE_TYPE(StageNode); TVM_REGISTER_NODE_TYPE(IterVarAttrNode); TVM_REGISTER_NODE_TYPE(SplitNode); @@ -794,6 +843,7 @@ TVM_REGISTER_NODE_TYPE(FuseNode); TVM_REGISTER_NODE_TYPE(RebaseNode); TVM_REGISTER_NODE_TYPE(SingletonNode); TVM_REGISTER_NODE_TYPE(ScheduleNode); +TVM_REGISTER_NODE_TYPE(SpecializedConditionNode); // Printer TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) @@ -848,7 +898,13 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); p->stream << "schedule(" << op << ")"; - }); +}) +.set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << "specialized_condition("; + p->Print(op->clauses); + p->stream << ')'; +}); TVM_REGISTER_GLOBAL("te.CreateSchedule") @@ -962,5 +1018,22 @@ TVM_REGISTER_GLOBAL("te.ScheduleCacheWrite") TVM_REGISTER_GLOBAL("te.ScheduleRFactor") .set_body_method(&Schedule::rfactor); + +TVM_REGISTER_GLOBAL("te.CreateSpecializedCondition") +.set_body_typed([](Array condition) { + return
SpecializedCondition(condition); +}); + +TVM_REGISTER_GLOBAL("te.GetCurrentSpecialization") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = SpecializedCondition::Current(); +}); + +TVM_REGISTER_GLOBAL("te.EnterSpecializationScope") +.set_body_typed(SpecializedCondition::Internal::EnterScope); + +TVM_REGISTER_GLOBAL("te.ExitSpecializationScope") +.set_body_typed(SpecializedCondition::Internal::ExitScope); + } // namespace te } // namespace tvm diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index c27408315016..b9a8f8f96f8b 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -24,18 +24,56 @@ #include #include #include +#include +#include +#include #include #include #include #include -TVM_REGISTER_GLOBAL("test.sch") -.set_body([](tvm::TVMArgs args, tvm::TVMRetValue *rv) { - *rv = topi::generic::schedule_injective(args[0], args[1]); - }); +using namespace tvm; +using namespace tvm::relay; + +TVM_REGISTER_GLOBAL("test.strategy") +.set_body_typed([](const Attrs& attrs, const Array& inputs, + const Type& out_type, const Target& target) { + FTVMCompute fcompute = [](const Attrs& attrs, + const Array& inputs, + const Type& out_type) -> Array { + CHECK_EQ(inputs.size(), 2U); + return {topi::add(inputs[0], inputs[1])}; + }; + FTVMSchedule fschedule = [](const Attrs& attrs, + const Array& outs, + const Target& target) { + With target_scope(target); + return topi::generic::schedule_injective(target, outs); + }; + + auto n = make_object(); + auto strategy = tvm::relay::OpStrategy(std::move(n)); + strategy.AddImplementation(fcompute, fschedule, "test.strategy", 10); + return strategy; +}); + +TVM_REGISTER_GLOBAL("relay.backend.lower_call") +.set_body_typed([](const relay::Call& call, const Array& inputs, + const Target& target) { + static auto fstrategy = Op::GetAttr("FTVMStrategy"); + Op op = Downcast(call->op); + auto out_type = call->checked_type(); + OpStrategy strategy = fstrategy[op](call->attrs, inputs, out_type, target); + auto impl = strategy->specializations[0]->implementations[0]; + auto outs = impl.Compute(call->attrs, inputs, out_type); + auto f = tvm::runtime::Registry::Get("relay.backend._make_LoweredOutput"); + if (!f) { + LOG(FATAL) << "relay.backend._make_LoweredOutput is not registered"; + } + return (*f)(outs, impl); +}); TEST(Relay, BuildModule) { - using namespace tvm; auto tensor_type = relay::TensorType({2, 3}, DataType::Float(32)); auto a = relay::VarNode::make("a", tensor_type); auto b = relay::VarNode::make("b", tensor_type); @@ -59,14 +97,15 @@ TEST(Relay, BuildModule) { } // get schedule auto reg = tvm::runtime::Registry::Get("relay.op._Register"); - auto s_i = tvm::runtime::Registry::Get("test.sch"); if (!reg) { LOG(FATAL) << "no _Register"; } - if (!s_i) { - LOG(FATAL) << "no _Register"; + auto fs = tvm::runtime::Registry::Get("test.strategy"); + if (!fs) { + LOG(FATAL) << "No test_strategy registered."; } - (*reg)("add", "FTVMSchedule", *s_i, 10); + auto fgeneric = GenericFunc::Get("test.strategy_generic").set_default(*fs); + (*reg)("add", "FTVMStrategy", fgeneric, 10); // build auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 8a6ceb81f263..504f70031e24 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -852,17 +852,22 @@ def verify(data_shape, 
out_shape, begin, end): def test_forward_convolution(): - def verify(data_shape, kernel_size, stride, pad, num_filter): - weight_shape=(num_filter, data_shape[1],) + kernel_size + def verify(data_shape, kernel_size, stride, pad, num_filter, is_depthwise=False): + if is_depthwise: + groups = data_shape[1] + weight_shape=(data_shape[1], num_filter // groups,) + kernel_size + else: + groups = 1 + weight_shape=(num_filter, data_shape[1],) + kernel_size x = np.random.uniform(size=data_shape).astype("float32") weight = np.random.uniform(size=weight_shape).astype("float32") bias = np.random.uniform(size=num_filter).astype("float32") ref_res = mx.nd.Convolution(data=mx.nd.array(x), weight=mx.nd.array(weight), bias=mx.nd.array(bias), kernel=kernel_size, stride=stride, - pad=pad, num_filter=num_filter) + pad=pad, num_filter=num_filter, num_group=groups) mx_sym = mx.sym.Convolution(mx.sym.var("x"), mx.sym.var("weight"), mx.sym.var("bias"), kernel=kernel_size, stride=stride, - pad=pad, num_filter=num_filter) + pad=pad, num_filter=num_filter, num_group=groups) shape_dict = {"x": x.shape, "weight": weight.shape, "bias": bias.shape} mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) for target, ctx in ctx_list(): @@ -879,6 +884,8 @@ def verify(data_shape, kernel_size, stride, pad, num_filter): verify(data_shape=(20, 1, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) verify(data_shape=(1, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) verify(data_shape=(20, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=2) + verify(data_shape=(1, 8, 32, 32), kernel_size=(3, 3), stride=(1, 1), pad=(1, 1), num_filter=8, + is_depthwise=True) def test_forward_deconvolution(): def verify(data_shape, kernel_size, stride, pad, num_filter): diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index fec7d3e4f83f..99f8b47cce07 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -25,7 +25,7 @@ from tvm import autotvm from tvm.autotvm.tuner import RandomTuner -@autotvm.template +@autotvm.register_customized_task("testing/conv2d_no_batching") def conv2d_no_batching(N, H, W, CI, CO, KH, KW): """An example template for testing""" assert N == 1, "Only consider batch_size = 1 in this template" @@ -114,7 +114,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW): def get_sample_task(target=tvm.target.cuda(), target_host=None): """return a sample task for testing""" - task = autotvm.task.create(conv2d_no_batching, + task = autotvm.task.create("testing/conv2d_no_batching", args=(1, 7, 7, 512, 512, 3, 3), target=target, target_host=target_host) return task, target diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 3e392a8e630f..24176e4c41dd 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import numpy as np +import pytest import tvm from tvm import relay @@ -384,6 +385,8 @@ def verify_any_conv2d_NCHWc(data_shape, kernel_shape, strides, padding, dilation assert result.asnumpy().shape == ref_out_shape, \ "Shape mismatch: expect %s but got %s." 
% (str(ref_out_shape), str(result.asnumpy().shape)) +# TODO(@kevinthesun): Need to fix the compute in conv2d_NCHWc to support any +@pytest.mark.skip def test_any_conv2d_NCHWc(): verify_any_conv2d_NCHWc((relay.Any(), 8, relay.Any(), relay.Any(), 8), (8, 8, 3, 3, 8, 8), (1, 1), (1, 1), (1, 1), "NCHW8c", "OIHW8i8o", "NCHW8c", (1, 8, 224, 224, 8), (1, 8, 224, 224, 8)) diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py index 8f550d82c4f6..e555caada626 100644 --- a/tests/python/relay/test_autotvm_task_extraction.py +++ b/tests/python/relay/test_autotvm_task_extraction.py @@ -39,25 +39,28 @@ def test_task_extraction(): target = 'llvm' mod_list = [] params_list = [] + conv2d = relay.op.get("nn.conv2d") + conv2d_transpose = relay.op.get("nn.conv2d_transpose") + dense = relay.op.get("nn.dense") mod, params, _ = get_network('resnet-18', batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) assert len(tasks) == 12 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) assert len(tasks) == 12 mod, params, _ = get_network('resnet-18', batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.dense,)) + ops=(dense,)) assert len(tasks) == 1 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.dense,)) + ops=(dense,)) assert len(tasks) == 1 mod, params, _ = get_network('resnet-18', batch_size=1) @@ -65,11 +68,14 @@ def test_task_extraction(): params_list.append(params) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) + ops=(conv2d, dense)) assert len(tasks) == 13 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) + ops=(conv2d, dense)) + assert len(tasks) == 13 + tasks = autotvm.task.extract_from_program(mod, target=target, + params=params) assert len(tasks) == 13 mod, params, _ = get_network('mobilenet', batch_size=1) @@ -77,65 +83,19 @@ def test_task_extraction(): params_list.append(params) tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) + ops=(conv2d, dense)) assert len(tasks) == 20 mod, params, _ = get_network('dcgan', batch_size=1) tasks = autotvm.task.extract_from_program(mod, target=target, params=params, - ops=(relay.op.nn.conv2d_transpose,)) + ops=(conv2d_transpose,)) assert len(tasks) == 4 tasks = autotvm.task.extract_from_multiple_program(mod_list, params_list, target=target, - ops=(relay.op.nn.conv2d,)) + ops=(conv2d,)) assert len(tasks) == 31 -def test_template_key_provided(): - """test task extraction using non-'direct' template_key""" - target = 'llvm' - - import topi - template_keys = { - # topi.nn.conv2d - is left blank to test fallback logic - topi.nn.dense: 'direct_nopack', - topi.nn.depthwise_conv2d_nchw: 'direct', - } - - mod, params, _ = get_network('mobilenet', batch_size=1) - tasks = autotvm.task.extract_from_program(mod['main'], target=target, - params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense), - template_keys=template_keys) - for task in tasks: - if 'dense' in task.name: - assert task.config_space.template_key == 'direct_nopack' - else: - assert task.config_space.template_key == 'direct' - -def 
test_template_key_empty(): - """test task extraction using empty template_key""" - target = 'llvm' - mod, params, _ = get_network('mobilenet', batch_size=1) - tasks = autotvm.task.extract_from_program(mod['main'], target=target, - params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense), - template_keys=None) - for task in tasks: - assert task.config_space.template_key == 'direct' - -def test_template_key_default(): - """test task extraction without template_key""" - target = 'llvm' - mod, params, _ = get_network('mobilenet', batch_size=1) - tasks = autotvm.task.extract_from_program(mod['main'], target=target, - params=params, - ops=(relay.op.nn.conv2d, relay.op.nn.dense)) - for task in tasks: - assert task.config_space.template_key == 'direct' - if __name__ == '__main__': test_task_extraction() - test_template_key_provided() - test_template_key_empty() - test_template_key_default() diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index fd7ec188611f..4e4122a28cf0 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -14,11 +14,136 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import numpy as np import tvm import tvm.testing -import numpy as np from tvm import relay +from tvm import autotvm +import topi +from tvm.relay.testing import run_infer_type +from tvm.relay.testing.temp_op_attr import TempOpAttr + + +@autotvm.register_topi_compute("test/conv2d_1") +def _compute_conv2d_1(cfg, input, filter, strides, padding, dilation, out_dtype): + return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) + +@autotvm.register_topi_schedule("test/conv2d_1") +def _schedule_conv2d_1(cfg, outs): + return topi.generic.schedule_conv2d_nchw(outs) + +@autotvm.register_topi_compute("test/conv2d_2") +def _compute_conv2d_2(cfg, input, filter, strides, padding, dilation, out_dtype): + return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) + +@autotvm.register_topi_schedule("test/conv2d_2") +def _schedule_conv2d_2(cfg, outs): + return topi.generic.schedule_conv2d_nchw(outs) + +def _compute_conv2d_3(input, filter, strides, padding, dilation, out_dtype): + return topi.nn.conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) + +def _schedule_conv2d_3(outs): + return topi.generic.schedule_conv2d_nchw(outs) + +@tvm.target.override_native_generic_func("test_conv2d_strategy") +def _tmp_strategy(attrs, inputs, out_type, target): + strategy = relay.op.OpStrategy() + strategy.add_implementation( + relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_1), + relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_1), + name="conv2d_1", + plevel=10) + strategy.add_implementation( + relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_2), + relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_2), + name="conv2d_2", + plevel=15) + ic = inputs[0].shape[1] + with tvm.te.SpecializedCondition(ic >= 16): + strategy.add_implementation( + relay.op.strategy.wrap_compute_conv2d(_compute_conv2d_3), + relay.op.strategy.wrap_topi_schedule(_schedule_conv2d_3), + name="conv2d_3", + plevel=20) + return strategy + +def _create_record(task_name, dshape, wshape, target, cost): + args = [tvm.placeholder(dshape), tvm.placeholder(wshape), (1, 1), (1, 1, 1, 1), + (1, 1), 'float32'] + task = autotvm.task.create(task_name, args, target) + cfg = 
autotvm.ConfigEntity(0, None, {}, []) + cfg.cost = cost + inp = autotvm.MeasureInput(target=target, task=task, config=cfg) + result = autotvm.MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) + return (inp, result) + +def test_get_valid_implementations(): + target = tvm.target.create("llvm") + + def _get_impls(dshape, wshape): + data = relay.var("data", shape=dshape) + weight = relay.var("wshape", shape=wshape) + out = relay.nn.conv2d(data, weight, padding=(1, 1)) + out = run_infer_type(out) + return relay.backend.compile_engine.get_valid_implementations( + relay.op.get("nn.conv2d"), + out.attrs, + [tvm.placeholder(dshape), tvm.placeholder(wshape)], + out.checked_type, + target) + + with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy): + impls = _get_impls((1, 8, 7, 7), (32, 8, 3, 3)) + assert len(impls) == 2 + impls = _get_impls((1, 16, 7, 7), (32, 16, 3, 3)) + assert len(impls) == 3 + +def test_select_implementation(): + target = tvm.target.create("llvm") + + def _select_impl(dshape, wshape, use_autotvm=False): + data = relay.var("data", shape=dshape) + weight = relay.var("wshape", shape=wshape) + out = relay.nn.conv2d(data, weight, padding=(1, 1)) + out = run_infer_type(out) + return relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), + out.attrs, + [tvm.placeholder(dshape), tvm.placeholder(wshape)], + out.checked_type, + target, + use_autotvm) + + with TempOpAttr("nn.conv2d", "FTVMStrategy", _tmp_strategy): + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3)) + assert impl.name == "conv2d_2" + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True) + assert impl.name == "conv2d_2" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3)) + assert impl.name == "conv2d_3" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) + assert impl.name == "conv2d_3" + + # add autotvm record + records = [] + records.append(_create_record("test/conv2d_1", (1, 8, 7, 7), (32, 8, 3, 3), target, 0.5)) + records.append(_create_record("test/conv2d_1", (1, 16, 7, 7), (32, 16, 3, 3), target, 1.0)) + with target: + with autotvm.apply_history_best(records): + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True) + assert impl.name == "conv2d_1" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) + assert impl.name == "conv2d_1" + records.append(_create_record("test/conv2d_2", (1, 8, 7, 7), (32, 8, 3, 3), target, 0.2)) + records.append(_create_record("test/conv2d_1", (1, 16, 7, 7), (32, 16, 3, 3), target, 1.2)) + with target: + with autotvm.apply_history_best(records): + impl, _ = _select_impl((1, 8, 7, 7), (32, 8, 3, 3), True) + assert impl.name == "conv2d_2" + impl, _ = _select_impl((1, 16, 7, 7), (32, 16, 3, 3), True) + assert impl.name == "conv2d_1" def test_compile_engine(): engine = relay.backend.compile_engine.get() @@ -109,6 +234,8 @@ def test_compile_nhwc_pack(): if __name__ == "__main__": + test_get_valid_implementations() + test_select_implementation() test_compile_engine() test_compile_placeholder_bypass() test_compile_injective_with_tuple() diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 0d3fd4b3f829..d545d0c1635a 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -199,7 +199,7 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, except_targets = [] x = relay.var("x", shape=dshape, dtype=dtype) - w = relay.var("w", dtype=dtype) + w = relay.var("w", shape=kshape, dtype=dtype) y = relay.nn.conv2d(x, w, padding=padding, 
dilation=dilation, @@ -222,7 +222,7 @@ def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape, continue intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, kernel) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4, atol=1e-4) def compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, padding=(1, 1), @@ -230,7 +230,7 @@ def compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, dilation=(1, 1), **attrs): x = relay.var("x", shape=dshape, dtype=dtype) - w = relay.var("w", dtype=dtype) + w = relay.var("w", shape=kshape, dtype=dtype) y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, @@ -240,13 +240,13 @@ def compile_test_conv2d_arm_cpu(dtype, out_dtype, scale, dshape, kshape, mod = tvm.IRModule() mod["main"] = func - test_schedule='{"i": ["llvm -device=arm_cpu", "topi_nn_depthwise_conv2d_nchw", \ + test_schedule='{"i": ["llvm -device=arm_cpu", "depthwise_conv2d_nchw_spatial_pack.arm_cpu", \ [["TENSOR", [1, 512, 32, 32], "float32"], \ ["TENSOR", [512, 1, 3, 3], "float32"], \ [1, 1], [1, 1], [1, 1], "float32"], {}, \ - ["depthwise_conv2d_nchw", [1, 512, 32, 32, "float32"], \ + ["depthwise_conv2d_nchw_spatial_pack.arm_cpu", [1, 512, 32, 32, "float32"], \ [512, 1, 3, 3, "float32"], [1, 1], [1, 1], [1, 1], "float32"], \ - {"i": 743640, "t": "contrib_spatial_pack", "c": null, \ + {"i": 743640, "t": "", "c": null, \ "e": [["tile_co", "sp", [32, 16]], ["tile_oh", "sp", [8, 1]], \ ["tile_ow", "sp", [1, 8]], \ ["reorder_0", "re", [0, 1, 2, 3, 4, 5, 8, 6, 7]], \ @@ -319,8 +319,8 @@ def _query_inside(self, target, workload): if key in self.memory: return self.memory[key] cfg = autotvm.task.space.FallbackConfigEntity() - cfg.template_key = 'winograd' cfg.is_fallback = False + cfg.cost = 0.1 if 'winograd' in workload[0] else 1 cfg['tile_b'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_y'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) cfg['tile_x'] = autotvm.task.space.SplitEntity([-1, 1, 1, 1]) @@ -1113,6 +1113,9 @@ def _has_fast_int8_instructions(asm, target): else: assert False, "Target should be Skylake or Cascadelake" + # TODO(@anijain2305, @icemelon9): disable conv2d_int8 for NHWC data layout. + # Re-enable this after adding conv2d_NCHWc_int8 support for NHWC. + # compile conv2d for x86 (skylake, cascadelake) and test assembly contains *pmadd* instructions targets = ["llvm -mcpu=skylake-avx512", "llvm -mcpu=cascadelake"] llvm_version = tvm.target.codegen.llvm_version_major() @@ -1127,11 +1130,11 @@ def _has_fast_int8_instructions(asm, target): dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - for ic in [1, 4, 6]: - asm = _compile(ic=ic, oc=16, target=target, data_layout="NHWC", - kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # for ic in [1, 4, 6]: + # asm = _compile(ic=ic, oc=16, target=target, data_layout="NHWC", + # kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Sweep the output channels to check int8 robustness # Output channels should be a multiple of 16 internally. 
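The int8 hunks above and below all exercise one pattern: compile a small conv2d for a Skylake-class x86 target and scan the generated assembly for the pmadd dot-product instructions. A minimal sketch of that pattern, assuming the relay.build API of this release; compile_and_get_asm below is a hypothetical stand-in for the test's _compile helper, not its actual code:

    import tvm
    from tvm import relay

    def compile_and_get_asm(ic=16, oc=32, target="llvm -mcpu=skylake-avx512"):
        # Build a small int8 NCHW conv2d and return its x86 assembly.
        data = relay.var("data", shape=(1, ic, 56, 56), dtype="int8")
        kernel = relay.var("kernel", shape=(oc, ic, 3, 3), dtype="int8")
        out = relay.nn.conv2d(data, kernel, channels=oc, kernel_size=(3, 3),
                              padding=(1, 1), out_dtype="int32")
        mod = tvm.IRModule.from_expr(relay.Function([data, kernel], out))
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build(mod, target=target)
        return lib.get_source("asm")

    # Fast int8 on Skylake shows up as vpmaddubsw/vpmaddwd sequences.
    assert "pmadd" in compile_and_get_asm()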
@@ -1141,20 +1144,20 @@ def _has_fast_int8_instructions(asm, target): dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - for oc in [4, 16, 20]: - asm = _compile(ic=8, oc=oc, target=target, data_layout="NHWC", - kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # for oc in [4, 16, 20]: + # asm = _compile(ic=8, oc=oc, target=target, data_layout="NHWC", + # kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Check that both non-divisible oc and ic work asm = _compile(ic=17, oc=29, target=target, data_layout="NCHW", kernel_layout='OIHW', dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Check that int8 x int8 goes through legalization so that fast instructions can be picked up. for target in targets: @@ -1165,16 +1168,16 @@ def _has_fast_int8_instructions(asm, target): dtypes=dtypes) assert _has_fast_int8_instructions(asm, target) - asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', - dtypes=dtypes) - assert _has_fast_int8_instructions(asm, target) + # asm = _compile(ic=17, oc=29, target=target, data_layout="NHWC", kernel_layout='HWIO', + # dtypes=dtypes) + # assert _has_fast_int8_instructions(asm, target) # Ensure that code is generated when datatypes are not HW supported. - dtypes = ('uint8', 'uint8', 'int32') - asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO', - dtypes=dtypes) - # Check that intrinisic is not present in the assembly. - assert not _has_fast_int8_instructions(asm, target) + # dtypes = ('uint8', 'uint8', 'int32') + # asm = _compile(ic=16, oc=32, target=target, data_layout="NHWC", kernel_layout='HWIO', + # dtypes=dtypes) + # # Check that intrinsic is not present in the assembly. + # assert not _has_fast_int8_instructions(asm, target) # Check that a vectorized instruction is generated for older Intel # generations, because we default to NCHWc layout.
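The bitserial/bitpack hunk that follows only touches whitespace in the type assertions and leaves a TODO for run tests; the underlying check is the usual infer-type pattern. Spelled out as a standalone sketch, assuming the test's symbolic dims are o = i = 32 and h = w = 128:

    from tvm import relay
    from tvm.relay.testing import run_infer_type

    # Packing 32 int16 input channels (pack_axis=1) into uint16 words
    # yields 2 packed channels plus a trailing bit axis of length bits=1.
    x = relay.var("x", relay.ty.TensorType((32, 32, 128, 128), "int16"))
    y = relay.nn.bitpack(x, bit_axis=4, pack_axis=1, pack_type="uint16", bits=1)
    yy = run_infer_type(y)
    assert yy.checked_type == relay.TensorType((32, 2, 128, 128, 1), "uint16")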
@@ -1223,7 +1226,7 @@ def test_bitserial_conv2d_infer_type(): y = relay.nn.bitserial_conv2d( x, w, kernel_size=(3, 3), padding=(0, 0), channels=32) yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType( + assert yy.checked_type == relay.TensorType( (n, 32, 222, 222), "int16") @@ -1233,9 +1236,11 @@ def test_bitpack_infer_type(): x = relay.var("x", relay.ty.TensorType((o, i, h, w), "int16")) y = relay.nn.bitpack(x, bit_axis=4, pack_axis=1, pack_type='uint16', bits=1) yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType( + assert yy.checked_type == relay.TensorType( (32, 2, 128, 128, 1), "uint16") +# TODO(@jwfromm): Need to add bitserial_conv2d & bitpack run test cases + if __name__ == "__main__": test_pool1d() diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py index 67a7ef694033..e827c722b255 100644 --- a/tests/python/relay/test_op_qnn_conv2d.py +++ b/tests/python/relay/test_op_qnn_conv2d.py @@ -116,23 +116,13 @@ def get_funcs(data_shape, data_layout, kernel_layout, out_dtype, - groups=1): + groups=1, + channels=None): data = relay.var("data", shape=data_shape, dtype=data_dtype) kernel = relay.var("kernel", shape=kernel_shape, dtype=kernel_dtype) - if groups > 1: - channels = groups - elif kernel_layout == "OIHW": - channels = kernel_shape[0] - elif kernel_layout == "HWIO": - channels = kernel_shape[3] - elif kernel_layout == "HWOI": - channels = kernel_shape[2] - else: - raise NotImplementedError - ref_func = get_ref_func(data, kernel, input_zero_point, @@ -431,9 +421,9 @@ def test_layout(): kernel_shape, kernel_dtype) # NHWC and HWOI layout. Used in depthwise conv. - data_shape = (2, 2, 4, 1) # NHWC + data_shape = (2, 2, 4, 3) # NHWC data_dtype = 'uint8' - kernel_shape = (2, 2, 1, 1) # HWOI + kernel_shape = (2, 2, 3, 1) # HWOI kernel_dtype = 'uint8' ref_func, qnn_func = get_funcs(data_shape=data_shape, data_dtype=data_dtype, @@ -447,6 +437,7 @@ def test_layout(): padding=(0, 0), strides=(1, 1), dilation=(1, 1), + groups=3, data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32") @@ -826,7 +817,8 @@ def test_depthwise_depth_multiplier(): data_layout="NCHW", kernel_layout="OIHW", out_dtype="int32", - groups=8) + groups=4, + channels=8) verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) @@ -875,7 +867,8 @@ def test_depthwise_depth_multiplier(): data_layout="NHWC", kernel_layout="HWOI", out_dtype="int32", - groups=8) + groups=4, + channels=8) verify(ref_func, qnn_func, data_shape, data_dtype, kernel_shape, kernel_dtype) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 2ec3f282a6c4..df01310937ed 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. 
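Every alter_conv2d callback in the test file below gains a fourth out_type argument, mirroring the FAlterOpLayout signature change made in alter_op_layout.cc earlier in this patch. A minimal usage sketch under the new signature; the pipeline wiring here is illustrative and assumes InferType is run before AlterOpLayout, rather than reproducing the test's own helpers:

    import tvm
    from tvm import relay
    from tvm.relay import transform
    from tvm.relay.testing.temp_op_attr import TempOpAttr

    def alter_conv2d(attrs, inputs, tinfos, out_type):
        # out_type is the newly added argument: the checked type of the call
        # being altered.
        data, weight = inputs
        new_attrs = dict(attrs)
        new_attrs["data_layout"] = "NCHW16c"
        return relay.nn.conv2d(data, weight, **new_attrs)

    x = relay.var("x", shape=(1, 64, 56, 56))
    w = relay.var("w", shape=(64, 64, 3, 3))
    y = relay.nn.conv2d(x, w, channels=64, kernel_size=(3, 3), padding=(1, 1))
    mod = tvm.IRModule.from_expr(relay.Function([x, w], y))

    with TempOpAttr("nn.conv2d", "FTVMAlterOpLayout", alter_conv2d):
        mod = transform.InferType()(mod)
        mod = transform.AlterOpLayout()(mod)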
"""Test alter op layout pass""" -import tvm +import pytest +import tvm from tvm import relay from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr @@ -43,7 +44,7 @@ def before(): y = relay.Function([x, weight], y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs weight = relay.multiply(weight, relay.const(2.0, "float32")) return relay.nn.conv2d(data, weight, **attrs) @@ -77,7 +78,7 @@ def before(): called = [False] - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): called[0] = True return None @@ -109,7 +110,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -176,7 +177,7 @@ def before(): y = relay.Function(analysis.free_vars(ret), ret) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -236,7 +237,7 @@ def before(): y = relay.nn.global_max_pool2d(y) return relay.Function(analysis.free_vars(y), y) - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -285,7 +286,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -342,7 +343,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -394,7 +395,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -427,7 +428,7 @@ def expected(): def test_alter_layout_concatenate(): """ NCHW, NHWC and corner case concatenate layout transform.""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -536,7 +537,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -572,7 +573,7 @@ def before(): y = relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW4c' @@ -607,9 +608,9 @@ def before(): return y import topi - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): with tvm.target.create("llvm"): - return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, relay) + return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) def expected(): @@ -643,7 +644,7 @@ def before(): y = 
relay.Function(analysis.free_vars(y), y) return y - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -675,7 +676,7 @@ def expected(): def test_alter_layout_pad(): """ Check NCHW, NHWC and corner case for pad layout conversion""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -785,7 +786,7 @@ def expected(): def test_alter_layout_pool(): """ Check NCHW, NHWC pool layout conversion""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -862,7 +863,7 @@ def expected_nhwc(): def test_alter_layout_sum(): """ Check NCHW, NHWC sum layout conversion""" - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -938,11 +939,15 @@ def expected_nhwc(): assert analysis.alpha_equal(a, b), "Actual = \n" + str(a) +# TODO(@anijain2305, @icemelon9): We should fix this. This doesn't seem to be the +# right behavior of alter_layout +@pytest.mark.skip def test_alter_layout_nhwc_nchw_arm(): """ Check NHWC to NHCW conversion for a small sequence of ops.""" - def alter_conv2d(attrs, inputs, tinfos): - from topi.arm_cpu.conv2d import _alter_conv2d_layout_arm - return _alter_conv2d_layout_arm(attrs, inputs, tinfos, tvm.relay) + def alter_conv2d(attrs, inputs, tinfos, out_type): + import topi + with tvm.target.create("llvm -device=arm_cpu"): + return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, out_type) # Check NHWC conversion. def before_nhwc(): @@ -1011,7 +1016,7 @@ def before(): mod["main"] = relay.Function([x, weight], foo(x, weight)) return mod - def alter_conv2d(attrs, inputs, tinfos): + def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs weight = relay.multiply(weight, relay.const(2.0, "float32")) return relay.nn.conv2d(data, weight, **attrs) @@ -1054,5 +1059,5 @@ def expected(): test_alter_layout_pad() test_alter_layout_pool() test_alter_layout_sum() - test_alter_layout_nhwc_nchw_arm() + # test_alter_layout_nhwc_nchw_arm() test_alter_op_with_global_var() diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 443d2e40634d..02438ef04f2a 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
import numpy as np +import pytest + import tvm from tvm import relay from tvm.relay import testing @@ -55,7 +57,8 @@ def get_calibration_dataset(input_name): return dataset -def test_calibrate_target(create_target=False): +@pytest.mark.parametrize("create_target", [True, False]) +def test_calibrate_target(create_target): mod, params = testing.resnet.get_workload(num_layers=18) dataset = get_calibration_dataset("data") with relay.quantize.qconfig(calibrate_mode="kl_divergence"): diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py index 13995732d8ee..bfc3caba45e3 100644 --- a/tests/python/relay/test_pass_fold_scale_axis.py +++ b/tests/python/relay/test_pass_fold_scale_axis.py @@ -131,12 +131,13 @@ def expected(x, conv_weight, in_bias, in_scale, channels): z = relay.add(y1, y2) return relay.Function(args, z) - def check(shape, channels): - x = relay.var("x", shape=shape) - in_channels = shape[-1] + def check(dshape, channels): + x = relay.var("x", shape=dshape) + in_channels = dshape[-1] # test depthwise assert in_channels == channels - weight = relay.var("weight") + wshape = (3, 3, 1, channels) # HWIO + weight = relay.var("weight", shape=wshape) in_bias = relay.var("in_bias", shape=(in_channels,)) in_scale = relay.const(_get_positive_scale(in_channels,)) y1 = before(x, weight, in_bias, in_scale, channels) diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py index fac9f062a2e8..83bbd5492619 100644 --- a/tests/python/unittest/test_autotvm_common.py +++ b/tests/python/unittest/test_autotvm_common.py @@ -36,7 +36,7 @@ def run(self, measure_inputs, build_results): def get_build_kwargs(self): return {} -@autotvm.template +@autotvm.register_customized_task("testing/matmul") def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -63,7 +63,7 @@ def matmul(N, L, M, dtype): return s, [A, B, C] -@autotvm.template +@autotvm.register_customized_task("testing/bad_matmul") def bad_matmul(N, L, M, dtype): if 'bad_device' in tvm.target.Target.current().keys: A = tvm.placeholder((N, L), name='A', dtype=dtype) @@ -85,7 +85,7 @@ def bad_matmul(N, L, M, dtype): def get_sample_task(n=128): """return a sample task for testing""" target = tvm.target.create("llvm") - task = autotvm.task.create(matmul, args=(n, n, n, 'float32'), target=target) + task = autotvm.task.create("testing/matmul", args=(n, n, n, 'float32'), target=target) return task, target def get_sample_records(n): diff --git a/tests/python/unittest/test_autotvm_dispatch_context.py b/tests/python/unittest/test_autotvm_dispatch_context.py index 716ab7f807f9..5a55c4f97ca1 100644 --- a/tests/python/unittest/test_autotvm_dispatch_context.py +++ b/tests/python/unittest/test_autotvm_dispatch_context.py @@ -18,42 +18,11 @@ The dispatcher can choose which template to use according to the parameters of workload""" -from collections import namedtuple from tvm import autotvm -from tvm.autotvm.task import dispatcher, DispatchContext - -SimpleConfig = namedtuple('SimpleConfig', ('template_key', 'is_fallback')) - -def test_dispatch(): - @dispatcher - def my_dispatcher(a, b): - return (a, b) - - @my_dispatcher.register("im2col") - def _im2col(cfg, a, b): - return a - - @my_dispatcher.register("spatial_pack") - def _spatial_pack(cfg, a, b): - return b - - class SimpleDispatcher(DispatchContext): - def query(self, target, workload): - a, b = workload - tkey = "spatial_pack" if a + b > 2 
else "im2col" - cfg = SimpleConfig(tkey, False) - return cfg - - with SimpleDispatcher(): - # this will call im2col - assert my_dispatcher(1, 0) == 1 - - # this will call spatial pack - assert my_dispatcher(1, 100) == 100 def test_fallback(): - @autotvm.template + @autotvm.register_customized_task("testing/dispatch/fallback") def simple_template(a, b): cfg = autotvm.get_config() assert cfg.is_fallback @@ -62,5 +31,4 @@ def simple_template(a, b): if __name__ == "__main__": - test_dispatch() test_fallback() diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index 48a1d31899e7..0899f6f5bbff 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -64,7 +64,7 @@ def _callback_correct(tuner, measure_inputs, measure_results): # a bad template n = 128 target = tvm.target.create("llvm -device=bad_device") - task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target) + task = autotvm.task.create("testing/bad_matmul", args=(n, n, n, 'float32'), target=target) def _callback_wrong(tuner, measure_inputs, measure_results): for _, res in zip(measure_inputs, measure_results): diff --git a/tests/python/unittest/test_codegen_blob.py b/tests/python/unittest/test_codegen_blob.py index 2e0cee24097e..c14607d0c0b7 100644 --- a/tests/python/unittest/test_codegen_blob.py +++ b/tests/python/unittest/test_codegen_blob.py @@ -101,4 +101,4 @@ def test_system_lib(): if __name__ == "__main__": test_resnet18() - test_system_lib() + #test_system_lib() diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py index ec36a5fa5a7a..8652817c21ce 100644 --- a/tests/python/unittest/test_codegen_cuda.py +++ b/tests/python/unittest/test_codegen_cuda.py @@ -305,7 +305,7 @@ def check_cuda(dtype, m=32, n=32): e = topi.elemwise_sum([c, d]) g = topi.sum(e) with tvm.target.cuda(): - sg = topi.generic.schedule_reduce(g) + sg = topi.cuda.schedule_reduce(g) ctx = tvm.gpu(0) func = tvm.build(sg, [a, b, g], 'cuda') a_np = np.random.uniform(size=(m, n)).astype(a.dtype) diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py index a8b22fd787ee..27e077f5319c 100644 --- a/tests/python/unittest/test_graph_tuner_core.py +++ b/tests/python/unittest/test_graph_tuner_core.py @@ -31,7 +31,6 @@ from tvm.autotvm.task import ConfigEntity from tvm.autotvm.measure import MeasureResult, MeasureInput from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner -from test_graph_tuner_utils import create_workload def _create_data(target, dshape, dtype, layout): @@ -48,68 +47,53 @@ def _create_data(target, dshape, dtype, layout): tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) - wkl_list = [ - create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] + ops=(relay.op.get("nn.conv2d"),)) costs = [0.04, 0.012, 0.03] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [3, 1]], - ["tile_oc", "sp", [4, 4]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [3, 1]], + ["tile_oc", "sp", [4, 
4]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [2, 8]], - ["tile_oc", "sp", [1, 32]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [2, 8]], + ["tile_oc", "sp", [1, 32]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [8, 4]], - ["tile_oc", "sp", [4, 8]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [8, 4]], + ["tile_oc", "sp", [4, 8]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) ltf_keys = [] ltf_arg = [tvm.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) + ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform') ltf_keys.append(ltf_wkl) ltf_arg = [tvm.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) + ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform') ltf_keys.append(ltf_wkl) ltf_arg = [tvm.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) + ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform') ltf_keys.append(ltf_wkl) return net, records, ltf_records, ltf_keys, tasks @@ -121,7 +105,8 @@ def test_graph_tuner_layout_transform(): dshape = (1, 3, 8, 8) dtype = "float32" layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] g, records, ltf_records, ltf_keys, _ = _create_data(target, dshape, dtype, layout) executor = DPTuner(g, {"data": dshape}, records, target_ops, target=target, log_file=log_file) @@ -156,36 +141,34 @@ def test_DPTuner_run(): dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout) mod = 
tvm.IRModule() mod["main"] = g costs = [0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) @@ -207,34 +190,32 @@ def test_PBQPTuner_run(): dtype = "float32" layout = "NCHW" dshape = (1, 3, 8, 8) - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] g, records, ltf_records, ltf_keys, tasks = _create_data(target, dshape, dtype, layout) costs = [0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) @@ -255,7 +236,8 @@ def test_many_sub_graphs(): dtype = "float32" dshape = (1, 8, 8, 3) layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] data = relay.var("data", shape=dshape, dtype=dtype) t0 = relay.transpose(data, (0, 3, 1, 2)) @@ -277,79 +259,63 @@ def test_many_sub_graphs(): tasks = autotvm.task.extract_from_program(net["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) - wkl_list = [ - 
create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 32, 8, 8), (32, 32, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] + ops=(conv2d,)) costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [3, 1]], - ["tile_oc", "sp", [4, 4]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [3, 1]], + ["tile_oc", "sp", [4, 4]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [2, 8]], - ["tile_oc", "sp", [1, 32]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [2, 8]], + ["tile_oc", "sp", [1, 32]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [8, 4]], - ["tile_oc", "sp", [4, 8]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [8, 4]], + ["tile_oc", "sp", [4, 8]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - wkl_list = wkl_list + wkl_list tasks = tasks + tasks - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = 
autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) @@ -376,7 +342,8 @@ def test_tuple(): dtype = "float32" dshape = (1, 5, 32, 32) layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] data = relay.var("data", shape=dshape, dtype=dtype) w0 = relay.var("w0_weight") @@ -390,62 +357,48 @@ def test_tuple(): tasks = autotvm.task.extract_from_program(net["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) - wkl_list = [ - create_workload((1, 5, 32, 32), (2, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 5, 32, 32), (3, 5, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] + ops=(conv2d,)) costs = [0.01, 0.012, 0.03, 0.04] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [1, 2]], - ["tile_ow", "sp", [4, 8]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [1, 2]], + ["tile_ow", "sp", [4, 8]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [1, 3]], - ["tile_ow", "sp", [2, 16]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [1, 3]], + ["tile_ow", "sp", [2, 16]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [2, 1]], - ["tile_ow", "sp", [4, 8]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [2, 1]], + ["tile_ow", "sp", [4, 8]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 5]], - ["tile_oc", "sp", [3, 1]], - ["tile_ow", "sp", [2, 16]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 5]], + ["tile_oc", "sp", [3, 1]], + ["tile_ow", "sp", [2, 16]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - - wkl_list = wkl_list + wkl_list tasks = tasks + tasks - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) 
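    # The canned timing stands in for a real device measurement: MeasureInput
    # identifies (target, task, config) and MeasureResult carries the cost,
    # matching what autotvm writes to a log during actual tuning.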
ltf_records.append((ms_input, ms_output)) @@ -472,7 +425,8 @@ def test_triangle_block(): dtype = "float32" dshape = (1, 3, 8, 8) layout = "NCHW" - target_ops = [relay.nn.conv2d] + conv2d = relay.op.get("nn.conv2d") + target_ops = [conv2d] data = relay.var("data", shape=dshape, dtype=dtype) w0 = relay.var("w0_weight") @@ -488,79 +442,63 @@ def test_triangle_block(): tasks = autotvm.task.extract_from_program(net["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) - wkl_list = [ - create_workload((1, 3, 8, 8), (16, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 16, 8, 8), (32, 16, 1, 1), (1, 1), (0, 0, 0, 0), (1, 1), layout, layout, dtype, dtype), - create_workload((1, 3, 8, 8), (32, 3, 3, 3), (1, 1), (1, 1, 1, 1), (1, 1), layout, layout, dtype, dtype), - ] + ops=(conv2d,)) costs = [0.04, 0.012, 0.03, 0.02, 0.02, 0.045] config_list = [] - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [3, 1]], - ["tile_oc", "sp", [4, 4]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [3, 1]], + ["tile_oc", "sp", [4, 4]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [2, 8]], - ["tile_oc", "sp", [1, 32]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [2, 8]], + ["tile_oc", "sp", [1, 32]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [8, 4]], - ["tile_oc", "sp", [4, 8]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [8, 4]], + ["tile_oc", "sp", [4, 8]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [1, 3]], - ["tile_oc", "sp", [2, 8]], - ["tile_ow", "sp", [4, 2]], - ["unroll_kw", "ot", True]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [1, 3]], + ["tile_oc", "sp", [2, 8]], + ["tile_ow", "sp", [4, 2]], + ["unroll_kw", "ot", True]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [4, 4]], - ["tile_oc", "sp", [2, 16]], - ["tile_oh", "ot", 1], - ["tile_ow", "sp", [4, 2]]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [4, 4]], + ["tile_oc", "sp", [2, 16]], + ["tile_oh", "ot", 1], + ["tile_ow", "sp", [4, 2]]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) - cfg_dict = {"i": -1, - "c": None, - "e": [["tile_ic", "sp", [16, 2]], - ["tile_oc", "sp", [8, 4]], - ["tile_ow", "sp", [2, 4]], - ["unroll_kw", "ot", False]], - "t": ""} + cfg_dict = {"index": -1, + "code_hash": None, + "entity": [["tile_ic", "sp", [16, 2]], + ["tile_oc", "sp", [8, 4]], + ["tile_ow", "sp", [2, 4]], + ["unroll_kw", "ot", False]]} config_list.append(ConfigEntity.from_json_dict(cfg_dict)) records = [] - wkl_list = wkl_list + wkl_list tasks = tasks + tasks - for wkl, cost, config, task in zip(wkl_list, costs, config_list, tasks): - task.workload = wkl + for cost, config, task in zip(costs, config_list, tasks): 
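+        # tasks was doubled above (tasks = tasks + tasks), so the six canned
+        # costs and configs pair one-to-one with the six conv2d task instances.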
ms_input = MeasureInput(target=target, task=task, config=config) ms_output = MeasureResult(costs=(cost,), error_no=0, all_cost=-1, timestamp=-1) records.append((ms_input, ms_output)) ltf_records = [] ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"] - ltf_arg = autotvm.task.topi_integration.serialize_args(ltf_arg) - ltf_wkl = ('layout_transform',) + autotvm.task.args_to_workload(ltf_arg) - ltf_task = copy.deepcopy(tasks[0]) - ltf_task.workload = ltf_wkl + ltf_task = autotvm.task.create('layout_transform', ltf_arg, target) ms_input = MeasureInput(target=target, task=ltf_task, config=None) ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1) ltf_records.append((ms_input, ms_output)) diff --git a/tests/python/unittest/test_graph_tuner_utils.py b/tests/python/unittest/test_graph_tuner_utils.py index 397ea235ecbf..112c5b8a7059 100644 --- a/tests/python/unittest/test_graph_tuner_utils.py +++ b/tests/python/unittest/test_graph_tuner_utils.py @@ -26,17 +26,7 @@ from tvm.relay.testing import resnet from tvm.autotvm.graph_tuner.utils import has_multiple_inputs, get_direct_ancestor, get_in_nodes, \ get_out_nodes, expr2graph, bind_inputs -from tvm.relay.expr import Call, TupleGetItem, Tuple -from topi.nn.conv2d import conv2d - - -def create_workload(dshape, kshape, strides, - padding, dilation, layout, - out_layout, dtype, out_dtype): - data = tvm.placeholder(dshape, dtype=dtype) - kernel = tvm.placeholder(kshape, dtype=dtype) - return autotvm.task.args_to_workload([data, kernel, strides, padding, dilation, layout, - out_dtype], conv2d) +from tvm.relay.expr import Call, TupleGetItem, Tuple, Var def verify_has_multiple_inputs(node_list, node_idx, input_names, expected_result): @@ -53,7 +43,7 @@ def test_has_multiple_inputs(): out = relay.add(out1, out2) net = relay.Function(relay.analysis.free_vars(out), out) net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1)}) - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] node_list = [] node_dict = {} expr2graph(net, target_ops, node_dict, node_list) @@ -67,22 +57,17 @@ def test_expr2graph(): mod, _ = resnet.get_workload(num_layers=50, batch_size=1) node_dict = {} node_list = [] - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] op_name_list = [] def _count_node(node): - if not isinstance(node, relay.op.op.Op,): - return if isinstance(node, Call): - op_name_list.append(node.op.name.split(".")[-1]) - elif isinstance(node, TupleGetItem): - op_name_list.append("TupleGetItem") - elif isinstance(node, Tuple): - op_name_list.append("Tuple") - else: - op_name_list.append("null") + op_name_list.append(node.op) + elif isinstance(node, (Var, TupleGetItem, Tuple)): + op_name_list.append(None) relay.analysis.post_order_visit(mod["main"], _count_node) expr2graph(mod["main"], target_ops, node_dict, node_list) + assert len(node_list) == len(op_name_list) for i, item in enumerate(zip(op_name_list, node_list)): op_name, node = item assert op_name == node["op"], "%dth Node operator mismatch: expecting %s but got %s" \ @@ -99,7 +84,7 @@ def test_get_direct_ancestor(): out = relay.nn.conv2d(out3, w1) net = relay.Function(relay.analysis.free_vars(out), out) net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)}) - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] node_list = [] node_dict = {} expr2graph(net, target_ops, node_dict, node_list) @@ -119,7 +104,7 @@ def test_get_in_nodes(): out = 
relay.nn.conv2d(out3, w1) net = relay.Function(relay.analysis.free_vars(out), out) net = bind_inputs(net, {"data": (1, 16, 224, 224), "w0": (16, 16, 1, 1), "w1": (16, 16, 1, 1)}) - target_ops = ["conv2d"] + target_ops = [relay.op.get("nn.conv2d")] input_names = ["data"] node_list = [] node_dict = {} diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py index 98fdeaaf4328..01c0d26dfc9b 100644 --- a/tests/python/unittest/test_lang_tensor_overload_op.py +++ b/tests/python/unittest/test_lang_tensor_overload_op.py @@ -108,7 +108,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_elemwise(B) + s = topi.testing.get_elemwise_schedule(device)(B) k_ = 2 foo = tvm.build(s, [A, B, k] + sh, device, name="tensor_scalar_" + typ) @@ -154,7 +154,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + typ) lhs_npy = np.random.uniform(size=lhs_shape).astype(A.dtype) @@ -190,12 +190,14 @@ def check_device(device): return print("Running on target: %s" % device) + conv2d_nchw, schedule_conv2d_nchw = topi.testing.get_conv2d_nchw_implement(device) + k = 10.0 dilation = (1, 1) with tvm.target.create(device): A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - B = topi.nn.conv2d(A, W, stride, padding, dilation) + B = conv2d_nchw(A, W, stride, padding, dilation, A.dtype) if typ == "add": C = B + k elif typ == "sub": @@ -206,7 +208,7 @@ def check_device(device): C = B / k else: raise NotImplementedError() - s = topi.generic.schedule_conv2d_nchw([C]) + s = schedule_conv2d_nchw([C]) foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ) diff --git a/topi/include/topi/cuda/normalization.h b/topi/include/topi/cuda/normalization.h index 1b42308d0ac2..bfc209db213b 100644 --- a/topi/include/topi/cuda/normalization.h +++ b/topi/include/topi/cuda/normalization.h @@ -35,13 +35,10 @@ using namespace tvm::te; namespace cuda { /*! * \brief Create a CUDA schedule for LRN -* -* \param target The target to generate a schedule for. * \param outs The output tensors. -* * \return A schedule for the given ops. */ -inline Schedule schedule_lrn(const Target &target, const Array& outs) { +inline Schedule schedule_lrn(const Array& outs) { Array out_ops; for (auto t : outs) { out_ops.push_back(t->op); diff --git a/topi/include/topi/rocm/normalization.h b/topi/include/topi/rocm/normalization.h index 692370d65bb7..303f4a8302c7 100644 --- a/topi/include/topi/rocm/normalization.h +++ b/topi/include/topi/rocm/normalization.h @@ -34,14 +34,11 @@ using namespace tvm::te; namespace rocm { /*! * \brief Create a rocm schedule for LRN -* -* \param target The target to generate a schedule for. * \param outs The output tensors. -* * \return A schedule for the given ops. 
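*
* Note: the body below simply forwards to topi::cuda::schedule_lrn, which
* likewise now takes only the output tensors.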
*/ -inline Schedule schedule_lrn(const Target &target, const Array& outs) { - return topi::cuda::schedule_lrn(target, outs); +inline Schedule schedule_lrn(const Array& outs) { + return topi::cuda::schedule_lrn(outs); } } // namespace rocm diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index a0c6ab0c6d2d..f1019e667e81 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -40,6 +40,7 @@ from .broadcast import * from .sort import * from .argwhere import * +from . import generic from . import nn from . import x86 from . import cuda diff --git a/topi/python/topi/argwhere.py b/topi/python/topi/argwhere.py index 32f4e8718c46..c2a9adea0c2a 100644 --- a/topi/python/topi/argwhere.py +++ b/topi/python/topi/argwhere.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks """Argwhere operator""" -import tvm from tvm import hybrid @hybrid.script @@ -164,7 +163,6 @@ def hybrid_argwhere_5d(output_shape, condition): valid_index += 1 return a -@tvm.target.generic_func def argwhere(output_shape, condition): """Find the indices of elements of a tensor that are non-zero. diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py index 517941c1905f..eb05dd839e32 100644 --- a/topi/python/topi/arm_cpu/__init__.py +++ b/topi/python/topi/arm_cpu/__init__.py @@ -14,13 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +# pylint: disable=wildcard-import """Schedule for ARM CPU""" -from . import conv2d -from . import depthwise_conv2d -from . import conv2d_transpose -from . import conv2d_int8 -from . import bitserial_conv2d -from . import bitserial_dense -from . import injective +from .conv2d import * +from .depthwise_conv2d import * +from .conv2d_transpose import * +from .conv2d_int8 import * +from . import conv2d_alter_op +from .bitserial_conv2d import * +from .bitserial_dense import * +from .injective import * diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index 4de2b1438a92..d28ec09925c2 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -22,11 +22,10 @@ from tvm import relay from .. import tag from ..nn.pad import pad -from ..nn.bitserial_conv2d import bitserial_conv2d_nhwc, bitserial_conv2d_legalize +from ..nn.bitserial_conv2d import bitserial_conv2d_legalize from ..nn.bitserial_util import bitpack, binary_op_multiplier from ..nn.util import get_pad_tuple from ..util import get_const_int, get_const_tuple -from .. import generic def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True): if use_bitpack: @@ -38,9 +37,9 @@ def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True): return tvm.compute(kvshape, lambda co, dh, dw, b, vc, ci: \ kernel_q[dh][dw][b][ci][co*VC+vc], name='kernel_vec') -@autotvm.register_topi_compute(bitserial_conv2d_nhwc, 'arm_cpu', 'direct') -def spatial_pack_nhwc(cfg, data, kernel, stride, padding, activation_bits, weight_bits, - pack_dtype, out_dtype, unipolar): +@autotvm.register_topi_compute("bitserial_conv2d_nhwc.arm_cpu") +def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, activation_bits, weight_bits, + pack_dtype, out_dtype, unipolar): """ Compute convolution with pack on spatial axes. 
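
    Registered to AutoTVM under the string key 'bitserial_conv2d_nhwc.arm_cpu';
    the matching schedule below is registered under the same key, replacing the
    old (generic function, target, 'direct') registration triple.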
""" assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" assert pack_dtype == 'uint8', "only support packing into uint8 bits" @@ -323,7 +322,7 @@ def _schedule_spatial_conv2d_nhwc(cfg, s, data_pad, data_vec, kernel_vec, s[last].parallel(oh) return s -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nhwc, 'arm_cpu', 'direct') +@autotvm.register_topi_schedule("bitserial_conv2d_nhwc.arm_cpu") def schedule_bitserial_conv2d_nhwc(cfg, outs): """Arm cpu schedule for bitserial conv2d""" s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/arm_cpu/bitserial_dense.py b/topi/python/topi/arm_cpu/bitserial_dense.py index 8bd6c5d15f8c..3f1889c8d7ff 100644 --- a/topi/python/topi/arm_cpu/bitserial_dense.py +++ b/topi/python/topi/arm_cpu/bitserial_dense.py @@ -21,15 +21,13 @@ from tvm import autotvm from topi.util import get_const_tuple from .. import tag -from .. import generic from .bitserial_conv2d import _intrin_popcount from ..nn.pad import pad -from ..nn.bitserial_dense import bitserial_dense from ..nn.bitserial_util import bitpack, binary_op_multiplier -@autotvm.register_topi_compute(bitserial_dense, ['arm_cpu'], 'direct') -def bitserial_dense_generic(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_dtype, - unipolar): +@autotvm.register_topi_compute('bitserial_dense.arm_cpu') +def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_dtype, + unipolar): """The default implementation of bitserial dense in topi. Parameters @@ -111,7 +109,7 @@ def bitserial_dense_generic(cfg, data, weight, data_bits, weight_bits, pack_dtyp return matmul -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_dense, ['arm_cpu'], 'direct') +@autotvm.register_topi_schedule('bitserial_dense.arm_cpu') def schedule_bitserial_dense(cfg, outs): """Schedule for binary_dense. diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index f0d650adeac1..2144d260c5b1 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -18,20 +18,12 @@ """Conv2D schedule for ARM CPU""" from __future__ import absolute_import as _abs -import logging - import tvm from tvm import autotvm import tvm.contrib.nnpack -from ..generic import schedule_conv2d_nchw, schedule_conv2d_nhwc, \ - schedule_conv2d_winograd_without_weight_transform, \ - schedule_conv2d_winograd_nnpack_without_weight_transform from ..util import traverse_inline, get_const_tuple -from ..nn import dilate, pad, conv2d, conv2d_alter_layout, \ - conv2d_winograd_without_weight_transform, \ - conv2d_winograd_nnpack_without_weight_transform, \ - depthwise_conv2d_nchw +from .. 
import nn from ..nn.util import get_const_int, get_pad_tuple from ..nn.winograd_util import winograd_transform_matrices from .conv2d_spatial_pack import conv2d_spatial_pack_nchw, \ @@ -39,75 +31,17 @@ schedule_conv2d_spatial_pack_nchw, \ schedule_conv2d_spatial_pack_nhwc -logger = logging.getLogger('topi') - -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct']) -def conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """TOPI compute callback for conv2d - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] or - pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height, - filter_width, num_filter_block] - strides : list of two ints - [stride_height, stride_width] +@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") +def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with NCHW layout""" + return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, + dilation, out_dtype, num_tile=2) - padding : list of two ints - [pad_height, pad_width] - - dilation : list of two ints - [dilation_height, dilation_width] - - layout : str - layout of data - - out_dtype: str - The output type. This is used for mixed precision. - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - if layout == 'NCHW': - return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, - dilation, out_dtype, num_tile=2) - elif layout == 'NHWC': - return conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, - dilation, out_dtype) - else: - raise ValueError("Unsupported layout {}".format(layout)) - -@autotvm.register_topi_schedule( - schedule_conv2d_nchw, 'arm_cpu', - ['direct', 'winograd', 'winograd_nnpack_fp16', 'winograd_nnpack_fp32']) -def schedule_conv2d_nchw_arm_cpu(cfg, outs): - """TOPI schedule callback for conv2d - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d. 
- """ +@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.arm_cpu") +def schedule_conv2d_nchw_spatial_pack(cfg, outs): + """Create schedule for conv2d_nchw""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -131,35 +65,20 @@ def _callback(op): schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) - if 'winograd_conv2d_output' in op.tag: - output = op.output(0) - _schedule_winograd(cfg, s, output, outs[0]) - - if 'winograd_nnpack_conv2d_output' in op.tag: - output = op.output(0) - _schedule_winograd_nnpack(cfg, s, output, outs[0]) - traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(schedule_conv2d_nhwc, 'arm_cpu', ['direct']) -def schedule_conv2d_nhwc_arm_cpu(cfg, outs): - """TOPI schedule callback for conv2d - Parameters - ---------- - cfg: ConfigEntity - The config for this template +@autotvm.register_topi_compute("conv2d_nhwc_spatial_pack.arm_cpu") +def conv2d_nhwc_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with NHWC layout""" + return conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, + dilation, out_dtype) - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - Returns - ------- - s: Schedule - The computation schedule for conv2d. - """ +@autotvm.register_topi_schedule("conv2d_nhwc_spatial_pack.arm_cpu") +def schedule_conv2d_nhwc_spatial_pack(cfg, outs): + """Create schedule for conv2d_nhwc""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -170,14 +89,29 @@ def _callback(op): return s -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd']) -def conv2d_arm_cpu_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """ TOPI compute callback. 
Use winograd template """ +@autotvm.register_topi_compute("conv2d_nchw_winograd.arm_cpu") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nchw layout using Winograd with weight transform""" tile_size = 4 - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, + return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size) -def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.arm_cpu") +def schedule_conv2d_nchw_winograd(cfg, outs): + """Create schedule for conv2d_nchw_winograd""" + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'winograd_conv2d_output' in op.tag: + output = op.output(0) + _schedule_winograd(cfg, s, output, outs[0]) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): @@ -187,7 +121,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: - kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: @@ -199,9 +133,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 - data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") + data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") idxd = tvm.indexdiv idxm = tvm.indexmod @@ -272,6 +205,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt cfg.add_flop(2 * N * K * H * W * KH * KW * C) return output + def _schedule_winograd(cfg, s, output, last): Y = output.op.input_tensors[0] M, A = Y.op.input_tensors @@ -356,26 +290,39 @@ def _schedule_winograd(cfg, s, output, last): s[output].compute_inline() -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd_nnpack_fp16']) -def conv2d_arm_cpu_winograd_nnpack_fp16( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """ TOPI compute callback. Use winograd_nnpack_fp16 template """ - return conv2d_arm_cpu_winograd_nnpack( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16) +@autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack.arm_cpu") +def conv2d_nchw_winograd_nnpack(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nchw using nnpack Winograd implementation""" + dtype = data.dtype + if dtype == "float32": + return _conv2d_arm_cpu_winograd_nnpack( + cfg, data, kernel, strides, padding, dilation, out_dtype, + tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8) + elif dtype == "float16": + return _conv2d_arm_cpu_winograd_nnpack( + cfg, data, kernel, strides, padding, dilation, out_dtype, + tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8_FP16) + else: + raise ValueError("Unsupported data type {} for conv2d winograd nnpack". 
+ format(dtype)) + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack.arm_cpu") +def schedule_conv2d_nchw_winograd_nnpack(cfg, outs): + """Create schedule for conv2d_nchw_winograd_nnpack""" + s = tvm.create_schedule([x.op for x in outs]) + def _callback(op): + if 'winograd_nnpack_conv2d_output' in op.tag: + output = op.output(0) + _schedule_winograd_nnpack(cfg, s, output, outs[0]) -@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd_nnpack_fp32']) -def conv2d_arm_cpu_winograd_nnpack_fp32( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - """ TOPI compute callback. Use winograd_nnpack_fp32 template """ - return conv2d_arm_cpu_winograd_nnpack( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - tvm.contrib.nnpack.ConvolutionAlgorithm.WT_8x8) + traverse_inline(s, outs[0].op, _callback) + return s -def conv2d_arm_cpu_winograd_nnpack( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype, convolution_algorithm): +def _conv2d_arm_cpu_winograd_nnpack( + cfg, data, kernel, strides, padding, dilation, out_dtype, convolution_algorithm): """ TOPI compute callback. Use winograd NNPACK template """ N, CI, IH, IW = get_const_tuple(data.shape) @@ -389,7 +336,6 @@ def conv2d_arm_cpu_winograd_nnpack( HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and pt == 1 and pb == 1 and pl == 1 and pr == 1 and HSTR == 1\ and WSTR == 1 H = (IH + pt + pb - 3) // HSTR + 1 @@ -416,6 +362,7 @@ def conv2d_arm_cpu_winograd_nnpack( cfg.add_flop(2 * N * CI * H * W * KH * KW * CO) return output + def _schedule_winograd_nnpack(cfg, s, output, last): # Could have bias. @@ -429,36 +376,10 @@ def _schedule_winograd_nnpack(cfg, s, output, last): s[TK].pragma(s[TK].op.axis[0], 'debug_skip_region') -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'arm_cpu', ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - """TOPI compute callback""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,\ - tile_size) - - -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - 'arm_cpu', ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): - """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if 'winograd_conv2d_output' in op.tag: - output = op.output(0) - _schedule_winograd(cfg, s, output, outs[0]) - - traverse_inline(s, outs[0].op, _callback) - return s - - -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD NNPACK WITHOUT WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_nnpack_without_weight_transform, - 'arm_cpu', - ['winograd_nnpack_fp16', 'winograd_nnpack_fp32']) -def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, - padding, dilation, layout, out_dtype): - """ TOPI compute callback. 
Use winograd NNPACK template """ +@autotvm.register_topi_compute("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") +def conv2d_nchw_winograd_nnpack_without_weight_transform( + cfg, data, transformed_kernel, bias, strides, padding, dilation, out_dtype): + """Compute conv2d_nchw using NNPack winograd without weight transform""" N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -471,7 +392,6 @@ def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, KH, KW = 3, 3 pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and pt == 1 and pb == 1 and pl == 1 and pr == 1 and HSTR == 1\ and WSTR == 1 H = (IH + pt + pb - 3) // HSTR + 1 @@ -492,9 +412,8 @@ def conv2d_winograd_nnpack_ww(cfg, data, transformed_kernel, bias, strides, return output -@autotvm.register_topi_schedule(schedule_conv2d_winograd_nnpack_without_weight_transform, - 'arm_cpu', ['winograd_nnpack_fp16', 'winograd_nnpack_fp32']) -def schedule_conv2d_winograd_nnpack_without_weight_transform_(cfg, outs): +@autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") +def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs): """TOPI schedule callback""" s = tvm.create_schedule([x.op for x in outs]) @@ -505,226 +424,3 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s - - -##### REGISTER ALTER OP LAYOUT ##### -@conv2d_alter_layout.register(["arm_cpu"]) -def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): - """Alter op layout for pre-computing kernel transformation - - Parameters - ---------- - attrs : tvm.ir.Attrs - Attributes of current convolution - inputs : tvm.relay.Expr - Grouped input symbols - tinfos : list - Input shape and dtype - F: symbol - The context, can be either relay.op - - Note - ---- - Unlike other TOPI functions, this function operates on both graph level and operator level, - so we have to pass 'F' to make it support our two versions of graph IR, Relay. - """ - copy_inputs = list(inputs) - new_attrs = {k: attrs[k] for k in attrs.keys()} - - if F.__name__ == 'tvm.relay.op': - # Derive channels for frontends (e.g ONNX) that miss "channel" field. 
- new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - - dilation = attrs.get_int_tuple("dilation") - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - groups = attrs.get_int('groups') - data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" - layout = attrs[data_layout_key] - kernel_layout = attrs['kernel_layout'] - out_dtype = attrs["out_dtype"] - if out_dtype in ("same", ""): - out_dtype = tinfos[0].dtype - - if dilation != (1, 1): - logger.warning("Does not support weight pre-transform for dilated convolution.") - return None - - # query config of this workload - data, kernel = tinfos[0:2] - if groups == 1: - workload = autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d) - else: - workload = autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) - - if layout == 'NCHW' and kernel_layout == 'OIHW': - N, CI, H, W = get_const_tuple(data.shape) - CO, _, KH, KW = get_const_tuple(kernel.shape) - elif layout == 'NHWC' and kernel_layout == 'HWIO': - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, _, CO = get_const_tuple(kernel.shape) - # Also modify the workload to pick up because later we convert to NCHW - # layout. - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_kernel = tvm.placeholder((CO, CI, KH, KW), dtype=kernel.dtype) - new_layout = 'NCHW' - workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], conv2d) - elif layout == 'NHWC' and kernel_layout == 'HWOI': - # This is the case for depthwise convolution. - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, CO, M = get_const_tuple(kernel.shape) - # Also modify the workload to pick up because later we convert to NCHW - # layout. 
- new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_kernel = tvm.placeholder((CO, M, KH, KW), dtype=kernel.dtype) - workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) - else: - return None - - idxd = tvm.indexdiv - - if groups == 1: - target = tvm.target.Target.current() - dispatch_ctx = autotvm.DispatchContext.current - cfg = dispatch_ctx.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(target, workload) - if layout == 'NHWC' and kernel_layout == 'HWIO': - new_attrs['data_layout'] = 'NCHW' - new_attrs['kernel_layout'] = 'OIHW' - return F.nn.conv2d(*copy_inputs, **new_attrs) - return None - - if cfg.template_key == 'direct': # pack weight tensor - VC = cfg['tile_co'].size[-1] - new_attrs['kernel_layout'] = 'OIHW%do' % VC - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_attrs[data_layout_key] = 'NCHW' - new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d) - dispatch_ctx.update(target, new_workload, cfg) - - return F.nn.conv2d(*copy_inputs, **new_attrs) - elif cfg.template_key == "winograd": # pre-compute weight transformation in winograd - if "-device=arm_cpu" in target.options: - tile_size = 4 - VC = cfg['tile_k'].size[-1] - elif "-device=bifrost" in target.options: - tile_size = 2 - VC = 0 - else: - from ..mali.conv2d import _pick_tile_size - tile_size = _pick_tile_size(tinfos[0], tinfos[1]) - VC = cfg['tile_bna'].val - - weight = copy_inputs[1] - if kernel_layout != 'OIHW': - weight = F.transpose(weight, axes=(2, 3, 0, 1)) - weight = F.nn.contrib_conv2d_winograd_weight_transform(weight, - tile_size=tile_size) - if VC > 0: - weight = F.reshape(weight, - newshape=(KH + tile_size - 1, - KW + tile_size - 1, - idxd(CO, VC), VC, CI)) - weight = F.transpose(weight, axes=[0, 1, 2, 4, 3]) - new_weight = tvm.placeholder((KH + tile_size - 1, - KW + tile_size -1, - idxd(CO, VC), CI, VC), - kernel.dtype) - else: - weight = F.reshape(weight, - newshape=(KH + tile_size - 1, KW + tile_size - 1, CO, CI)) - new_weight = tvm.placeholder( - (KH + tile_size - 1, KW + tile_size -1, CO, CI), kernel.dtype - ) - - copy_inputs[1] = weight - new_attrs['tile_size'] = tile_size - new_attrs[data_layout_key] = 'NCHW' - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_weight, strides, padding, dilation, - new_attrs[data_layout_key], out_dtype, tile_size], - conv2d_winograd_without_weight_transform) - dispatch_ctx.update(target, new_workload, cfg) - - return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) - elif cfg.template_key in ["winograd_nnpack_fp16", "winograd_nnpack_fp32"]: - # pre-compute winograd_nnpack transform - # for winograd_nnpack_fp16, the the precomputeprune pass must run on device, - # where float16 is supported - weight_dtype = 'float32' - weight = copy_inputs[1] - if kernel_layout != 'OIHW': - weight = F.transpose(weight, axes=(2, 3, 0, 1)) - weight = F.nn.contrib_conv2d_winograd_weight_transform(weight, - tile_size=tile_size) - transformed_kernel = F.nn.contrib_conv2d_winograd_nnpack_weight_transform( - weight, - 
convolution_algorithm=cfg['winograd_nnpack_algorithm'].val, - out_dtype=weight_dtype) - copy_inputs[1] = transformed_kernel - - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_kernel = tvm.placeholder((CO, CI, 8, 8), "float32") - bias = tvm.placeholder((CO, ), "float32") - new_attrs[data_layout_key] = 'NCHW' - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, bias, strides, - padding, dilation, new_attrs[data_layout_key], out_dtype] - if len(copy_inputs) == 3 else - [new_data, new_kernel, strides, - padding, dilation, new_attrs[data_layout_key], out_dtype], - conv2d_winograd_nnpack_without_weight_transform) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_winograd_nnpack_without_weight_transform( - *copy_inputs, **new_attrs) - else: - raise RuntimeError("Unsupported template_key '%s'" % cfg.template_key) - else: - target = tvm.target.Target.current() - dispatch_ctx = autotvm.DispatchContext.current - cfg = dispatch_ctx.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(tvm.target.Target.current(), workload) - if layout == 'NHWC' and kernel_layout == 'HWOI': - new_attrs['data_layout'] = 'NCHW' - new_attrs['kernel_layout'] = 'OIHW' - return F.nn.conv2d(*copy_inputs, **new_attrs) - return None - if cfg.template_key == 'contrib_spatial_pack': - VC = cfg['tile_co'].size[-1] - new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1]) - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI, H, W), dtype=data.dtype) - new_attrs[data_layout_key] = 'NCHW' - if attrs['kernel_layout'] == 'OIHW': - CO, M, KH, KW = get_const_tuple(kernel.shape) - elif attrs['kernel_layout'] == 'HWOI': - KH, KW, CO, M = get_const_tuple(kernel.shape) - else: - raise RuntimeError("Depthwise conv should either have OIHW/HWIO kernel layout") - new_kernel = tvm.placeholder((idxd(CO, VC), M, KH, KW, VC), dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, out_dtype], - depthwise_conv2d_nchw) - dispatch_ctx.update(target, new_workload, cfg) - - return F.nn.conv2d(*copy_inputs, **new_attrs) - else: - # currently we only have contrib_spatial_pack and direct template - # add more schedule templates. - return None diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py new file mode 100644 index 000000000000..bfbf5d6d62b0 --- /dev/null +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
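+#
+# The flow implemented below: Relay's op strategy picks the best conv2d
+# implementation for the given attrs/types via select_implementation, the
+# winning AutoTVM workload is looked up in the dispatch context, and the
+# weights are repacked or pre-transformed to match the template that won.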
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Conv2D alter op and legalize functions for arm cpu""" + +import logging + +import tvm +from tvm import relay +from tvm import autotvm + +from ..nn import conv2d_alter_layout +from ..util import get_const_tuple + + +logger = logging.getLogger('topi') + + +@conv2d_alter_layout.register(["arm_cpu"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None + + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + idxd = tvm.indexdiv + + if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] + + new_attrs['kernel_layout'] = 'OIHW%do' % VC + + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nchw_spatial_pack.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + + if topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu": + assert data_layout == "NHWC" and kernel_layout == "HWIO" + N, H, W, CI = get_const_tuple(data.shape) + KH, KW, _, CO = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] + + new_attrs['kernel_layout'] = 'OHWI%do' % VC + + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), KH, KW, CI, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nhwc_spatial_pack.arm_cpu") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + + if topi_tmpl == "conv2d_nchw_winograd.arm_cpu": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_k'].size[-1] + tile_size = 4 + + weight_expr = inputs[1] + weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform( + weight_expr, tile_size=tile_size) + weight_expr = relay.reshape(weight_expr, + newshape=(KH + tile_size - 1, + KW + tile_size - 1, + idxd(CO, VC), VC, CI)) + weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3]) + + new_attrs['tile_size'] = tile_size + + new_data = data + new_kernel = tvm.placeholder((KH + tile_size - 1, + KW + tile_size -1, + idxd(CO, VC), CI, VC), + kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + 
'conv2d_nchw_winograd.arm_cpu')
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
+            inputs[0], weight_expr, **new_attrs)
+
+    if topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu":
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        N, CI, H, W = get_const_tuple(data.shape)
+        CO, _, KH, KW = get_const_tuple(kernel.shape)
+
+        # pre-compute winograd_nnpack transform
+        # for winograd_nnpack_fp16, the precompute prune pass must run on a device
+        # where float16 is supported
+        weight_dtype = 'float32'
+        weight_expr = inputs[1]
+        transformed_weight = relay.nn.contrib_conv2d_winograd_nnpack_weight_transform(
+            weight_expr,
+            convolution_algorithm=cfg['winograd_nnpack_algorithm'].val,
+            out_dtype=weight_dtype)
+
+        new_data = data
+        new_kernel = tvm.placeholder((CO, CI, 8, 8), "float32")
+
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, None, strides, padding, dilation, out_dtype],
+            "conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu")
+        dispatch_ctx.update(target, new_workload, cfg)
+        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
+            inputs[0], transformed_weight, **new_attrs)
+
+    if topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu":
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        N, CI, H, W = get_const_tuple(data.shape)
+        CO, _, KH, KW = get_const_tuple(kernel.shape)
+        VC = cfg['tile_co'].size[-1]
+
+        new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
+
+        # Store the same config for the altered operator (workload)
+        new_data = data
+        new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, out_dtype],
+            "depthwise_conv2d_nchw_spatial_pack.arm_cpu")
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return relay.nn.conv2d(*inputs, **new_attrs)
+
+    return None
diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py
index 8f43f5c210d4..5d177fe76ab6 100644
--- a/topi/python/topi/arm_cpu/conv2d_int8.py
+++ b/topi/python/topi/arm_cpu/conv2d_int8.py
@@ -19,9 +19,8 @@
 import tvm
 from tvm import autotvm
-from .. import generic, tag
+from .. import tag
 from ..util import get_const_tuple
-from ..nn.conv2d import conv2d_NCHWc_int8
 from ..generic import conv2d as conv2d_generic
 from ..
import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload @@ -42,9 +41,10 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): cfg, wkl, int32_lanes=2, num_int8_elements=4) -@autotvm.register_topi_compute(conv2d_NCHWc_int8, ['arm_cpu'], 'direct') -def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides, - padding, dilation, layout, out_layout, out_dtype): +@autotvm.register_topi_compute("conv2d_NCHWc_int8.arm_cpu") +def conv2d_NCHWc_int8(cfg, data, kernel, strides, + padding, dilation, layout, out_layout, out_dtype): + """Compute conv2d int8 with NCHWc layout""" # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) @@ -68,8 +68,8 @@ def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc_int8, ['arm_cpu'], ['direct']) -def _schedule_conv2d_NCHWc_int8(cfg, outs): +@autotvm.register_topi_schedule("conv2d_NCHWc_int8.arm_cpu") +def schedule_conv2d_NCHWc_int8(cfg, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] @@ -86,7 +86,7 @@ def traverse(op): if 'conv2d_NCHWc_int8' in op.tag: conv_out = op.output(0) - kernel = conv_out.op.input_tensors[1] + kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] \ if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ @@ -95,9 +95,9 @@ def traverse(op): data_pad = data data = data_pad.op.input_tensors[0] - args = [s, cfg, data_vec, conv_out, outs[0]] + args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]] # int8 conv kernel is 7-dim - _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape) + _, _, kh, kw, _, _, _ = get_const_tuple(kernel_vec.shape) dtype = "uint" if data.dtype == "uint8" else "int" if kh == 1 and kw == 1: conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8( diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py index 350a0227ef48..032ac76ff6a2 100644 --- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py +++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py @@ -78,10 +78,12 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, # fallback support if cfg.is_fallback: if num_tile == 2: # arm cpu - ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'conv2d', 'direct') + ref_log = autotvm.tophub.load_reference_log( + 'arm_cpu', 'rk3399', 'conv2d_nchw_spatial_pack.arm_cpu') cfg.fallback_with_reference_log(ref_log) elif num_tile == 3: # mali gpu - ref_log = autotvm.tophub.load_reference_log('mali', 'rk3399', 'conv2d', 'direct') + ref_log = autotvm.tophub.load_reference_log( + 'mali', 'rk3399', 'conv2d_nchw_spatial_pack.mali') cfg.fallback_with_reference_log(ref_log) # ==================================================================== diff --git a/topi/python/topi/arm_cpu/conv2d_transpose.py b/topi/python/topi/arm_cpu/conv2d_transpose.py index 65f1024c88a3..93ff02900f37 100644 --- a/topi/python/topi/arm_cpu/conv2d_transpose.py +++ b/topi/python/topi/arm_cpu/conv2d_transpose.py @@ -21,13 +21,12 @@ import tvm from tvm import autotvm -from ..generic import schedule_conv2d_transpose_nchw -from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple +from ..nn import dilate, pad, get_pad_tuple from ..util import get_const_tuple, traverse_inline from .conv2d_spatial_pack import 
schedule_conv2d_spatial_pack_nchw -@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct") -def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype): +@autotvm.register_topi_compute("conv2d_transpose_nchw.arm_cpu") +def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters @@ -135,8 +134,8 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n # register customized schedule for arm cpu. -@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct") -def schedule_conv2d_transpose_arm(cfg, outs): +@autotvm.register_topi_schedule("conv2d_transpose_nchw.arm_cpu") +def schedule_conv2d_transpose_nchw(cfg, outs): """Schedule conv2d transpose for arm cpu""" s = tvm.create_schedule([x.op for x in outs]) diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py index 207fc712c450..8d668f3e9188 100644 --- a/topi/python/topi/arm_cpu/depthwise_conv2d.py +++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py @@ -20,19 +20,19 @@ import tvm from tvm import autotvm -from ..generic import schedule_depthwise_conv2d_nchw -from ..nn import depthwise_conv2d_nchw, pad +from .. import nn from ..util import traverse_inline, get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -# register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct', - depthwise_conv2d_nchw.fdefault) -# register customized schedule for arm cpu. -@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', - ['direct', 'contrib_spatial_pack']) -def schedule_depthwise_conv2d_nchw_arm(cfg, outs): +@autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu") +def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): + """Compute depthwise_conv2d with NCHW layout""" + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.arm_cpu") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule depthwise conv2d Parameters @@ -65,7 +65,7 @@ def _schedule(cfg, s, data, data_pad, kernel, output): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', 'direct') + 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw.arm_cpu') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### @@ -134,25 +134,12 @@ def _callback(op): data = data_pad.op.input_tensors[0] _schedule(cfg, s, data, data_pad, kernel, output) - if op.tag == 'spatial_depthwise_conv2d_nchw_output': - output = op.output(0) - conv = op.input_tensors[0] - data_vec = conv.op.input_tensors[0] - kernel_vec = conv.op.input_tensors[1] - if kernel_vec.op.name == 'kernel_vec': - kernel = kernel_vec.op.input_tensors[0] - else: - kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - - _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) - traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', ['contrib_spatial_pack']) -def depthwise_conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, out_dtype): + +@autotvm.register_topi_compute("depthwise_conv2d_nchw_spatial_pack.arm_cpu") +def 
depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for depthwise_conv2d nchw Parameters @@ -189,6 +176,30 @@ def depthwise_conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, out_ return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2) +@autotvm.register_topi_schedule("depthwise_conv2d_nchw_spatial_pack.arm_cpu") +def schedule_depthwise_conv2d_nchw_spatial_pack(cfg, outs): + """Create the schedule for depthwise_conv2d_nchw_spatial_pack""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'spatial_depthwise_conv2d_nchw_output': + output = op.output(0) + conv = op.input_tensors[0] + data_vec = conv.op.input_tensors[0] + kernel_vec = conv.op.input_tensors[1] + if kernel_vec.op.name == 'kernel_vec': + kernel = kernel_vec.op.input_tensors[0] + else: + kernel = kernel_vec + if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) + + traverse_inline(s, outs[0].op, _callback) + return s + + def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): out_dtype = out_dtype or data.dtype @@ -220,16 +231,16 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, WPAD = pad_left + pad_right DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: - data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), - name="data_pad") + data_pad = nn.pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), + name="data_pad") else: data_pad = data # fallback support # Currently, Mali schedule doesn't use it like conv2d. if cfg.is_fallback: - ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'depthwise_conv2d_nchw', - 'contrib_spatial_pack') + ref_log = autotvm.tophub.load_reference_log( + 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw_spatial_pack.arm_cpu') cfg.fallback_with_reference_log(ref_log) # ==================== define configuration space ==================== diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py index 0b6a16d37d1a..644a7e3fb523 100644 --- a/topi/python/topi/arm_cpu/injective.py +++ b/topi/python/topi/arm_cpu/injective.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" import tvm -from .. import generic from ..util import is_empty_shape -@generic.schedule_injective_from_existing.register(["arm_cpu"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -46,7 +44,6 @@ def schedule_injective_from_existing(sch, out): sch[out].parallel(sch[out].op.axis[0]) return sch -@generic.schedule_injective.register(["arm_cpu"]) def schedule_injective(outs): """ARM CPU schedule for injective op. @@ -74,7 +71,6 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@generic.schedule_concatenate.register(["arm_cpu"]) def schedule_concatenate(outs): """Schedule for concatenate op. diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index 2ae65800e925..816024ebdb25 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -15,27 +15,25 @@ # specific language governing permissions and limitations # under the License. 
-# pylint: disable=invalid-name,unused-variable,unused-argument +# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali (Bifrost) GPU""" import tvm +from tvm import relay from tvm import autotvm from .gemm import decl_winograd_gemm, schedule_gemm from .transforms import tile_and_bind, tile_and_bind3d -from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform from ..util import traverse_inline, get_const_int, get_const_tuple -from ..nn import conv2d, conv2d_winograd_without_weight_transform, \ - get_pad_tuple, pad, conv2d_alter_layout, dilate +from .. import nn from ..nn.winograd_util import winograd_transform_matrices # reuse some compute declarations from ARM CPU from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw -from ..arm_cpu.conv2d import _alter_conv2d_layout_arm -@autotvm.register_topi_compute(conv2d, 'bifrost', ['direct']) -def conv2d_bifrost(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.bifrost") +def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for conv2d Parameters @@ -60,9 +58,6 @@ def conv2d_bifrost(cfg, data, kernel, strides, padding, dilation, layout, out_dt dilation : list of two ints [dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. @@ -71,14 +66,12 @@ def conv2d_bifrost(cfg, data, kernel, strides, padding, dilation, layout, out_dt output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - if layout == 'NCHW': - return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, - dilation, out_dtype, num_tile=3) - raise ValueError("Unsupported layout {}".format(layout)) + return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, + dilation, out_dtype, num_tile=3) -@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'bifrost', ['direct', 'winograd']) -def schedule_conv2d_nchw_bifrost(cfg, outs): +@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.bifrost") +def schedule_conv2d_nchw_spatial_pack(cfg, outs): """TOPI schedule callback for conv2d Parameters @@ -116,9 +109,6 @@ def _callback(op): _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec) - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) - traverse_inline(s, outs[0].op, _callback) return s @@ -195,10 +185,22 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): return s -@autotvm.register_topi_compute(conv2d, 'bifrost', ['winograd']) -def conv2d_bifrost_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +@autotvm.register_topi_compute("conv2d_nchw_winograd.bifrost") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): """Use Winograd as the convolution method""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) + return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.bifrost") +def schedule_conv2d_nchw_winograd(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'winograd_conv2d_output' in op.tag: + _schedule_winograd(cfg, s, op) + + traverse_inline(s, outs[0].op, _callback) + return s def _decl_winograd_kernel_transform(kernel, tile_size, G): @@ -256,7 +258,7 @@ def 
upround(x, align): return U -def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size=2): +def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size=2): """Declare a winograd convolution - only tile_size=2 is currently supported""" N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): @@ -266,7 +268,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt if int(kernel.shape[2]) == 3: if dilation_h != 1 or dilation_w != 1: - kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: @@ -275,11 +277,10 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt H_CAT, W_CAT, CO, CI = get_const_tuple(kernel.shape) KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 - data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") + data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") r = KW m = tile_size @@ -454,31 +455,78 @@ def _schedule_winograd(cfg, s, op): tile_and_bind3d(s, output, k, h, w, 1, 2, 2) -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'bifrost', ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - """TOPI compute callback""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) +##### REGISTER ALTER OP LAYOUT ##### +@nn.conv2d_alter_layout.register("bifrost") +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. 
+        return None
+    cfg = dispatch_ctx.query(target, workload)
+    if cfg.is_fallback:  # if is fallback, clear query cache and return None
+        autotvm.task.clear_fallback_cache(target, workload)
+        return None
 
-@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
-                                'bifrost', ['winograd'])
-def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
-    """TOPI schedule callback"""
-    s = tvm.create_schedule([x.op for x in outs])
+    topi_tmpl = workload[0]
+    new_attrs = {k: attrs[k] for k in attrs.keys()}
 
-    def _callback(op):
-        if 'winograd_conv2d_output' in op.tag:
-            _schedule_winograd(cfg, s, op)
+    strides = attrs.get_int_tuple("strides")
+    padding = attrs.get_int_tuple("padding")
+    dilation = attrs.get_int_tuple("dilation")
+    data_layout = attrs["data_layout"]
+    kernel_layout = attrs["kernel_layout"]
+    data, kernel = tinfos
+    out_dtype = out_type.dtype
 
-    traverse_inline(s, outs[0].op, _callback)
-    return s
+    idxd = tvm.indexdiv
 
+    if topi_tmpl == "conv2d_nchw_spatial_pack.bifrost":
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        N, CI, H, W = get_const_tuple(data.shape)
+        CO, _, KH, KW = get_const_tuple(kernel.shape)
+        VC = cfg['tile_co'].size[-1]
 
-##### REGISTER ALTER OP LAYOUT #####
-@conv2d_alter_layout.register(["bifrost"])
-def _alter_conv2d_layout(attrs, inputs, tinfos, F):
-    try:
-        return _alter_conv2d_layout_arm(attrs, inputs, tinfos, F)
-    except KeyError:  # to filter out fallback opencl templates
-        return None
+        new_attrs['kernel_layout'] = 'OIHW%do' % VC
+
+        new_data = data
+        new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, out_dtype],
+            "conv2d_nchw_spatial_pack.bifrost")
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return relay.nn.conv2d(*inputs, **new_attrs)
+
+    if topi_tmpl == "conv2d_nchw_winograd.bifrost":
+        assert data_layout == "NCHW" and kernel_layout == "OIHW"
+        N, CI, H, W = get_const_tuple(data.shape)
+        CO, _, KH, KW = get_const_tuple(kernel.shape)
+        tile_size = 2
+
+        weight_expr = inputs[1]
+        weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform(
+            weight_expr, tile_size=tile_size)
+        weight_expr = relay.reshape(
+            weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, CO, CI))
+
+        new_attrs['tile_size'] = tile_size
+
+        new_data = data
+        new_kernel = tvm.placeholder(
+            (KH + tile_size - 1, KW + tile_size - 1, CO, CI), kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, out_dtype],
+            'conv2d_nchw_winograd.bifrost')
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return relay.nn.contrib_conv2d_winograd_without_weight_transform(
+            inputs[0], weight_expr, **new_attrs)
+
+    return None
diff --git a/topi/python/topi/bifrost/dense.py b/topi/python/topi/bifrost/dense.py
index 114168f27514..2a85db753226 100644
--- a/topi/python/topi/bifrost/dense.py
+++ b/topi/python/topi/bifrost/dense.py
@@ -15,19 +15,22 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name,unused-variable
-"""dense schedule on ARM Mali GPU"""
+"""dense schedule on ARM Mali Bifrost GPU"""
 from __future__ import absolute_import as _abs
 
 import tvm
 from tvm import autotvm
 
-from .. import generic, nn
+from .. import nn
 from ..util import traverse_inline
 
 
-autotvm.register_topi_compute(nn.dense, 'bifrost', 'direct', nn.dense.fdefault)
+@autotvm.register_topi_compute('dense.bifrost')
+def dense(_, data, weight, bias=None, out_dtype=None):
+    """Dense operator on Bifrost"""
+    return nn.dense(data, weight, bias, out_dtype)
 
-@autotvm.register_topi_schedule(generic.schedule_dense, 'bifrost', 'direct')
+@autotvm.register_topi_schedule('dense.bifrost')
 def schedule_dense(cfg, outs):
     """Schedule for dense operator.
 
@@ -52,11 +55,11 @@ def _callback(op):
             vec_size = [1, 2, 4, 8, 16]
             max_unroll = 32
 
-            dense = op.output(0)
+            dense_out = op.output(0)
             output = outs[0]
 
             y, x = s[output].op.axis
-            c = s[dense].op.reduce_axis[0]
+            c = s[dense_out].op.reduce_axis[0]
 
             ##### space definition begin #####
             cfg.define_split('tile_y', y, num_outputs=3)
@@ -66,12 +69,12 @@ def _callback(op):
             # fallback support
             if cfg.is_fallback:
                 ref_log = autotvm.tophub.load_reference_log(
-                    'mali', 'rk3399', 'dense', 'direct')
+                    'mali', 'rk3399', 'dense.bifrost')
                 cfg.fallback_with_reference_log(ref_log)
             ##### space definition end #####
 
-            if dense.op in s.outputs:
-                dense = s.cache_write(output, 'local')
+            if dense_out.op in s.outputs:
+                dense_out = s.cache_write(output, 'local')
 
             by, ty, yi = cfg['tile_y'].apply(s, output, y)
             bx, tx, xi = cfg['tile_x'].apply(s, output, x)
@@ -85,17 +88,17 @@ def _callback(op):
                 s[output].unroll(yi)
             if cfg['tile_x'].size[-1] in vec_size:
                 s[output].vectorize(xi)
-            s[dense].compute_at(s[output], tx)
+            s[dense_out].compute_at(s[output], tx)
 
-            k = s[dense].op.reduce_axis[0]
-            y, x = s[dense].op.axis
-            k, k_unroll = cfg['c_unroll'].apply(s, dense, k)
-            s[dense].reorder(k, k_unroll, y, x)
-            s[dense].unroll(k_unroll)
+            k = s[dense_out].op.reduce_axis[0]
+            y, x = s[dense_out].op.axis
+            k, k_unroll = cfg['c_unroll'].apply(s, dense_out, k)
+            s[dense_out].reorder(k, k_unroll, y, x)
+            s[dense_out].unroll(k_unroll)
             if cfg['tile_y'].size[-1] < max_unroll:
-                s[dense].unroll(y)
+                s[dense_out].unroll(y)
             if cfg['tile_x'].size[-1] in vec_size:
-                s[dense].vectorize(x)
+                s[dense_out].vectorize(x)
 
     traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/topi/python/topi/bifrost/depthwise_conv2d.py b/topi/python/topi/bifrost/depthwise_conv2d.py
index 305abee0bcd9..4f7b0db7f95f 100644
--- a/topi/python/topi/bifrost/depthwise_conv2d.py
+++ b/topi/python/topi/bifrost/depthwise_conv2d.py
@@ -21,11 +21,9 @@
 from __future__ import absolute_import as _abs
 
 import tvm
-from .. import generic
 from .. import util
 from .. import tag
 
-@generic.schedule_depthwise_conv2d_nchw.register(["bifrost"])
 def schedule_depthwise_conv2d_nchw(outs):
     """Schedule for depthwise_conv2d nchw forward.
 
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 4c20dd0075d6..6e38318a0062 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -19,23 +19,27 @@
 """CUDA specific declaration and schedules."""
 from __future__ import absolute_import as _abs
 
-from . import conv1d, conv2d, depthwise_conv2d, conv2d_transpose_nchw, \
-    deformable_conv2d, group_conv2d_nchw, dense, conv1d_transpose_ncw
-from . 
import conv3d -from .conv2d_hwcn import schedule_conv2d_hwcn -from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc -from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc -from .group_conv2d_nchw import schedule_conv2d_nchw_cuda +from .conv1d import * +from .conv1d_transpose_ncw import * +from .conv2d import * +from .conv2d_hwcn import * +from .conv2d_int8 import * +from .conv2d_winograd import * +from .depthwise_conv2d import * +from .group_conv2d_nchw import * +from . import conv2d_alter_op +from .conv2d_transpose_nchw import * +from .deformable_conv2d import * +from .conv3d import * from .reduction import schedule_reduce from .softmax import schedule_softmax from .injective import schedule_injective, schedule_elemwise, schedule_broadcast -from .dense import schedule_dense -from .pooling import schedule_pool, schedule_adaptive_pool +from .dense import * +from .pooling import * from .nn import schedule_lrn -from .batch_matmul import schedule_batch_matmul +from .batch_matmul import * from .vision import * -from . import ssd from .ssd import * -from .nms import * +from .nms import get_valid_counts, non_max_suppression from .rcnn import * from .sort import * diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py index 24fc2a17aa18..e293c7ad41e8 100644 --- a/topi/python/topi/cuda/batch_matmul.py +++ b/topi/python/topi/cuda/batch_matmul.py @@ -19,34 +19,8 @@ from __future__ import absolute_import as _abs import tvm from tvm.contrib import cublas -from topi.nn import batch_matmul, batch_matmul_default -from .. import generic from ..util import traverse_inline, get_const_tuple, get_max_power2_factor -@batch_matmul.register(["cuda", "gpu"]) -def batch_matmul_cuda(x, y): - """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are - data in batch. - - Parameters - ---------- - x : tvm.Tensor - 3-D with shape [batch, M, K] - - y : tvm.Tensor - 3-D with shape [batch, N, K] - - Returns - ------- - output : tvm.Tensor - 3-D with shape [batch, M, N] - """ - target = tvm.target.Target.current() - if target.target_name == "cuda" and "cublas" in target.libs: - return cublas.batch_matmul(x, y, False, True) - return batch_matmul_default(x, y) - -@generic.schedule_batch_matmul.register(["cuda", "gpu"]) def schedule_batch_matmul(outs): """Schedule for batch_matmul @@ -61,10 +35,6 @@ def schedule_batch_matmul(outs): s: Schedule The computation schedule for the op. """ - target = tvm.target.Target.current() - if target.target_name == "cuda" and "cublas" in target.libs: - return generic.schedule_extern(outs) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -134,3 +104,22 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s + +def batch_matmul_cublas(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + x : tvm.Tensor + 3-D with shape [batch, M, K] + + y : tvm.Tensor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ + return cublas.batch_matmul(x, y, False, True) diff --git a/topi/python/topi/cuda/conv1d.py b/topi/python/topi/cuda/conv1d.py index 43754a31df48..56918e2bbba2 100644 --- a/topi/python/topi/cuda/conv1d.py +++ b/topi/python/topi/cuda/conv1d.py @@ -19,67 +19,22 @@ import tvm from tvm import autotvm -from .. import nn, generic +from .. 
import nn from ..util import traverse_inline, get_const_tuple -@autotvm.register_topi_compute(nn.conv1d, ['cuda', 'gpu'], ['direct']) -def conv1d_cuda(cfg, - data, - kernel, - strides, - padding, - dilation, - layout='NCW', - out_dtype='float32'): - """ 1D convolution forward operator for cuda backend. +@autotvm.register_topi_compute("conv1d_ncw.cuda") +def conv1d_ncw(cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype='float32'): + return nn.conv1d_ncw(data, kernel, strides, padding, dilation, out_dtype) - Parameters - ---------- - cfg : ConfigEntity - The config for this template - - data : tvm.Tensor - 3-D input shape [batch, in_channel, in_width] for layout == 'NCW' - and [batch, in_width, in_channel] for layout == 'NWC' - - kernel : tvm.Tensor - 3-D kernel with shape [num_filter, in_channel, filter_size] for layout == 'NCW' - and [filter_size, in_channel, num_filter] for layout == 'NWC' - - strides : int or tuple - The spatial stride along width - padding : int or str - Padding size, or ['VALID', 'SAME'] - - dilation : int or tuple - Dilation rate if convolution should be dilated. - - layout : str - How input data is laid out, must be one of ['NCW', 'NWC'] - - out_dtype : str - The output data type. If None then output is same type as input. - """ - if out_dtype is None: - out_dtype = data.dtype - if isinstance(strides, (tuple, list)): - strides = strides[0] - if isinstance(dilation, (tuple, list)): - dilation = dilation[0] - - if layout == 'NCW': - return nn.conv1d_ncw(data, kernel, strides, padding, dilation, - out_dtype) - if layout == 'NWC': - return nn.conv1d_nwc(data, kernel, strides, padding, dilation, - out_dtype) - raise ValueError("This layout is not yet supported: {}".format(layout)) - - -@autotvm.register_topi_schedule(generic.schedule_conv1d_ncw, ["cuda", "gpu"], - ["direct"]) +@autotvm.register_topi_schedule("conv1d_ncw.cuda") def schedule_conv1d_ncw(cfg, outs): """TOPI schedule callback of conv1d ncw for cuda gpu @@ -193,8 +148,18 @@ def _callback(op): return s -@autotvm.register_topi_schedule(generic.schedule_conv1d_nwc, ["cuda", "gpu"], - ["direct"]) +@autotvm.register_topi_compute("conv1d_nwc.cuda") +def conv1d_nwc(cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype='float32'): + return nn.conv1d_nwc(data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv1d_nwc.cuda") def schedule_conv1d_nwc(cfg, outs): """TOPI schedule callback of conv1d nwc for cuda gpu diff --git a/topi/python/topi/cuda/conv1d_transpose_ncw.py b/topi/python/topi/cuda/conv1d_transpose_ncw.py index 4cedbd529f02..4802a0d144a3 100644 --- a/topi/python/topi/cuda/conv1d_transpose_ncw.py +++ b/topi/python/topi/cuda/conv1d_transpose_ncw.py @@ -19,11 +19,11 @@ import tvm from tvm import autotvm -from .. import nn, generic +from .. import nn from ..util import get_const_tuple, traverse_inline -@autotvm.task.register_topi_compute(nn.conv1d_transpose_ncw, ['cuda', 'gpu'], "direct") -def conv1d_transpose_ncw_cuda(cfg, data, kernel, stride, padding, out_dtype): +@autotvm.task.register_topi_compute("conv1d_transpose_nchw.cuda") +def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype): """Transposed 1D convolution ncw forward operator. 
Parameters @@ -79,9 +79,8 @@ def conv1d_transpose_ncw_cuda(cfg, data, kernel, stride, padding, out_dtype): return data_out -@autotvm.task.register_topi_schedule(generic.schedule_conv1d_transpose_ncw, - ['cuda', 'gpu'], 'direct') -def schedule_conv1d_transpose_ncw_cuda(cfg, outs): +@autotvm.task.register_topi_schedule("conv1d_transpose_nchw.cuda") +def schedule_conv1d_transpose_ncw(cfg, outs): """TOPI Schedule callback for conv1d_transpose operator. Parameters diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index f26069cfc3f0..e1ada325ea63 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Compute definition for conv2d with cuda backend""" import tvm from tvm import autotvm @@ -23,179 +23,95 @@ from .. import nn, generic from ..nn.util import get_pad_tuple from ..util import get_const_tuple, traverse_inline - from .conv2d_direct import schedule_direct_cuda -from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda -from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8 - - -@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8']) -def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): - """Conv2D operator for cuda backend. - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - data : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] or - 5-D with shape [batch, ic_chunk, in_height, in_width, ic_block] - - kernel : tvm.Tensor - 4-D with shape [num_filter, in_channel, filter_height, filter_width] or - 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, - filter_width, num_filter_block, in_channel_block] - - strides : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - layout : str - layout of data +@autotvm.register_topi_compute("conv2d_nchw.cuda") +def conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + """Compute conv2d with NCHW layout""" + return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - out_dtype: str - The output type. This is used for mixed precision. 
- - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - target = tvm.target.Target.current() - - if "cudnn" in target.libs: - if layout == 'NCHW': - tensor_format = 0 # CUDNN_TENSOR_NCHW - N, _, H, W = get_const_tuple(data.shape) - elif layout == 'NHWC': - tensor_format = 1 # CUDNN_TENSOR_NHWC - N, H, W, _ = get_const_tuple(data.shape) - else: - raise ValueError("Unsupported layout %s in cudnn" % layout) - CO, CI, KH, KW = get_const_tuple(kernel.shape) - - # handle dilation - stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides - dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation - - if isinstance(padding, (list, tuple)) and len(padding) == 4 and \ - (padding[0] != padding[2] or padding[1] != padding[3]): - raise ValueError("Cudnn doesn't support asymmetric padding.") - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - OH = (H + pt + pb - KH) // stride_h + 1 - OW = (W + pl + pr - KW) // stride_w + 1 - cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ - ((KW - 1) * dilation_w + 1)) - - if data.dtype == "int8" or kernel.dtype == "int8": - if layout == 'NCHW': - raise ValueError("NCHW layout do not support int8 in cudnn") - dtype = "int32" - else: - dtype = data.dtype - - return cudnn.conv_forward(data, - kernel, - [pt, pl], # cudnn padding pt, pl on both sides of input - [stride_h, stride_w], - [dilation_h, dilation_w], - conv_mode=1, - tensor_format=tensor_format, - algo=-1, # let CUDNN choose the best algo - conv_dtype=dtype) - - if cfg.template_key == 'winograd': - return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - pre_computed=False) - if cfg.template_key == 'int8': - if (data.dtype == 'int8' or data.dtype == 'uint8'): - return conv2d_NCHWc_int8( - cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - - if layout == 'NCHW': - return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - if layout == 'HWCN': - return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - if layout == 'NHWC': - return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"], - ["direct", 'winograd', "int8"]) -def schedule_conv2d_nchw_cuda(cfg, outs): - """TOPI schedule callback of conv2d for cuda gpu - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d. 
- """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) +@autotvm.register_topi_schedule("conv2d_nchw.cuda") +def schedule_conv2d_nchw(cfg, outs): + """Create the schedule for conv2d_nchw""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv2d_nchw': schedule_direct_cuda(cfg, s, op.output(0)) - if op.tag == 'conv2d_nchw_winograd': - schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) - if op.tag == "conv2d_NCHWc_int8": - schedule_conv2d_NCHWc_int8(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(generic.schedule_conv2d_nhwc, ["cuda", "gpu"], - ["direct"]) -def schedule_conv2d_nhwc_cuda(cfg, outs): - """TOPI schedule for CUDA conv2d_nhwc - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d. - """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) - - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) +# TODO(@alexgl-github): It's invalid to call schedule_direct_cuda for NHWC layout +# as it assumes the input layout to be NCHW. Please fix this. +# @autotvm.register_topi_compute("conv2d_nhwc.cuda") +# def conv2d_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): +# return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) +# +# +# @autotvm.register_topi_schedule("conv2d_nhwc.cuda") +# def schedule_conv2d_nhwc(cfg, outs): +# outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs +# s = tvm.create_schedule([x.op for x in outs]) +# +# def _callback(op): +# if op.tag == 'conv2d_nhwc': +# schedule_direct_cuda(cfg, s, op.output(0)) +# +# traverse_inline(s, outs[0].op, _callback) +# return s - def _callback(op): - if op.tag == 'conv2d_nhwc': - schedule_direct_cuda(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s +@autotvm.register_topi_compute("conv2d_cudnn.cuda") +def conv2d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCHW', + out_dtype='float32'): + """Compute conv2d using CuDNN library""" + if layout == 'NCHW': + tensor_format = 0 # CUDNN_TENSOR_NCHW + N, _, H, W = get_const_tuple(data.shape) + elif layout == 'NHWC': + tensor_format = 1 # CUDNN_TENSOR_NHWC + N, H, W, _ = get_const_tuple(data.shape) + else: + raise ValueError("Unsupported layout %s in cudnn" % layout) + CO, CI, KH, KW = get_const_tuple(kernel.shape) + + # handle dilation + stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides + dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation + + if isinstance(padding, (list, tuple)) and len(padding) == 4 and \ + (padding[0] != padding[2] or padding[1] != padding[3]): + raise ValueError("Cudnn doesn't support asymmetric padding.") + pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + OH = (H + pt + pb - KH) // stride_h + 1 + OW = (W + pl + pr - KW) // stride_w + 1 + cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) * \ + ((KW - 1) * dilation_w + 1)) + + if data.dtype == "int8" or kernel.dtype == "int8": + if layout == 'NCHW': + raise ValueError("NCHW layout do not support int8 in 
cudnn") + dtype = "int32" + else: + dtype = data.dtype + + return cudnn.conv_forward(data, + kernel, + [pt, pl], # cudnn padding pt, pl on both sides of input + [stride_h, stride_w], + [dilation_h, dilation_w], + conv_mode=1, + tensor_format=tensor_format, + algo=-1, # let CUDNN choose the best algo + conv_dtype=dtype) + + +@autotvm.register_topi_schedule("conv2d_cudnn.cuda") +def schedule_conv2d_cudnn(cfg, outs): + """Create the schedule for conv2d_cudnn""" + return generic.schedule_extern(outs) diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py new file mode 100644 index 000000000000..f3e4f4c3b3c9 --- /dev/null +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -0,0 +1,136 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Conv2D alter op and legalize functions for cuda backend""" + +import logging +import tvm +from tvm import relay +from tvm import autotvm + +from .. import nn +from ..util import get_const_tuple +from .conv2d_winograd import _infer_tile_size + +logger = logging.getLogger('topi') + +@nn.conv2d_alter_layout.register(["cuda", "gpu"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. 
+ return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None + + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} + + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + groups = attrs.get_int('groups') + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype + + if topi_tmpl == "conv2d_NCHWc_int8.cuda": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + new_layout = 'NCHW4c' + new_attrs["channels"] = CO + new_attrs["data_layout"] = new_layout + new_attrs['out_layout'] = new_layout + new_attrs['kernel_layout'] = 'OIHW4o4i' + ic_block_factor = oc_block_factor = 4 + + # Store the same config for the altered operator (workload) + new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW, \ + oc_block_factor, ic_block_factor), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], + "conv2d_NCHWc_int8.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.conv2d(*inputs, **new_attrs) + + if topi_tmpl == "conv2d_nchw_winograd.cuda": + if dilation != (1, 1): + logger.warning("Does not support weight pre-transform for dilated convolution.") + return None + + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + # pre-compute weight transformation in winograd + tile_size = _infer_tile_size(tinfos[0], tinfos[1]) + + weight = relay.nn.contrib_conv2d_winograd_weight_transform(inputs[1], + tile_size=tile_size) + weight = relay.transpose(weight, axes=[0, 1, 3, 2]) + new_attrs['tile_size'] = tile_size + new_attrs['channels'] = CO + + # Store the same config for the altered operator (workload) + new_data = data + new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_weight, strides, padding, dilation, out_dtype], + "conv2d_nchw_winograd_without_weight_transform.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight, **new_attrs) + + if topi_tmpl == "group_conv2d_NCHWc_int8.cuda": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + + new_layout = 'NCHW4c' + new_attrs["channels"] = CO + new_attrs["data_layout"] = new_layout + new_attrs['out_layout'] = new_layout + new_attrs['kernel_layout'] = 'OIHW4o4i' + ic_block_factor = oc_block_factor = 4 + + # Store the same config for the altered operator (workload) + new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups, + KH, KW, oc_block_factor, ic_block_factor), + dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, groups, 
out_dtype], + "group_conv2d_NCHWc_int8.cuda") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.conv2d(*inputs, **new_attrs) + + return None diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py index b7df88579f49..2fab8cf12253 100644 --- a/topi/python/topi/cuda/conv2d_direct.py +++ b/topi/python/topi/cuda/conv2d_direct.py @@ -43,7 +43,7 @@ def schedule_direct_cuda(cfg, s, conv): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'conv2d', 'direct') + target.target_name, target.model, 'conv2d_nchw.cuda') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index 18a624a67aea..b0925ae93a16 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -14,16 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, too-many-locals, too-many-statements +# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument """Schedule for conv2d_hwcn with auto fusion""" import tvm from tvm import autotvm + from tvm.autotvm.task.space import SplitEntity -from .. import generic, tag +from .. import nn, tag + +@autotvm.register_topi_compute("conv2d_hwcn.cuda") +def conv2d_hwcn(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + """Compute conv2d with HWCN layout on CUDA""" + return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv2d_hwcn, ["cuda", "gpu"], ["direct"]) +@autotvm.register_topi_schedule("conv2d_hwcn.cuda") def schedule_conv2d_hwcn(cfg, outs): """Schedule for conv2d_hwcn and any element-wise operations. diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 580cf96b53e8..53a7bd9fa849 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -23,9 +23,10 @@ from .tensor_intrin import dp4a from ..nn.pad import pad from ..nn.util import get_pad_tuple -from ..util import get_const_tuple +from ..util import get_const_tuple, traverse_inline +@autotvm.register_topi_compute("conv2d_NCHWc_int8.cuda") def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype): """Convolution operator in NCHW[x]c layout for int8. 
@@ -152,8 +153,21 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ _dp4a = dp4a('shared', 'shared', 'local') -def schedule_conv2d_NCHWc_int8(cfg, s, output): +@autotvm.register_topi_schedule("conv2d_NCHWc_int8.cuda") +def schedule_conv2d_NCHWc_int8(cfg, outs): """Schedule conv2d int8 NCHWc template""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'conv2d_NCHWc_int8': + _schedule_conv2d_NCHWc_int8(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_conv2d_NCHWc_int8(cfg, s, output): conv = output.op.input_tensors[0] packed_data, packed_kernel = conv.op.input_tensors diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py index be9f31567bc9..8751800c4517 100644 --- a/topi/python/topi/cuda/conv2d_transpose_nchw.py +++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py @@ -20,12 +20,12 @@ import tvm from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import nn, generic +from .. import nn from ..util import get_const_tuple, traverse_inline -@autotvm.task.register_topi_compute(nn.conv2d_transpose_nchw, ['cuda', 'gpu'], "direct") -def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype): +@autotvm.register_topi_compute("conv2d_transpose_nchw.cuda") +def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters @@ -101,9 +101,8 @@ def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype): return data_out -@autotvm.task.register_topi_schedule(generic.schedule_conv2d_transpose_nchw, - ['cuda', 'gpu'], 'direct') -def schedule_conv2d_transpose_nchw_cuda(cfg, outs): +@autotvm.register_topi_schedule("conv2d_transpose_nchw.cuda") +def schedule_conv2d_transpose_nchw(cfg, outs): """TOPI Schedule callback for conv2d transpose operator. Parameters diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 37307d62357d..6e09be97390c 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -22,9 +22,7 @@ from tvm import autotvm from .. 
import nn -from ..nn import conv2d, group_conv2d_nchw, conv2d_winograd_without_weight_transform from ..util import get_const_int, get_const_tuple, traverse_inline -from ..generic import schedule_conv2d_winograd_without_weight_transform from ..nn.winograd_util import winograd_transform_matrices @@ -37,10 +35,9 @@ def _infer_tile_size(data, kernel): return 4 return 2 -def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed): +def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed): """Compute declaration for winograd""" - assert layout == 'NCHW' - tile_size = _infer_tile_size(data, kernel) N, CI, H, W = get_const_tuple(data.shape) @@ -53,7 +50,7 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dty if not pre_computed: # kernel tensor is raw tensor, do strict check if dilation_h != 1 or dilation_w != 1: - kernel = dilation(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) CO, CI, KH, KW = get_const_tuple(kernel.shape) alpha = KW + tile_size - 1 assert HSTR == 1 and WSTR == 1 and KH == KW @@ -282,161 +279,38 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): return s -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, - ['cuda', 'gpu'], ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - pre_computed=True) - +@autotvm.register_topi_compute("conv2d_nchw_winograd.cuda") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): + return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed=False) -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - ['cuda', 'gpu'], ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_cuda(cfg, outs): - """TOPI schedule callback""" +@autotvm.register_topi_schedule("conv2d_nchw_winograd.cuda") +def schedule_conv2d_nchw_winograd(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if 'conv2d_nchw_winograd' in op.tag: - schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True) + schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False) traverse_inline(s, outs[0].op, _callback) return s -##### REGISTER ALTER OP LAYOUT ##### -@nn.conv2d_alter_layout.register(["cuda", "gpu"]) -def _alter_conv2d_layout(attrs, inputs, tinfos, F): - """Alter op layout for pre-computing kernel transformation - - Parameters - ---------- - attrs : tvm.ir.Attrs - Attributes of current convolution - inputs : tvm.relay.Expr - Grouped input symbols - tinfos : list - Input shape and dtype - F: symbol - The context, can be relay.op - - Note - ---- - Unlike other TOPI functions, this function operates on both graph level and operator level, - so we have to pass 'F' to make it support our two versions of graph IR, Relay. 
- """ - if 'cudnn' in tvm.target.Target.current().libs or 'miopen' in tvm.target.Target.current().libs: - return None - - copy_inputs = list(inputs) - new_attrs = {k: attrs[k] for k in attrs.keys()} - - - new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - - strides = attrs.get_int_tuple("strides") - padding = attrs.get_int_tuple("padding") - dilation = attrs.get_int_tuple("dilation") - groups = attrs.get_int('groups') - data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" - layout = attrs[data_layout_key] - out_dtype = attrs["out_dtype"] - if out_dtype in ("", "same"): - out_dtype = tinfos[0].dtype - - data, kernel = tinfos[0:2] - N, CI, H, W = get_const_tuple(data.shape) - CO, _, KH, KW = get_const_tuple(kernel.shape) +@autotvm.register_topi_compute("conv2d_nchw_winograd_without_weight_transform.cuda") +def conv2d_nchw_winograd_without_weight_transform(cfg, data, kernel, strides, + padding, dilation, out_dtype): + return winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, + pre_computed=True) - dispatch_ctx = autotvm.DispatchContext.current - target = tvm.target.Target.current() - if groups == 1: - # query config of this workload - workload = autotvm.task.args_to_workload( - [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d) - cfg = autotvm.DispatchContext.current.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(target, workload) - return None - - if cfg.template_key == 'direct': - return None - - if cfg.template_key == 'int8': - assert 'cuda' in target.keys - new_layout = 'NCHW4c' - new_attrs[data_layout_key] = new_layout - new_attrs['out_layout'] = new_layout - new_attrs['kernel_layout'] = 'OIHW4o4i' - ic_block_factor = oc_block_factor = 4 - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), - dtype=data.dtype) - new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,\ - oc_block_factor, ic_block_factor), dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], - conv2d - ) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.conv2d(*copy_inputs, **new_attrs) - - if attrs.get_int_tuple("dilation") != (1, 1): - logger.warning("Does not support weight pre-transform for dilated convolution.") - return None - - # pre-compute weight transformation in winograd - tile_size = _infer_tile_size(tinfos[0], tinfos[1]) - - weight = F.nn.contrib_conv2d_winograd_weight_transform(copy_inputs[1], - tile_size=tile_size) - weight = F.transpose(weight, axes=[0, 1, 3, 2]) - copy_inputs[1] = weight - new_attrs['tile_size'] = tile_size - - # Store the same config for the altered operator (workload) - new_data = data - new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), - dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size], - conv2d_winograd_without_weight_transform - ) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) - if groups != CI: - workload = autotvm.task.args_to_workload( - [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype], - group_conv2d_nchw) - cfg = 
autotvm.DispatchContext.current.query(target, workload) - - if cfg.is_fallback: # if is fallback, clear query cache and return None - autotvm.task.clear_fallback_cache(target, workload) - return None - - if cfg.template_key == 'int8': - assert 'cuda' in target.keys - new_layout = 'NCHW4c' - new_attrs[data_layout_key] = new_layout - new_attrs['out_layout'] = new_layout - new_attrs['kernel_layout'] = 'OIHW4o4i' - ic_block_factor = oc_block_factor = 4 - - # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), - dtype=data.dtype) - new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,\ - KH, KW, oc_block_factor, ic_block_factor), - dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, groups, out_dtype], - group_conv2d_nchw - ) - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.conv2d(*copy_inputs, **new_attrs) - - # do nothing for depthwise convolution - return None +@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.cuda") +def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs): + """TOPI schedule callback""" + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'conv2d_nchw_winograd' in op.tag: + schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/cuda/conv3d.py b/topi/python/topi/cuda/conv3d.py index b46f284ef5b7..0a6a71ccc2f0 100644 --- a/topi/python/topi/cuda/conv3d.py +++ b/topi/python/topi/cuda/conv3d.py @@ -14,22 +14,20 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Compute definition for conv3d with cuda backend""" import tvm from tvm import autotvm from tvm.contrib import cudnn from .. import nn, generic -from ..nn.util import get_pad_tuple3d from ..util import get_const_tuple, traverse_inline +from .conv3d_direct import schedule_direct_conv3d_cuda -from .conv3d_direct import schedule_direct_3d_cuda - -@autotvm.register_topi_compute(nn.conv3d, ['cuda', 'gpu'], ['direct']) -def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', out_dtype='float32'): - """Conv3D operator for cuda backend. +@autotvm.register_topi_compute("conv3d_ncdhw.cuda") +def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + """Conv3D operator in NCDHW layout for cuda backend. Parameters ---------- @@ -45,17 +43,12 @@ def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', o strides : int or a list/tuple of three ints stride size, or [stride_depth, stride_height, stride_width] - padding : int or a list/tuple of 3 or 6 ints - padding size, or - [pad_depth, pad_height, pad_width] for 3 ints, or - [pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right] for 6 ints + padding : int or a list/tuple of three ints + padding size, or [pad_depth, pad_height, pad_width] dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. 
@@ -64,52 +57,11 @@ def conv3d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', o output : tvm.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ - target = tvm.target.Target.current() - - if "cudnn" in target.libs: - if layout == 'NCDHW': - tensor_format = 0 # CUDNN_TENSOR_NCHW - N, _, D, H, W = get_const_tuple(data.shape) - elif layout == 'NDHWC': - tensor_format = 1 # CUDNN_TENSOR_NHWC - N, D, H, W, _ = get_const_tuple(data.shape) - else: - raise ValueError("Unsupported layout %s in cudnn" % layout) - CO, CI, KD, KH, KW = get_const_tuple(kernel.shape) - - # handle dilation - stride_d, stride_h, stride_w = (strides, strides, strides) if isinstance(strides, int) \ - else strides - if isinstance(padding, (list, tuple)) and len(padding) > 3: - raise ValueError("Cudnn doesn't support asymmetric padding.") - pf, pt, pl, pk, pb, pr = get_pad_tuple3d(padding, (KD, KH, KW)) - dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation) if \ - isinstance(dilation, int) else dilation - - OD = (D + pf + pk - KD) // stride_d + 1 - OH = (H + pt + pb - KH) // stride_h + 1 - OW = (W + pl + pr - KW) // stride_w + 1 - cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) *\ - ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1)) - - return cudnn.conv_forward(data, - kernel, - [pf, pt, pl], # cudnn padding pt, pl on both sides of input - [stride_d, stride_h, stride_w], - [dilation_d, dilation_h, dilation_w], - conv_mode=1, - tensor_format=tensor_format, - algo=-1, # let CUDNN choose the best algo - conv_dtype=data.dtype) - - if layout == 'NCDHW': - return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) + return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv3d_ncdhw, ["cuda", "gpu"], - ["direct"]) -def schedule_conv3d_ncdhw_cuda(cfg, outs): +@autotvm.register_topi_schedule("conv3d_ncdhw.cuda") +def schedule_conv3d_ncdhw(cfg, outs): """TOPI schedule callback of conv3d for cuda gpu Parameters @@ -126,24 +78,49 @@ def schedule_conv3d_ncdhw_cuda(cfg, outs): s: Schedule The computation schedule for conv2d. """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv3d_ncdhw': - schedule_direct_3d_cuda(cfg, s, op.output(0)) + schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NCDHW", + "conv3d_ncdhw.cuda") traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(generic.schedule_conv3d_ndhwc, ["cuda", "gpu"], - ["direct"]) -def schedule_conv3d_ndhwc_cuda(cfg, outs): +@autotvm.register_topi_compute("conv3d_ndhwc.cuda") +def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): + """Conv3d operator in NDHWC layout for cuda backend. 
+ + Parameters + ---------- + Input : tvm.Tensor + 5-D with shape [batch, in_depth, in_height, in_width, in_channel] + + Filter : tvm.Tensor + 5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter] + + stride : int or a list/tuple of three ints + Stride size, or [stride_depth, stride_height, stride_width] + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + dilation: int or a list/tuple of three ints + dilation size, or [dilation_depth, dilation_height, dilation_width] + + Returns + ------- + Output : tvm.Tensor + 5-D with shape [batch, out_depth, out_height, out_width, out_channel] + """ + return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv3d_ndhwc.cuda") +def schedule_conv3d_ndhwc(cfg, outs): """TOPI schedule callback of conv3d for cuda gpu Parameters @@ -160,16 +137,104 @@ def schedule_conv3d_ndhwc_cuda(cfg, outs): s: Schedule The computation schedule for conv2d. """ - target = tvm.target.Target.current() - if 'cudnn' in target.libs: - return generic.schedule_extern(outs) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv3d_ndhwc': - schedule_direct_3d_cuda(cfg, s, op.output(0)) + schedule_direct_conv3d_cuda(cfg, s, op.output(0), "NDHWC", + "conv3d_ndhwc.cuda") traverse_inline(s, outs[0].op, _callback) return s + + +@autotvm.register_topi_compute("conv3d_cudnn.cuda") +def conv3d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', + out_dtype='float32'): + """Conv3D operator for cuda backend. + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + data : tvm.Tensor + 5-D with shape [batch, in_channel, in_depth, in_height, in_width] + + kernel : tvm.Tensor + 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] + + strides : int or a list/tuple of three ints + stride size, or [stride_depth, stride_height, stride_width] + + padding : int or a list/tuple of three ints + padding size, or [pad_depth, pad_height, pad_width] + + dilation: int or a list/tuple of three ints + dilation size, or [dilation_depth, dilation_height, dilation_width] + + layout : str + layout of data + + out_dtype: str + The output type. This is used for mixed precision. 
+
+    Returns
+    -------
+    output : tvm.Tensor
+        5-D with shape [batch, out_channel, out_depth, out_height, out_width]
+    """
+    if layout == 'NCDHW':
+        tensor_format = 0  # CUDNN_TENSOR_NCHW
+        N, _, D, H, W = get_const_tuple(data.shape)
+    elif layout == 'NDHWC':
+        tensor_format = 1  # CUDNN_TENSOR_NHWC
+        N, D, H, W, _ = get_const_tuple(data.shape)
+    else:
+        raise ValueError("Unsupported layout %s in cudnn" % layout)
+    CO, CI, KD, KH, KW = get_const_tuple(kernel.shape)
+
+    # normalize strides, padding and dilation given either as scalars or triples
+    stride_d, stride_h, stride_w = (strides, strides, strides) if isinstance(strides, int) \
+        else strides
+    pad_d, pad_h, pad_w = (padding, padding, padding) if isinstance(padding, int) else padding
+    dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation) if \
+        isinstance(dilation, int) else dilation
+
+    OD = (D + 2 * pad_d - KD) // stride_d + 1
+    OH = (H + 2 * pad_h - KH) // stride_h + 1
+    OW = (W + 2 * pad_w - KW) // stride_w + 1
+    cfg.add_flop(2 * N * OD * OH * OW * CO * CI * ((KD - 1) * dilation_d + 1) * \
+                 ((KH - 1) * dilation_h + 1) * ((KW - 1) * dilation_w + 1))
+
+    return cudnn.conv_forward(data,
+                              kernel,
+                              [pad_d, pad_h, pad_w],
+                              [stride_d, stride_h, stride_w],
+                              [dilation_d, dilation_h, dilation_w],
+                              conv_mode=1,
+                              tensor_format=tensor_format,
+                              algo=-1,  # let CUDNN choose the best algo
+                              conv_dtype=data.dtype)
+
+
+@autotvm.register_topi_schedule("conv3d_cudnn.cuda")
+def schedule_conv3d_cudnn(_, outs):
+    """TOPI schedule callback of conv3d for cuda gpu
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template (unused by the cuDNN extern schedule)
+
+    outs: Array of Tensor
+        The computation graph description of conv3d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv3d.
+    """
+    return generic.schedule_extern(outs)
diff --git a/topi/python/topi/cuda/conv3d_direct.py b/topi/python/topi/cuda/conv3d_direct.py
index ad48deb27539..fa6c8781b5d3 100644
--- a/topi/python/topi/cuda/conv3d_direct.py
+++ b/topi/python/topi/cuda/conv3d_direct.py
@@ -20,11 +20,16 @@
 from tvm import autotvm
 from ..util import get_const_tuple
 
-def schedule_direct_3d_cuda(cfg, s, conv):
+def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name):
     """schedule optimized for batch size = 1"""
 
     ##### space definition begin #####
-    n, f, d, y, x = s[conv].op.axis
+    if layout == "NCDHW":
+        n, f, d, y, x = s[conv].op.axis
+    elif layout == "NDHWC":
+        n, d, y, x, f = s[conv].op.axis
+    else:
+        raise ValueError("not support this layout {} yet".format(layout))
     rc, rd, ry, rx = s[conv].op.reduce_axis
     cfg.define_split("tile_f", f, num_outputs=4)
     cfg.define_split("tile_d", d, num_outputs=4)
@@ -45,7 +50,7 @@ def schedule_direct_3d_cuda(cfg, s, conv):
     # fallback support
     if cfg.is_fallback:
         ref_log = autotvm.tophub.load_reference_log(
-            target.target_name, target.model, 'conv3d', 'direct')
+            target.target_name, target.model, workload_name)
         cfg.fallback_with_reference_log(ref_log)
     ##### space definition end #####
 
diff --git a/topi/python/topi/cuda/deformable_conv2d.py b/topi/python/topi/cuda/deformable_conv2d.py
index 33a8c9adc1ca..bdec4e120fe4 100644
--- a/topi/python/topi/cuda/deformable_conv2d.py
+++ b/topi/python/topi/cuda/deformable_conv2d.py
@@ -14,20 +14,22 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name +# pylint: disable=invalid-name,unused-argument """Schedule template of deformable conv2d with cuda backend""" import tvm from tvm import autotvm -from .. import nn, generic +from .. import nn from ..util import traverse_inline -autotvm.register_topi_compute(nn.deformable_conv2d_nchw, ["cuda", "gpu"], "direct", - nn.deformable_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("deformable_conv2d_nchw.cuda") +def deformable_conv2d_nchw(cfg, data, offset, kernel, strides, padding, dilation, + deformable_groups, groups, out_dtype): + return nn.deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, + deformable_groups, groups, out_dtype) - -@autotvm.register_topi_schedule(generic.schedule_deformable_conv2d_nchw, ["cuda", "gpu"], "direct") -def schedule_deformable_conv2d_nchw_cuda(cfg, outs): +@autotvm.register_topi_schedule("deformable_conv2d_nchw.cuda") +def schedule_deformable_conv2d_nchw(cfg, outs): """TOPI schedule callback of deformable conv2d for cuda gpu Parameters @@ -49,13 +51,13 @@ def schedule_deformable_conv2d_nchw_cuda(cfg, outs): def _callback(op): if op.tag == 'deformable_conv2d_nchw': - schedule_direct_cuda(cfg, s, op.output(0)) + _schedule_direct_cuda(cfg, s, op.output(0)) traverse_inline(s, outs[0].op, _callback) return s -def schedule_direct_cuda(cfg, s, conv): +def _schedule_direct_cuda(cfg, s, conv): """Schedule template of deformable conv2d""" n, f, y, x = s[conv].op.axis rc, ry, rx = s[conv].op.reduce_axis diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 1a1af703c55c..93797a4b49ba 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-argument """Schedule for dense operator""" from __future__ import absolute_import as _abs import logging @@ -23,111 +23,60 @@ from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cublas from .tensor_intrin import dp4a -from ..nn.dense import dense, dense_default +from .. import nn from .. import tag from .. import generic from ..util import traverse_inline, get_const_tuple logger = logging.getLogger('topi') - -@autotvm.register_topi_compute(dense, ["cuda", "gpu"], "direct") -def dense_cuda(cfg, data, weight, bias=None, out_dtype=None): - """Dense operator for cuda backend. - - Parameters - ---------- - data : tvm.Tensor - 2-D with shape [batch, in_dim] - - weight : tvm.Tensor - 2-D with shape [out_dim, in_dim] - - bias : tvm.Tensor, optional - 1-D with shape [out_dim] - - Returns - ------- - output : tvm.Tensor - 2-D with shape [batch, out_dim] - """ - # pylint: disable=unused-argument +@autotvm.register_topi_compute("dense_cublas.cuda") +def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA with CUBLAS""" assert len(data.shape) == 2 and len(weight.shape) == 2, \ "only support 2-dim dense" if bias is not None: assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype + assert out_dtype == data.dtype, "Mixed precision not supported." 
batch, in_dim = data.shape out_dim, _ = weight.shape - target = tvm.target.Target.current() - if "cublas" in target.libs: - matmul = cublas.matmul(data, weight, False, True, out_dtype) - if bias is not None: - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j], \ - tag=tag.BROADCAST) - return matmul - return dense_default(data, weight, bias, out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_dense, ["cuda", "gpu"], "direct") -def schedule_dense(cfg, outs): - """Schedule for dense operator. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of dense - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for dense. - """ - # pylint: disable=unused-argument - target = tvm.target.Target.current() - - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - if target.target_name == "cuda" and "cublas" in target.libs: - return generic.schedule_extern(outs) + matmul = cublas.matmul(data, weight, False, True) + cfg.add_flop(batch * in_dim * out_dim * 2) + if bias is not None: + matmul = tvm.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j], + tag=tag.BROADCAST) + return matmul - s = tvm.create_schedule([x.op for x in outs]) - def _schedule(C): - A, _ = C.op.input_tensors - batch, _ = get_const_tuple(A.shape) - if batch < 32: - return schedule_dense_small_batch(cfg, s, C) - return schedule_dense_large_batch(cfg, s, C) - - scheduled_ops = [] - - def traverse(OP): - """Internal traverse function""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(OP.tag): - if OP not in s.outputs: - s[OP].compute_inline() - for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) - # schedule dense - elif OP.tag == 'dense': - Dense = OP.output(0) - _schedule(Dense) - else: - raise RuntimeError("Unsupported operator: %s" % OP.tag) +@autotvm.register_topi_schedule("dense_cublas.cuda") +def schedule_dense_cublas(_, outs): + """Schedule dense operator using CUBLAS""" + return generic.schedule_extern(outs) - scheduled_ops.append(OP) - traverse(outs[0].op) - return s +@autotvm.register_topi_compute("dense_small_batch.cuda") +def dense_small_batch(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA""" + return nn.dense(data, weight, bias, out_dtype) -def schedule_dense_small_batch(cfg, s, C): +@autotvm.register_topi_schedule("dense_small_batch.cuda") +def schedule_dense_small_batch(cfg, outs): """Schedule float32/64 dense with small batch size""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'dense': + _schedule_dense_small_batch(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + +def _schedule_dense_small_batch(cfg, s, C): A, _ = C.op.input_tensors _, in_dim = get_const_tuple(A.shape) cfg.define_split('tile_k', in_dim, num_outputs=2) @@ -152,7 +101,28 @@ def schedule_dense_small_batch(cfg, s, C): s[C].set_store_predicate(thread_x.var.equal(0)) s[Out].set_store_predicate(thread_x.var.equal(0)) -def schedule_dense_large_batch(cfg, s, C): + +@autotvm.register_topi_compute("dense_large_batch.cuda") +def dense_large_batch(cfg, data, weight, bias=None, out_dtype=None): + """Dense operator on CUDA""" + return nn.dense(data, weight, bias, out_dtype) + + 
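
Note on the registration style used above: every workload in this patch pairs one compute with one schedule under a shared string key such as "dense_large_batch.cuda", replacing the old generic-function-plus-target-list registration. A minimal pure-Python sketch of the idea (a toy registry with stand-in types, not autotvm's actual implementation):

    # Toy string-keyed registry pairing computes with schedules.
    _COMPUTES, _SCHEDULES = {}, {}

    def register_topi_compute(name):
        def _wrap(fcompute):
            _COMPUTES[name] = fcompute      # e.g. "dense_large_batch.cuda"
            return fcompute
        return _wrap

    def register_topi_schedule(name):
        def _wrap(fschedule):
            _SCHEDULES[name] = fschedule    # the same key pairs the two halves
            return fschedule
        return _wrap

    @register_topi_compute("dense_large_batch.cuda")
    def toy_dense(data, weight):
        return ("dense", data, weight)

    @register_topi_schedule("dense_large_batch.cuda")
    def toy_schedule(outs):
        return ("schedule", outs)

    # a strategy layer can resolve both halves from one workload name
    assert _COMPUTES["dense_large_batch.cuda"] is toy_dense
    assert _SCHEDULES["dense_large_batch.cuda"] is toy_schedule

The same key doubles as the tophub lookup name, which is why the fallback paths in this patch now pass names like "conv3d_ncdhw.cuda" and "depthwise_conv2d_nchw.cuda" to load_reference_log.
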
+@autotvm.register_topi_schedule("dense_large_batch.cuda") +def schedule_dense_large_batch(cfg, outs): + """Schedule float32/64 dense with large batch size""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == 'dense': + _schedule_dense_large_batch(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_dense_large_batch(cfg, s, C): """Schedule float32/64 dense with large batch size""" A, B = C.op.input_tensors batch, in_dim = get_const_tuple(A.shape) @@ -250,7 +220,8 @@ def schedule_dense_large_batch(cfg, s, C): s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) s[BB].double_buffer() -@autotvm.register_topi_compute(dense, ['cuda'], ['int8']) + +@autotvm.register_topi_compute("dense_int8.cuda") def dense_int8(cfg, data, weight, bias=None, out_dtype=None): """Dense operator for int8 on CUDA""" if out_dtype is None: @@ -258,16 +229,6 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - - target = tvm.target.Target.current() - if "cublas" in target.libs: - matmul = cublas.matmul(data, weight, False, True, out_dtype) - if bias is not None: - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ - tag=tag.BROADCAST) - return matmul - k = tvm.reduce_axis((0, in_dim), name='k') matmul = tvm.compute((batch, out_dim), @@ -286,15 +247,11 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): return matmul -@autotvm.register_topi_schedule(generic.schedule_dense, ['cuda', 'gpu'], ['int8']) +@autotvm.register_topi_schedule("dense_int8.cuda") def schedule_dense_int8(cfg, outs): """Dense schedule for int8 on CUDA""" - s = tvm.create_schedule([x.op for x in outs]) - target = tvm.target.Target.current() - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - if "cublas" in target.libs: - return generic.schedule_extern(outs) + s = tvm.create_schedule([x.op for x in outs]) def _callback(op): if "dense_int8" in op.tag: diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py index 05e1117ac2ce..062f95f00eff 100644 --- a/topi/python/topi/cuda/depthwise_conv2d.py +++ b/topi/python/topi/cuda/depthwise_conv2d.py @@ -14,20 +14,22 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Schedule for depthwise_conv2d with auto fusion""" import tvm from tvm import autotvm from ..util import traverse_inline from .. import tag -from .. import generic, nn +from .. 
import nn # register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct', - nn.depthwise_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("depthwise_conv2d_nchw.cuda") +def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute depthwise_conv2d with NCHW layout.""" + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct') -def schedule_depthwise_conv2d_nchw_cuda(cfg, outs): +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.cuda") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule for depthwise_conv2d nchw forward. Parameters @@ -66,7 +68,7 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct') + target.target_name, target.model, 'depthwise_conv2d_nchw.cuda') cfg.fallback_with_reference_log(ref_log) # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround cfg['unroll_explicit'].val = 0 @@ -131,7 +133,6 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s -@generic.schedule_depthwise_conv2d_nhwc.register(["cuda", "gpu"]) def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d nhwc forward. diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index 54e8427daf79..5abf2985273c 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -24,15 +24,163 @@ from ..nn.pad import pad from ..nn.util import get_pad_tuple from ..util import traverse_inline, get_const_tuple, get_const_int -from .. import nn, generic +from .. import nn -autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], 'direct', - nn.group_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("group_conv2d_nchw.cuda") +def group_conv2d_nchw(_, data, kernel, stride, padding, dilation, groups, + out_dtype='float32'): + return nn.group_conv2d_nchw(data, kernel, stride, padding, dilation, groups, out_dtype) -@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['int8']) -def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, - out_dtype='float32'): + +@autotvm.register_topi_schedule("group_conv2d_nchw.cuda") +def schedule_group_conv2d_nchw(cfg, outs): + """TOPI schedule callback of group conv2d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for group conv2d. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == "group_conv2d_nchw": + _schedule_group_conv2d_nchw_direct(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_group_conv2d_nchw_direct(cfg, s, conv): + """Schedule group conv2d NCHW direct template""" + workload = conv.op.attrs["workload"] + groups = get_const_int(workload[6]) + num_filters = get_const_int(conv.shape[1]) + + ##### space definition begin ##### + n, f, y, x = s[conv].op.axis + rc, ry, rx = s[conv].op.reduce_axis + cfg.define_split("tile_n", n, num_outputs=4) + cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2) + cfg.define_split("tile_f", cfg.axis(num_filters // groups), num_outputs=4) + cfg.define_split("tile_y", y, num_outputs=4) + cfg.define_split("tile_x", x, num_outputs=4) + cfg.define_split("tile_rc", rc, num_outputs=2) + cfg.define_split("tile_ry", ry, num_outputs=2) + cfg.define_split("tile_rx", rx, num_outputs=2) + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + + target = tvm.target.Target.current() + if target.target_name in ['nvptx', 'rocm']: + cfg.define_knob("unroll_explicit", [1]) + else: + cfg.define_knob("unroll_explicit", [0, 1]) + + pad_data, kernel = s[conv].op.input_tensors + + s[pad_data].compute_inline() + + if conv.op in s.outputs: + output = conv + OL = s.cache_write(conv, 'local') + else: + output = s.outputs[0].output(0) + s[conv].set_scope('local') + OL = conv + + # create cache stage + AA = s.cache_read(pad_data, 'shared', [OL]) + WW = s.cache_read(kernel, 'shared', [OL]) + + # tile and bind spatial axes + n, f, y, x = s[output].op.axis + kernel_scope, n = s[output].split(n, nparts=1) + + g, f = s[output].split(f, nparts=groups) + bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) + bg, vg = cfg["tile_g"].apply(s, output, g) + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + + s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) + s[output].bind(bn, tvm.thread_axis("blockIdx.z")) + s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) + s[output].bind(vn, tvm.thread_axis("vthread")) + s[output].bind(vg, tvm.thread_axis("vthread")) + s[output].bind(vf, tvm.thread_axis("vthread")) + s[output].bind(vy, tvm.thread_axis("vthread")) + s[output].bind(vx, tvm.thread_axis("vthread")) + + cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf + if cfg["fuse_yx"].val: + s[output].bind(tn, tvm.thread_axis("threadIdx.z")) + s[output].bind(tf, tvm.thread_axis("threadIdx.y")) + tyx = s[output].fuse(ty, tx) + s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) + s[OL].compute_at(s[output], tyx) + + # number of threads + n_tz = cfg["tile_n"].size[2] + n_ty = cfg["tile_f"].size[2] + n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] + else: + s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) + s[output].bind(ty, tvm.thread_axis("threadIdx.y")) + s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[OL].compute_at(s[output], tx) + + # number of threads + n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] + n_ty = cfg["tile_y"].size[2] + n_tx = cfg["tile_x"].size[2] + + # tile reduction axes + n, f, y, x = s[OL].op.axis + rc, ry, rx = s[OL].op.reduce_axis + rco, rci = 
cfg['tile_rc'].apply(s, OL, rc) + ryo, ryi = cfg['tile_rx'].apply(s, OL, ry) + rxo, rxi = cfg['tile_ry'].apply(s, OL, rx) + s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + + # cooperative fetching + for load in [AA, WW]: + n, f, y, x = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + fused, tx = s[load].split(fused, factor=n_tx) + fused, ty = s[load].split(fused, factor=n_ty) + fused, tz = s[load].split(fused, factor=n_tz) + s[load].bind(tz, tvm.thread_axis("threadIdx.z")) + s[load].bind(ty, tvm.thread_axis("threadIdx.y")) + s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + + # unroll + s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) + + N, CO, OH, OW = get_const_tuple(output.shape) + _, CI_div_groups, KH, KW = get_const_tuple(kernel.shape) + cfg.add_flop(2 * N * OH * OW * CO * CI_div_groups * KH * KW) + + +@autotvm.register_topi_compute("group_conv2d_NCHWc_int8.cuda") +def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups, + out_dtype='float32'): """Group convolution operator for 'group_conv2d_NCHWc_int8'. Parameters @@ -154,30 +302,58 @@ def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups, # # Compared with a normal convolution, group convolution only sums # input channels from the group that an output channel resides in. - conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb: - tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, - oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] - .astype('int32') * - packed_kernel[occ, icc, - kh, kw, ocb, icb] - .astype('int32'), - axis=[icc, kh, kw, icb])) + conv = tvm.compute( + oshape, lambda n, occ, oh, ow, ocb: + tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, + oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] + .astype('int32') * + packed_kernel[occ, icc, kh, kw, ocb, icb].astype('int32'), + axis=[icc, kh, kw, icb])) # Type conversion output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype), tag='group_conv2d_NCHWc_int8') num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ - ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups + ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups cfg.add_flop(num_flop) return output +@autotvm.register_topi_schedule("group_conv2d_NCHWc_int8.cuda") +def schedule_group_conv2d_NCHWc_int8(cfg, outs): + """TOPI schedule callback of group conv2d for cuda gpu + + Parameters + ---------- + cfg: ConfigEntity + The config for this template + + outs: Array of Tensor + The computation graph description of conv2d + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for group conv2d. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if op.tag == "group_conv2d_NCHWc_int8": + _schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + _dp4a = dp4a('shared', 'shared', 'local') -def schedule_group_conv2d_NCHWc_int8(cfg, s, output): +def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): """Schedule group conv2d int8 NCHWc template""" workload = output.op.attrs["workload"] groups = get_const_int(workload[6]) @@ -198,7 +374,7 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[packed_kernel].pragma( s[packed_kernel].op.axis[0], "debug_skip_region") else: - if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\ + if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and \ packed_kernel.name == 'packed_kernel': # data and kernel are not pre-computed, schedule layout transform here schedule_injective_from_existing(s, packed_data) @@ -319,151 +495,3 @@ def schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[output].pragma(kernel_scope, 'unroll_explicit', False) return s - - -def schedule_group_conv2d_nchw_direct(cfg, s, conv): - """Schedule group conv2d NCHW direct template""" - workload = conv.op.attrs["workload"] - groups = get_const_int(workload[6]) - num_filters = get_const_int(conv.shape[1]) - - ##### space definition begin ##### - n, f, y, x = s[conv].op.axis - rc, ry, rx = s[conv].op.reduce_axis - cfg.define_split("tile_n", n, num_outputs=4) - cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2) - cfg.define_split("tile_f", cfg.axis(num_filters // groups), num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - cfg.define_split("tile_rc", rc, num_outputs=2) - cfg.define_split("tile_ry", ry, num_outputs=2) - cfg.define_split("tile_rx", rx, num_outputs=2) - cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - - target = tvm.target.Target.current() - if target.target_name in ['nvptx', 'rocm']: - cfg.define_knob("unroll_explicit", [1]) - else: - cfg.define_knob("unroll_explicit", [0, 1]) - - pad_data, kernel = s[conv].op.input_tensors - - s[pad_data].compute_inline() - - if conv.op in s.outputs: - output = conv - OL = s.cache_write(conv, 'local') - else: - output = s.outputs[0].output(0) - s[conv].set_scope('local') - OL = conv - - # create cache stage - AA = s.cache_read(pad_data, 'shared', [OL]) - WW = s.cache_read(kernel, 'shared', [OL]) - - # tile and bind spatial axes - n, f, y, x = s[output].op.axis - kernel_scope, n = s[output].split(n, nparts=1) - - g, f = s[output].split(f, nparts=groups) - bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) - bg, vg = cfg["tile_g"].apply(s, output, g) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - - s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vg, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - - cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf - if 
cfg["fuse_yx"].val: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(tf, tvm.thread_axis("threadIdx.y")) - tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) - s[OL].compute_at(s[output], tyx) - - # number of threads - n_tz = cfg["tile_n"].size[2] - n_ty = cfg["tile_f"].size[2] - n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] - else: - s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) - s[OL].compute_at(s[output], tx) - - # number of threads - n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] - n_ty = cfg["tile_y"].size[2] - n_tx = cfg["tile_x"].size[2] - - # tile reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - rco, rci = cfg['tile_rc'].apply(s, OL, rc) - ryo, ryi = cfg['tile_rx'].apply(s, OL, ry) - rxo, rxi = cfg['tile_ry'].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - - # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - fused, tx = s[load].split(fused, factor=n_tx) - fused, ty = s[load].split(fused, factor=n_ty) - fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) - - # unroll - s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) - s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) - - N, CO, OH, OW = get_const_tuple(output.shape) - _, CI_div_groups, KH, KW = get_const_tuple(kernel.shape) - cfg.add_flop(2 * N * OH * OW * CO * CI_div_groups * KH * KW) - - -@autotvm.register_topi_schedule(generic.schedule_group_conv2d_nchw, - ["cuda", "gpu"], ["int8", "direct"]) -def schedule_conv2d_nchw_cuda(cfg, outs): - """TOPI schedule callback of group conv2d for cuda gpu - - Parameters - ---------- - cfg: ConfigEntity - The config for this template - - outs: Array of Tensor - The computation graph description of conv2d - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for group conv2d. - """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if op.tag == "group_conv2d_NCHWc_int8": - schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0)) - if op.tag == "group_conv2d_nchw": - schedule_group_conv2d_nchw_direct(cfg, s, op.output(0)) - - traverse_inline(s, outs[0].op, _callback) - return s diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py index eb7019bd7654..1690407a1602 100644 --- a/topi/python/topi/cuda/injective.py +++ b/topi/python/topi/cuda/injective.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm -from .. import generic, util -from ..util import is_empty_shape +from .. import util -@generic.schedule_injective_from_existing.register(["cuda", "gpu"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -67,7 +65,6 @@ def schedule_injective_from_existing(sch, out): return sch -@generic.schedule_injective.register(["cuda", "gpu"]) def schedule_injective(outs): """Schedule for injective op. 
@@ -87,7 +84,7 @@ def schedule_injective(outs): tvm.schedule.AutoInlineInjective(s) for out in outs: - if not is_empty_shape(out.shape): + if not util.is_empty_shape(out.shape): schedule_injective_from_existing(s, out) return s diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 5485859de01f..27a52724fb2d 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -22,7 +22,6 @@ from tvm import api from tvm.intrin import if_then_else -from topi.vision import non_max_suppression, get_valid_counts from .sort import argsort from .. import tag @@ -238,8 +237,7 @@ def out_rewrite(data, flag, prefix_sum, valid_count, out): return ib.get() -@get_valid_counts.register(["cuda", "gpu"]) -def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1): +def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -585,11 +583,10 @@ def invalid_to_bottom_ir(data, flag, idx, out): return ib.get() -@non_max_suppression.register(["cuda", "gpu"]) -def non_max_suppression_gpu(data, valid_count, max_output_size=-1, - iou_threshold=0.5, force_suppress=False, top_k=-1, - coord_start=2, score_index=1, id_index=0, - return_indices=True, invalid_to_bottom=False): +def non_max_suppression(data, valid_count, max_output_size=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, + coord_start=2, score_index=1, id_index=0, + return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters diff --git a/topi/python/topi/cuda/nn.py b/topi/python/topi/cuda/nn.py index 327afa87edb5..4460f7b4cd8a 100644 --- a/topi/python/topi/cuda/nn.py +++ b/topi/python/topi/cuda/nn.py @@ -18,11 +18,8 @@ """scheduler functions for cuda backend""" from __future__ import absolute_import as _abs -import tvm -from .. import generic from .. import cpp -@generic.schedule_lrn.register(["cuda"]) def schedule_lrn(outs): """Schedule for LRN @@ -37,6 +34,4 @@ def schedule_lrn(outs): sch: Schedule The computation schedule for the op. """ - target = tvm.target.Target.current(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.cuda.schedule_lrn(cpp_target, outs) + return cpp.cuda.schedule_lrn(outs) diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py index 2bf1e6bb9ef0..2bebd3912378 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -18,12 +18,9 @@ """Schedule for pooling operators""" import tvm from .. import tag -from .. import generic from ..util import traverse_inline - -@generic.schedule_adaptive_pool.register(["cuda", "gpu"]) def schedule_adaptive_pool(outs): """Schedule for adaptive_pool. @@ -89,7 +86,6 @@ def traverse(OP): return s -@generic.schedule_pool.register(["cuda", "gpu"]) def schedule_pool(outs, layout): """Schedule for pool. 
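
As an aside on the cost bookkeeping used by the computes in this patch: conv3d_cudnn earlier derives each output extent as (X + 2 * pad - K) // stride + 1 and records two FLOPs per multiply-accumulate, scaling each kernel extent by its dilation as (K - 1) * dilation + 1. A standalone sanity check of those expressions (illustrative sizes, no TVM required):

    def conv3d_flops(N, CI, CO, D, H, W, KD, KH, KW, stride=1, pad=0, dilation=1):
        OD = (D + 2 * pad - KD) // stride + 1
        OH = (H + 2 * pad - KH) // stride + 1
        OW = (W + 2 * pad - KW) // stride + 1
        kd = (KD - 1) * dilation + 1
        kh = (KH - 1) * dilation + 1
        kw = (KW - 1) * dilation + 1
        # two FLOPs (multiply + add) per MAC, mirroring cfg.add_flop above
        return 2 * N * OD * OH * OW * CO * CI * kd * kh * kw

    # 8-channel 16x16x16 input, 16 filters of 3x3x3, stride 1, pad 1
    assert conv3d_flops(1, 8, 16, 16, 16, 16, 3, 3, 3, pad=1) == \
        2 * 16 ** 3 * 16 * 8 * 27
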
@@ -153,8 +149,7 @@ def traverse(OP): return s -@generic.schedule_pool_grad.register(['cuda', 'gpu']) -def schedule_pool_grad_cuda(outs): +def schedule_pool_grad(outs): """Schedule for pool_grad on CUDA Parameters diff --git a/topi/python/topi/cuda/rcnn/__init__.py b/topi/python/topi/cuda/rcnn/__init__.py index 42b34f0a31e6..da55b070a807 100644 --- a/topi/python/topi/cuda/rcnn/__init__.py +++ b/topi/python/topi/cuda/rcnn/__init__.py @@ -17,4 +17,4 @@ # pylint: disable=wildcard-import """Faster R-CNN and Mask R-CNN operators""" -from .proposal import * +from .proposal import proposal diff --git a/topi/python/topi/cuda/rcnn/proposal.py b/topi/python/topi/cuda/rcnn/proposal.py index 4344226d787e..489c354e6cf3 100644 --- a/topi/python/topi/cuda/rcnn/proposal.py +++ b/topi/python/topi/cuda/rcnn/proposal.py @@ -18,7 +18,7 @@ """Proposal operator""" import math import tvm -from ...vision.rcnn import proposal, generate_anchor, reg_bbox, reg_iou +from ...vision.rcnn import generate_anchor, reg_bbox, reg_iou from ...util import get_const_tuple, get_const_int @@ -308,9 +308,8 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): return body -@proposal.register("cuda") -def proposal_cuda(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, - rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss): +def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, + rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss): """Proposal operator. Parameters diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 69c685cb50b4..0b9d5885375e 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -19,7 +19,6 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic from .injective import schedule_injective_from_existing def _schedule_reduce(op, sch, is_idx_reduce=False): @@ -89,7 +88,6 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): return sch -@generic.schedule_reduce.register(["cuda", "gpu"]) def schedule_reduce(outs): """Schedule for inject->reduce->bcast ops. diff --git a/topi/python/topi/cuda/softmax.py b/topi/python/topi/cuda/softmax.py index 26a1baffa092..afd11ea0e71e 100644 --- a/topi/python/topi/cuda/softmax.py +++ b/topi/python/topi/cuda/softmax.py @@ -17,10 +17,9 @@ # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" import tvm -from .. import generic from .injective import schedule_injective_from_existing -@generic.schedule_softmax.register(["cuda", "gpu"]) + def schedule_softmax(outs): """Schedule for softmax op. diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py index b32cce75362f..88ca9d876abc 100644 --- a/topi/python/topi/cuda/sort.py +++ b/topi/python/topi/cuda/sort.py @@ -19,10 +19,9 @@ import tvm from tvm import api -from ..sort import argsort, topk +from .injective import schedule_injective_from_existing from ..math import identity from ..transform import strided_slice -from .. import generic from .. 
import tag def _schedule_sort(outs): @@ -42,8 +41,7 @@ def _schedule_sort(outs): outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] - # pylint: disable=import-outside-toplevel - from .injective import schedule_injective_from_existing + def traverse(op): if tag.is_injective(op.tag): schedule_injective_from_existing(s, op.output(0)) @@ -239,8 +237,7 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): return ib.get() -@argsort.register(["cuda", "gpu"]) -def argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): +def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): """Performs sorting along the given axis and returns an array of indicies having same shape as an input array that index data in sorted order. @@ -294,7 +291,6 @@ def argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): tag="argsort_gpu")[1] return out -@generic.schedule_argsort.register(["cuda", "gpu"]) def schedule_argsort(outs): """Schedule for argsort operator. @@ -311,8 +307,7 @@ def schedule_argsort(outs): """ return _schedule_sort(outs) -@topk.register(["cuda", "gpu"]) -def topk_gpu(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): +def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): """Get the top k elements in an input tensor along the given axis. Parameters @@ -389,7 +384,6 @@ def topk_gpu(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64" return output -@generic.schedule_topk.register(["cuda", "gpu"]) def schedule_topk(outs): """Schedule for argsort operator. diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 10ba7a1051ea..0b3f50ba0031 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -25,9 +25,6 @@ import topi -from topi.vision.ssd import multibox_prior -from topi.vision.ssd import multibox_detection -from topi.vision.ssd import multibox_transform_loc from ..nms import non_max_suppression @@ -112,9 +109,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): return body -@multibox_prior.register(["cuda", "gpu"]) -def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), - offsets=(0.5, 0.5), clip=False): +def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), + offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. Parameters @@ -346,9 +342,8 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, return ib.get() -@multibox_transform_loc.register(["cuda", "gpu"]) -def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ - threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): +def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, \ + threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters @@ -426,9 +421,8 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ return [out_loc, valid_count] -@multibox_detection.register(["cuda", "gpu"]) -def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, - force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): +def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, + force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): """Convert multibox detection predictions. 
Parameters diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index d456aadf4f5e..8666c22774de 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -18,17 +18,16 @@ """Schedule for vision operators""" from __future__ import absolute_import as _abs import tvm -from .. import generic from .. import cpp from .. import tag from .pooling import schedule_pool +from .injective import schedule_injective_from_existing def _default_schedule(outs): """Default schedule for gpu.""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] - from .injective import schedule_injective_from_existing def traverse(op): if tag.is_broadcast(op.tag) or op.tag in ['bbox_score', 'sorted_bbox']: schedule_injective_from_existing(s, op.output(0)) @@ -39,7 +38,6 @@ def traverse(op): traverse(outs[0].op) return s -@generic.schedule_reorg.register(["cuda", "gpu"]) def schedule_reorg(outs): """Schedule for reorg operator. Parameters @@ -57,7 +55,6 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.cuda.schedule_injective(cpp_target, outs) -@generic.schedule_nms.register(["cuda", "gpu"]) def schedule_nms(outs): """Schedule for non-maximum suppression @@ -74,7 +71,6 @@ def schedule_nms(outs): """ return _default_schedule(outs) -@generic.schedule_multibox_prior.register(["cuda", "gpu"]) def schedule_multibox_prior(outs): """Schedule for multibox_prior operator. @@ -91,7 +87,6 @@ def schedule_multibox_prior(outs): """ return _default_schedule(outs) -@generic.schedule_multibox_transform_loc.register(["cuda", "gpu"]) def schedule_multibox_transform_loc(outs): """Schedule for multibox_transform_loc @@ -109,7 +104,6 @@ def schedule_multibox_transform_loc(outs): """ return _default_schedule(outs) -@generic.schedule_multibox_detection.register(["cuda", "gpu"]) def schedule_multibox_detection(outs): """Schedule for multibox_detection operator. @@ -126,15 +120,12 @@ def schedule_multibox_detection(outs): """ return _default_schedule(outs) -@generic.schedule_roi_align.register(["cuda", "gpu"]) def schedule_roi_align(outs): return schedule_pool(outs, 'NCHW') -@generic.schedule_roi_pool.register(["cuda", "gpu"]) def schedule_roi_pool(outs): return schedule_pool(outs, 'NCHW') -@generic.schedule_proposal.register(["cuda", "gpu"]) def schedule_proposal(outs): """Schedule for proposal operator. @@ -151,7 +142,6 @@ def schedule_proposal(outs): """ return _default_schedule(outs) -@generic.schedule_get_valid_counts.register(["cuda", "gpu"]) def schedule_get_valid_counts(outs): """Schedule for get_valid_counts operator. 
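
The _default_schedule above relies on the traverse-and-dispatch idiom that recurs throughout this patch: walk producers post-order, give broadcast-tagged ops the injective treatment, and let a callback recognize the anchor op by its tag. A self-contained toy version of that walk (a fake Op type, not TVM's tensor ops):

    class Op:
        def __init__(self, tag, inputs=()):
            self.tag, self.inputs = tag, list(inputs)

    def traverse(op, callback, seen=None):
        seen = set() if seen is None else seen
        if id(op) in seen:
            return
        seen.add(id(op))
        for parent in op.inputs:      # visit producers first
            traverse(parent, callback, seen)
        callback(op)                  # then dispatch on the tag

    conv = Op("conv3d_ncdhw")
    relu = Op("broadcast", [conv])

    def _callback(op):
        if op.tag == "conv3d_ncdhw":
            print("apply the direct conv3d template here")

    traverse(relu, _callback)         # prints once, for the conv op
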
diff --git a/topi/python/topi/generic/conv2d.py b/topi/python/topi/generic/conv2d.py index 332c2fdad459..08bb06c6f855 100644 --- a/topi/python/topi/generic/conv2d.py +++ b/topi/python/topi/generic/conv2d.py @@ -19,6 +19,7 @@ """Generic convolution schedules""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..util import get_const_tuple @@ -109,7 +110,8 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) -def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, int32_lanes=16, intrin=None): +def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data_vec, kernel_vec, conv_out, + last, int32_lanes=16, intrin=None): """ Defines the schedule for INT8 for Intel and ARM machines Uses the Intel/ARM intrinsics to use INT8 operations @@ -117,14 +119,39 @@ def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, int32_lane lower-numerical-precision-deep-learning-inference-and-training """ reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val - _, _, _, _, ic_bn = get_const_tuple(data.shape) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) _, _, _, _, oc_bn = get_const_tuple(conv_out.shape) - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, _ = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. 
+ batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) # schedule 5-D NCHW[x]c conv C, O = conv_out, last @@ -173,7 +200,8 @@ def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, int32_lane return s -def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last, int32_lanes=16, intrin=None): +def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data_vec, kernel_vec, conv_out, + last, int32_lanes=16, intrin=None): """ Defines the 1x1 conv schedule for INT8 for Intel and ARM machines Uses the Intel/ARM intrinsics to use INT8 operations @@ -181,15 +209,39 @@ def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last, int32_lanes=1 lower-numerical-precision-deep-learning-inference-and-training """ oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1] - _, _, _, _, ic_bn = get_const_tuple(data.shape) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) _, _, _, _, oc_bn = get_const_tuple(conv_out.shape) - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. + batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) C, O = conv_out, last CC = s.cache_write(C, 'global') diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py index e895385e8b66..977c53763a52 100644 --- a/topi/python/topi/generic/extern.py +++ b/topi/python/topi/generic/extern.py @@ -21,7 +21,6 @@ import tvm from .. import cpp -@tvm.target.generic_func def schedule_extern(outs): """Schedule for an extern op followed by injective operations. 
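
The generic injective hunk below also repairs a latent bug: the old body read sch[out].fuse(s[out].op.axis), referencing an undefined name s and passing the whole axis list as a single argument, while fuse expects the axes unpacked. A pure-Python analogue of why the star matters (a toy fuse, not the schedule primitive):

    def fuse(*axes):
        return "fused(" + ", ".join(str(a) for a in axes) + ")"

    axes = ["i", "j", "k"]
    print(fuse(*axes))  # fused(i, j, k) -- three axis arguments
    print(fuse(axes))   # fused(['i', 'j', 'k']) -- one list argument, no fusion
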
diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 2aff96f9636c..6f1013c06dbd 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -20,7 +20,6 @@ import tvm -@tvm.target.override_native_generic_func("schedule_injective_from_existing") def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -36,10 +35,9 @@ def schedule_injective_from_existing(sch, out): sch: Schedule The updated schedule. """ - sch[out].fuse(s[out].op.axis) + sch[out].fuse(*sch[out].op.axis) return sch -@tvm.target.override_native_generic_func("schedule_injective") def schedule_injective(outs): """Schedule for injective op. @@ -64,22 +62,5 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@tvm.target.generic_func -def schedule_concatenate(outs): - """Schedule for concatenate op. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of reduce in the format - of an array of tensors. - - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - return schedule_injective(outs) - schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 883182941202..ba50a8b88cb4 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -18,7 +18,6 @@ """Generic nn operators""" from __future__ import absolute_import as _abs import tvm -from .. import cpp def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" @@ -34,7 +33,6 @@ def _default_schedule(outs, auto_inline): return s -@tvm.target.generic_func def schedule_conv1d_ncw(outs): """Schedule for conv1d_ncw @@ -52,7 +50,6 @@ def schedule_conv1d_ncw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv1d_nwc(outs): """Schedule for conv1d_nwc @@ -70,7 +67,6 @@ def schedule_conv1d_nwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_hwcn(outs): """Schedule for conv2d_hwcn @@ -88,7 +84,6 @@ def schedule_conv2d_hwcn(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw @@ -106,7 +101,6 @@ def schedule_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_nhwc_pack(outs): """Schedule for conv2d_nhwc_pack @@ -124,7 +118,6 @@ def schedule_conv2d_nhwc_pack(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_nhwc(outs): """Schedule for conv2d_nhwc @@ -142,7 +135,6 @@ def schedule_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_NCHWc(outs): """Schedule for conv2d_NCHW[x]c @@ -161,7 +153,6 @@ def schedule_conv2d_NCHWc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_NCHWc_int8(outs): """Schedule for conv2d_NCHW[x]c_int8 @@ -180,7 +171,6 @@ def schedule_conv2d_NCHWc_int8(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_winograd_weight_transform(outs): """Schedule for weight transformation of winograd @@ -210,7 +200,6 @@ def schedule_conv2d_winograd_weight_transform(outs): return s -@tvm.target.generic_func def schedule_conv2d_winograd_without_weight_transform(outs): """Schedule for winograd without weight transformation @@ -228,7 +217,6 @@ def 
schedule_conv2d_winograd_without_weight_transform(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_winograd_nnpack_weight_transform(outs): """Schedule for weight transformation of winograd Parameters @@ -245,23 +233,7 @@ def schedule_conv2d_winograd_nnpack_weight_transform(outs): s = tvm.create_schedule([x.op for x in outs]) return s -@tvm.target.generic_func -def schedule_conv2d_winograd_nnpack_without_weight_transform(outs): - """Schedule for winograd without weight transformation - Parameters - ---------- - outs: Array of Tensor - The computation graph description of this operator - in the format of an array of tensors. - Returns - ------- - sch: Schedule - The computation schedule for the op. - """ - return _default_schedule(outs, False) - -@tvm.target.generic_func def schedule_conv3d_ncdhw(outs): """Schedule for conv3d_ncdhw @@ -278,7 +250,6 @@ def schedule_conv3d_ncdhw(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv3d_ndhwc(outs): """Schedule for conv3d_ndhwc @@ -295,7 +266,6 @@ def schedule_conv3d_ndhwc(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw @@ -313,7 +283,6 @@ def schedule_conv2d_transpose_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_conv1d_transpose_ncw(outs): """Schedule for conv1d_transpose_ncw @@ -331,7 +300,6 @@ def schedule_conv1d_transpose_ncw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_depthwise_conv2d_nchw(outs): """Schedule for depthwise_conv2d_nchw @@ -349,7 +317,6 @@ def schedule_depthwise_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d_nhwc Parameters @@ -366,7 +333,6 @@ def schedule_depthwise_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_depthwise_conv2d_NCHWc(outs): """Schedule for depthwise_conv2d_NCHWc Parameters @@ -383,7 +349,6 @@ def schedule_depthwise_conv2d_NCHWc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_group_conv2d_nchw(outs): """Schedule for group_conv2d_nchw @@ -401,7 +366,6 @@ def schedule_group_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_deformable_conv2d_nchw(outs): """Schedule for deformable_conv2d_nchw @@ -419,7 +383,6 @@ def schedule_deformable_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_bitserial_conv2d_nchw(outs): """Schedule for bitserial_conv2d_nchw @@ -437,7 +400,6 @@ def schedule_bitserial_conv2d_nchw(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_bitserial_conv2d_nhwc(outs): """Schedule for bitserial_conv2d_nhwc @@ -455,7 +417,6 @@ def schedule_bitserial_conv2d_nhwc(outs): return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_bitserial_dense(outs): """Schedule for bitserial_dense Parameters @@ -471,7 +432,6 @@ def schedule_bitserial_dense(outs): return _default_schedule(outs, False) -@tvm.target.override_native_generic_func("schedule_reduce") def schedule_reduce(outs): """Schedule for reduction @@ -489,7 +449,6 @@ def schedule_reduce(outs): return _default_schedule(outs, True) -@tvm.target.override_native_generic_func("schedule_softmax") def schedule_softmax(outs): """Schedule for softmax @@ -507,7 
+466,6 @@ def schedule_softmax(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.override_native_generic_func("schedule_dense")
 def schedule_dense(outs):
     """Schedule for dense
 
@@ -525,7 +483,6 @@ def schedule_dense(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.override_native_generic_func("schedule_pool")
 def schedule_pool(outs, layout):
     """Schedule for pool
 
@@ -546,7 +503,6 @@ def schedule_pool(outs, layout):
     return _default_schedule(outs, False)
 
 
-@tvm.target.generic_func
 def schedule_pool_grad(outs):
     """Schedule for pool_grad
 
@@ -559,7 +515,6 @@ def schedule_pool_grad(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.override_native_generic_func("schedule_adaptive_pool")
 def schedule_adaptive_pool(outs):
     """Schedule for adaptive pool
 
@@ -577,7 +532,6 @@ def schedule_adaptive_pool(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.override_native_generic_func("schedule_binarize_pack")
 def schedule_binarize_pack(outs):
     """Schedule for binarize_pack
 
@@ -595,7 +549,6 @@ def schedule_binarize_pack(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.override_native_generic_func("schedule_bitpack")
 def schedule_bitpack(outs):
     """Schedule for bitpack
     Parameters
@@ -612,7 +565,6 @@ def schedule_bitpack(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.override_native_generic_func("schedule_binary_dense")
 def schedule_binary_dense(outs):
     """Schedule for binary_dense
 
@@ -630,7 +582,6 @@ def schedule_binary_dense(outs):
     return _default_schedule(outs, False)
 
 
-@tvm.target.generic_func
 def schedule_lrn(outs):
     """Schedule for lrn
 
@@ -645,12 +596,9 @@ def schedule_lrn(outs):
     sch: Schedule
         The computation schedule for the op.
     """
-    target = tvm.target.Target.current(allow_none=False)
-    cpp_target = cpp.TEST_create_target(target.target_name)
-    return cpp.generic.default_schedule(cpp_target, outs, False)
+    return _default_schedule(outs, False)
 
 
-@tvm.target.generic_func
 def schedule_sparse_dense(outs):
     """Schedule for sparse_dense
 
@@ -667,7 +615,7 @@ def schedule_sparse_dense(outs):
     """
     return _default_schedule(outs, False)
 
-@tvm.target.generic_func
+
 def schedule_sparse_transpose(outs):
     """Schedule for sparse_transpose
 
@@ -684,8 +632,19 @@ def schedule_sparse_transpose(outs):
     """
     return _default_schedule(outs, False)
 
-@tvm.target.generic_func
+
 def schedule_batch_matmul(outs):
-    target = tvm.target.Target.current(allow_none=False)
-    cpp_target = cpp.TEST_create_target(target.target_name)
-    return cpp.generic.default_schedule(cpp_target, outs, False)
+    """Schedule for batch_matmul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of batch_matmul
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
diff --git a/topi/python/topi/generic/search.py b/topi/python/topi/generic/search.py
index 41045e492e53..69f236684bb3 100644
--- a/topi/python/topi/generic/search.py
+++ b/topi/python/topi/generic/search.py
@@ -17,10 +17,8 @@
 # pylint: disable=invalid-name, no-member
 """Generic search operators"""
 from __future__ import absolute_import as _abs
-import tvm
 from .vision import _default_schedule
 
-@tvm.target.generic_func
 def schedule_argwhere(outs):
     """Schedule for argwhere operator.
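
For context on what the deleted decorators were doing: tvm.target.generic_func wraps a default implementation and lets each backend register an override, with the winner picked at call time from the active target; this patch drops that indirection in favor of plain per-backend functions selected by the op strategy. A rough pure-Python model of the dispatch (the real mechanism resolves the target from context rather than an explicit argument, and has a native-FFI variant):

    def generic_func(default):
        overrides = {}
        def dispatch(target, *args, **kwargs):
            # fall back to the default implementation for unknown targets
            return overrides.get(target, default)(*args, **kwargs)
        def register(targets):
            def _wrap(fn):
                for t in targets:
                    overrides[t] = fn
                return fn
            return _wrap
        dispatch.register = register
        return dispatch

    @generic_func
    def schedule_dense(outs):
        return "default schedule"

    @schedule_dense.register(["cuda", "gpu"])
    def _schedule_dense_cuda(outs):
        return "cuda schedule"

    assert schedule_dense("cuda", []) == "cuda schedule"
    assert schedule_dense("llvm", []) == "default schedule"
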
diff --git a/topi/python/topi/generic/sort.py b/topi/python/topi/generic/sort.py index 5462f2ce917c..9eca588e5655 100644 --- a/topi/python/topi/generic/sort.py +++ b/topi/python/topi/generic/sort.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name, no-member """Generic vision operators""" from __future__ import absolute_import as _abs -import tvm from .vision import _default_schedule -@tvm.target.generic_func def schedule_argsort(outs): """Schedule for argsort operator. @@ -37,7 +35,6 @@ def schedule_argsort(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_topk(outs): """Schedule for topk operator. diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index 85d9153e6424..d6e80df9b89d 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -33,7 +33,6 @@ def _default_schedule(outs, auto_inline): s[x].fuse(s[x].op.axis) return s -@tvm.target.generic_func def schedule_reorg(outs): """Schedule for reorg @@ -52,7 +51,6 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) -@tvm.target.generic_func def schedule_get_valid_counts(outs): """Schedule for get_valid_counts @@ -69,7 +67,6 @@ def schedule_get_valid_counts(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_nms(outs): """Schedule for non-maximum suppression @@ -86,7 +83,6 @@ def schedule_nms(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_multibox_prior(outs): """Schedule for multibox_prior @@ -103,7 +99,6 @@ def schedule_multibox_prior(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_multibox_transform_loc(outs): """Schedule for multibox_transform_loc @@ -121,7 +116,6 @@ def schedule_multibox_transform_loc(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_multibox_detection(outs): """Schedule for multibox_detection @@ -138,7 +132,6 @@ def schedule_multibox_detection(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_roi_align(outs): """Schedule for roi_align @@ -155,7 +148,6 @@ def schedule_roi_align(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_roi_pool(outs): """Schedule for roi_align @@ -172,7 +164,6 @@ def schedule_roi_pool(outs): """ return _default_schedule(outs, False) -@tvm.target.generic_func def schedule_proposal(outs): """Schedule for proposal operator. diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py index de584287a90e..d4ccf41ed26d 100644 --- a/topi/python/topi/hls/injective.py +++ b/topi/python/topi/hls/injective.py @@ -17,9 +17,7 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm -from .. import generic -@generic.schedule_injective_from_existing.register(["hls"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -40,7 +38,6 @@ def schedule_injective_from_existing(sch, out): sch[out].bind(px, tvm.thread_axis("pipeline")) return sch -@generic.schedule_injective.register(["hls"]) def schedule_injective(outs): """Schedule for injective op. 
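The sort.py and vision.py hunks above, and the hls hunks around this point, remove the same mechanism from the backend side: a target used to attach its implementation with @generic.schedule_X.register(["hls"]). Post-patch these schedules are plain module-level functions, so using one is an ordinary import and call. A small usage sketch, assuming the post-patch topi.hls.injective layout (building for an hls device still needs the toolchain, but constructing the schedule does not):

    import tvm
    from topi.hls.injective import schedule_injective

    A = tvm.placeholder((64,), name="A")
    B = tvm.compute((64,), lambda i: A[i] * 2, name="B")  # an injective (one-to-one) op

    # no target context and no dispatcher involved; per-target selection
    # now happens in relay's op strategy tables instead
    s = schedule_injective([B])
    print(tvm.lower(s, [A, B], simple_mode=True))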
diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py index d73cb9c847f7..06cf3298682d 100644 --- a/topi/python/topi/hls/nn.py +++ b/topi/python/topi/hls/nn.py @@ -19,7 +19,6 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic def _schedule_conv2d(outs): @@ -52,7 +51,6 @@ def traverse(OP): return s -@generic.schedule_conv2d_nchw.register(["hls"]) def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw @@ -70,7 +68,6 @@ def schedule_conv2d_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_conv2d_nhwc.register(["hls"]) def schedule_conv2d_nhwc(outs): """Schedule for conv2d_nhwc @@ -88,7 +85,6 @@ def schedule_conv2d_nhwc(outs): return _schedule_conv2d(outs) -@generic.schedule_conv2d_NCHWc.register(["hls"]) def schedule_conv2d_NCHWc(outs): """Schedule for conv2d_NCHW[x]c @@ -106,7 +102,6 @@ def schedule_conv2d_NCHWc(outs): return _schedule_conv2d(outs) -@generic.schedule_conv2d_transpose_nchw.register(["hls"]) def schedule_conv2d_transpose_nchw(outs): """Schedule for conv2d_transpose_nchw @@ -124,7 +119,6 @@ def schedule_conv2d_transpose_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_depthwise_conv2d_nchw.register(["hls"]) def schedule_depthwise_conv2d_nchw(outs): """Schedule for depthwise_conv2d_nchw @@ -142,7 +136,6 @@ def schedule_depthwise_conv2d_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_depthwise_conv2d_nhwc.register(["hls"]) def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d_nhwc Parameters @@ -158,7 +151,6 @@ def schedule_depthwise_conv2d_nhwc(outs): """ return _schedule_conv2d(outs) -@generic.schedule_bitserial_conv2d_nchw.register(["hls"]) def schedule_bitserial_conv2d_nchw(outs): """Schedule for bitserial_conv2d_nchw @@ -176,7 +168,6 @@ def schedule_bitserial_conv2d_nchw(outs): return _schedule_conv2d(outs) -@generic.schedule_bitserial_conv2d_nhwc.register(["hls"]) def schedule_bitserial_conv2d_nhwc(outs): """Schedule for bitserial_conv2d_nhwc @@ -194,7 +185,6 @@ def schedule_bitserial_conv2d_nhwc(outs): return _schedule_conv2d(outs) -@generic.schedule_reduce.register(["hls"]) def schedule_reduce(outs): """Schedule for reduction @@ -241,7 +231,6 @@ def traverse(OP): return s -@generic.schedule_softmax.register(["hls"]) def schedule_softmax(outs): """Schedule for softmax @@ -286,7 +275,6 @@ def schedule_softmax(outs): return s -@generic.schedule_dense.register(["hls"]) def schedule_dense(outs): """Schedule for dense @@ -330,7 +318,6 @@ def traverse(OP): return s -@generic.schedule_pool.register(["hls"]) def schedule_pool(outs, layout): """Schedule for pool @@ -374,7 +361,6 @@ def traverse(OP): return s -@generic.schedule_adaptive_pool.register(["hls"]) def schedule_adaptive_pool(outs): """Schedule for adaptive_pool diff --git a/topi/python/topi/intel_graphics/__init__.py b/topi/python/topi/intel_graphics/__init__.py index 5223d2d2bbc9..5f82fe758786 100644 --- a/topi/python/topi/intel_graphics/__init__.py +++ b/topi/python/topi/intel_graphics/__init__.py @@ -20,3 +20,5 @@ from __future__ import absolute_import as _abs from .conv2d import * +from . 
import conv2d_alter_op +from .depthwise_conv2d import * diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 65ea590905f9..8993063b16e3 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -20,19 +20,12 @@ from __future__ import absolute_import as _abs import tvm - from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from tvm.autotvm.task.topi_integration import deserialize_args -from tvm.autotvm.task import get_config -from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, conv2d_infer_layout -from ..nn.util import get_pad_tuple -from ..nn.depthwise_conv2d import depthwise_conv2d_nchw -from ..nn import pad -from .. import tag -from .. import generic + +from .. import nn from .. import util -from ..util import simplify, get_const_tuple +from ..util import simplify, get_const_tuple, traverse_inline def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False): @@ -73,17 +66,12 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth cfg["block_ow"] = OtherOptionEntity(block_ow) -def _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout): +def _create_schedule_template(cfg, dshape, kshape, strides, padding, dilation): """Create schedule configuration from input arguments""" - dshape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - if layout == 'NCHW': - n, ic, h, w = dshape - oc, _, kh, kw = kshape - else: - raise ValueError("Not support this layout {} with " - "schedule template.".format(layout)) - pt, pl, pb, pr = get_pad_tuple(padding, kernel) + n, ic, h, w = dshape + oc, _, kh, kw = kshape + + pt, pl, pb, pr = nn.get_pad_tuple(padding, (kh, kw)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) oh = (h - kh + pt + pb) // sh + 1 ow = (w - kw + pl + pr) // sw + 1 @@ -155,112 +143,31 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None s[tensor].bind(xi, thread_x) return xi, thread_z, thread_y, thread_x -# Define template function for autotvm task -# We define schedule template in this function instead of -# declaration function since actual input arguments need -# to be altered by the schedule selected. 
-@autotvm.task.register("topi_intel_graphics_conv2d_NCHWc") -def __topi_nn_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - data, kernel, strides, padding, dilation, layout, dtype = deserialize_args(args) - raw_data_shape = get_const_tuple(data.shape) - raw_kernel_shape = get_const_tuple(kernel.shape) - - # get config here - cfg = get_config() - _create_schedule_template(cfg, data, kernel, strides, padding, dilation, layout) - cfg.add_flop(1) - - # change shape with the value in config - ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] - oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] - - new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, - raw_data_shape[2], raw_data_shape[3], ic_bn) - new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn, - raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - C = _decl_cl_spatialpack_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation, dtype) - s = _schedule_conv2d_NCHWc(cfg, [C]) - - return s, [new_data, new_kernel, C] - -@conv2d_alter_layout.register(["intel_graphics"]) -def _alter_conv2d_layout(attrs, inputs, tinfo, F): - copy_inputs = list(inputs) - new_attrs = {k : attrs[k] for k in attrs.keys()} - - if F.__name__ == 'tvm.relay.op': - # Derive channels for frontends (e.g ONNX) that miss "channel" field. - new_attrs["channels"] = inputs[1].checked_type.shape[attrs['kernel_layout'].index('O')] - - data, kernel = tinfo[0], tinfo[1] - batch_size, in_channel, height, width = get_const_tuple(data.shape) - - groups = attrs.get_int("groups") - out_channel = attrs.get_int("channels") - padding = attrs.get_int_tuple("padding") - strides = attrs.get_int_tuple("strides") - dilation = attrs.get_int_tuple("dilation") - out_dtype = attrs["out_dtype"] - - layout_name = 'data_layout' - layout = attrs[layout_name] - kh, kw = attrs.get_int_tuple("kernel_size") - - dtype = data.dtype - out_dtype = dtype if out_dtype in ("same", "") else out_dtype - is_depthwise = groups == in_channel and groups == out_channel - - # only optimize for NCHW - if layout != 'NCHW': - return None - if groups != 1 and not is_depthwise: - return None - - dispatch_ctx = autotvm.task.DispatchContext.current - target = tvm.target.Target.current() - - # query schedule and fallback if necessary - workload = autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \ - if is_depthwise else \ - autotvm.task.args_to_workload( - [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d) - if is_depthwise: - return None - cfg = dispatch_ctx.query(target, workload) - if cfg.is_fallback: - _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise) - ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] - oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] +def _pack_data(data, kernel, ic_bn, oc_bn): + n, _, ih, iw = get_const_tuple(data.shape) + oc, ic, kh, kw = get_const_tuple(kernel.shape) - new_attrs[layout_name] = 'NCHW%dc' % ic_bn - new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn - new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data.dtype) + data = 
tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - out_channel, _, kh, kw = get_const_tuple(kernel.shape) - # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) - new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + kernel = tvm.compute( + (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), + lambda occ, icc, k_h, k_w, icb, ocb: + kernel[occ * oc_bn + ocb, + icc * ic_bn + icb, k_h, k_w], + name="kernel_vec") - # Store altered operator's config - new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn), - dtype=kernel.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], - new_attrs['out_layout'], out_dtype], conv2d_NCHWc) + return data, kernel - dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) -@autotvm.register_topi_compute(conv2d_NCHWc, 'intel_graphics', 'direct') -def _decl_conv2d(cfg, data, kernel, strides, padding, dilation, - layout, out_layout, out_dtype='float32'): +@autotvm.register_topi_compute("conv2d_NCHWc.intel_graphics") +def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, + out_layout, out_dtype='float32'): """Conv2D operator for Intel Graphics backend. Parameters @@ -285,96 +192,49 @@ def _decl_conv2d(cfg, data, kernel, strides, padding, dilation, output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ + if len(data.shape) == 5: + batch, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) + in_channel = ic_chunk * ic_bn + num_filter = oc_chunk * oc_bn + else: + batch, in_channel, ih, iw = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple( + padding, (kernel_height, kernel_width)) assert (dh, dw) == (1, 1), "Does not support dilation" + if isinstance(strides, (tuple, list)): + stride_h, stride_w = strides + else: + stride_h, stride_w = strides, strides + + data_shape = (batch, in_channel, ih, iw) + kernel_shape = (num_filter, in_channel, kernel_height, kernel_width) + _create_schedule_template(cfg, data_shape, kernel_shape, strides, padding, dilation) - n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) - oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) - in_channel = ic_chunk * ic_bn - num_filter = oc_chunk * oc_bn if cfg.is_fallback: - _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + _get_default_config(cfg, tvm.placeholder((batch, in_channel, ih, iw), dtype=data.dtype), tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype), strides, padding, out_dtype) - return _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, out_dtype) - - -@conv2d_infer_layout.register("intel_graphics") -def _conv2d_infer_layout(workload, cfg): - _, data, kernel, strides, padding, dilation, layout, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - out_channel, _, k_height, k_width = kernel[:-1] - out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 - out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 - tile_ic, tile_oc = cfg["tile_ic"].size[-1], 
cfg["tile_oc"].size[-1] - in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic) - in_layout = "NCHW%dc" % tile_ic - out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc) - out_layout = "NCHW%dc" % tile_oc - return ((in_shape, in_layout),), ((out_shape, out_layout),) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'intel_graphics', ['direct']) -def _schedule_conv2d_NCHWc(cfg, outs): - """Schedule for conv2d_nchw for Intel Graphics - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of conv2d_nchw - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for conv2d_nchw. - """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def traverse(op): - """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_injective(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors and tensor.op not in scheduled_ops: - traverse(tensor.op) - if "conv" in op.tag: - _schedule_cl_spatialpack_NCHWc(cfg, s, op) - - scheduled_ops.append(op) - - traverse(outs[0].op) - - return s - -def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, out_dtype='float16'): - batch, in_channel, in_height, in_width, vc = [util.get_const_int(x) for x in data.shape] - in_channel *= vc - num_filter, channel, kernel_h, kernel_w, ci, co = [util.get_const_int(x) for x in kernel.shape] - num_filter *= co - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] + oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] - ic_bn = vc - assert vc == ci - - if isinstance(strides, (tuple, list)): - stride_h, stride_w = strides - else: - stride_h, stride_w = strides, strides + # Pack data if raw 4-D data is provided. 
+ if len(data.shape) == 4: + data, kernel = _pack_data(data, kernel, ic_bn, oc_bn) out_channel = num_filter - out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) - oshape = (batch, out_channel // co, out_height, out_width, co) + out_height = simplify((ih - kernel_height + pad_top + pad_down) // stride_h + 1) + out_width = simplify((iw - kernel_width + pad_left + pad_right) // stride_w + 1) + oshape = (batch, out_channel // oc_bn, out_height, out_width, oc_bn) rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') + ry = tvm.reduce_axis((0, kernel_height), name='ry') + rx = tvm.reduce_axis((0, kernel_width), name='rx') block_h = cfg["block_oh"].val block_w = cfg["block_ow"].val @@ -388,7 +248,7 @@ def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, ou if out_width % block_w != 0: c_w = (out_width // block_w + 1) * block_w - cshape = (batch, out_channel // co, c_h, c_w, co) + cshape = (batch, out_channel // oc_bn, c_h, c_w, oc_bn) pad_before = [0, 0, pad_top, pad_left, 0] pad_after = [0, 0, pad_down + c_h - out_height, pad_right + \ @@ -397,7 +257,7 @@ def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, ou or pad_right + c_w - out_width != 0) DOUNPACK = (c_h - out_height != 0 or c_w - out_width != 0) if DOPAD: - temp = pad(data, pad_before, pad_after, name="pad_temp") + temp = nn.pad(data, pad_before, pad_after, name="pad_temp") else: temp = data @@ -406,33 +266,53 @@ def _decl_cl_spatialpack_NCHWc(cfg, data, kernel, strides, padding, dilation, ou lambda nn, ff, yy, xx, ff_v: \ tvm.sum( temp[nn, rc//ic_bn, yy * stride_h + ry, xx * stride_w + rx, rc%ic_bn]. \ - astype(out_dtype) * + astype(out_dtype) * kernel[ff, rc//ic_bn, ry, rx, rc%ic_bn, ff_v].astype(out_dtype), - axis=[rc, ry, rx]), tag="conv", name='conv') + axis=[rc, ry, rx]), tag="conv2d_NCHWc", name='conv2d_NCHWc') if DOUNPACK: output = tvm.compute( oshape, lambda nn, ff, yy, xx, ff_v: conv[nn][ff][yy][xx][ff_v], - name='output_unpack', tag="conv_unpack") + name='output_unpack', tag="conv2d_NCHWc_unpack") else: output = conv - return output +@autotvm.register_topi_schedule("conv2d_NCHWc.intel_graphics") +def schedule_conv2d_NCHWc(cfg, outs): + """Schedule for conv2d_nchw for Intel Graphics + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of conv2d_nchw + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for conv2d_nchw. 
+ """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + """inline all one-to-one-mapping operators except the last stage (output)""" + if "conv2d_NCHWc" in op.tag: + _schedule_cl_spatialpack_NCHWc(cfg, s, op) + + traverse_inline(s, outs[0].op, _callback) + + return s + + def _schedule_cl_spatialpack_NCHWc(cfg, s, op): output = op.output(0) - conv = op.input_tensors[0] - if conv.op.name == "conv": - temp = s[conv].op.input_tensors[0] - kernel = s[conv].op.input_tensors[1] - temp_W = s.cache_read(temp, "warp", [conv]) - conv_L = s.cache_write(conv, "local") - SCHEDULE_OUTPUT = True - else: + if op.name == "conv2d_NCHWc": temp = op.input_tensors[0] kernel = op.input_tensors[1] temp_W = s.cache_read(temp, "warp", [output]) @@ -443,8 +323,32 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): s[output].compute_inline() conv = s.outputs[0] SCHEDULE_OUTPUT = False + else: # conv2d_NCHWc_unpack + conv = op.input_tensors[0] + temp = s[conv].op.input_tensors[0] + kernel = s[conv].op.input_tensors[1] + temp_W = s.cache_read(temp, "warp", [conv]) + conv_L = s.cache_write(conv, "local") + SCHEDULE_OUTPUT = True kernel_L = s.cache_read(kernel, "local", [conv_L]) + if temp.name == "pad_temp": + data = temp.op.input_tensors[0] + # TODO(@Laurawly): Do we need to schedule pad op here? + else: + data = temp + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data].pragma(s[data].op.axis[0], "debug_skip_region") + s[kernel].pragma(s[kernel].op.axis[0], "debug_skip_region") + elif isinstance(kernel.op, tvm.tensor.ComputeOp) and kernel.name == "kernel_vec": + # data and kernel are not pre-computed, schedule layout transform here. + # TODO(@Laurawly): Add schedule for data and kernel pack + pass + OUTPUT_BLOCK_HEIGHT = cfg["block_oh"].val OUTPUT_BLOCK_WIDTH = cfg["block_ow"].val @@ -515,19 +419,7 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): tile_and_bind3d(s, out, w, h, vc, 4, 8, 8) -def conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype): - """convert argument to workload""" - if len(kernel.shape) == 4: - raw_kernel = kernel - else: # the input kernel is transformed by alter_op_layout - shape = get_const_tuple(kernel.shape) - raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]), - dtype=kernel.dtype) - return ('conv2d', ) + autotvm.task.args_to_workload( - [data, raw_kernel, strides, padding, layout, out_dtype]) - -@autotvm.register_topi_compute(conv2d, 'intel_graphics', 'direct') -def decl_conv2d(cfg, data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'): +def conv2d_nchw(data, kernel, stride, padding, dilation, out_dtype='float32'): """Conv2D operator for Intel Graphics backend. 
Parameters @@ -540,21 +432,18 @@ def decl_conv2d(cfg, data, kernel, stride, padding, dilation, layout='NCHW', out stride size, or [stride_height, stride_width] padding : int or a list/tuple of two ints padding size, or [pad_height, pad_width] - layout : str - layout of data Returns ------- output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - assert layout == 'NCHW', "only support NCHW convolution on intel gpu" assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" assert data.dtype == kernel.dtype, "Do not support inputs with different data types now." - return _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype) + return _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype) + -@autotvm.task.register_topi_schedule(generic.schedule_conv2d_nchw, 'intel_graphics', ['direct']) -def schedule_conv2d_nchw(cfg, outs): +def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw for Intel Graphics Parameters @@ -569,28 +458,20 @@ def schedule_conv2d_nchw(cfg, outs): """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - def traverse(op): + def _callback(op): """inline all one-to-one-mapping operators except the last stage (output)""" - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors and tensor.op not in scheduled_ops: - traverse(tensor.op) if 'conv2d' in op.tag: - _schedule_cl_spatialpack(cfg, s, op) - - scheduled_ops.append(op) + _schedule_cl_spatialpack(s, op) - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s -def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype='float16'): + +def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): batch, in_channel, in_height, in_width = [util.get_const_int(x) for x in data.shape] num_filter, channel, kernel_h, kernel_w = [util.get_const_int(x) for x in kernel.shape] - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, kernel) + pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(padding, (kernel_h, kernel_w)) if isinstance(stride, (tuple, list)): stride_h, stride_w = stride @@ -606,8 +487,6 @@ def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype=' ry = tvm.reduce_axis((0, kernel_h), name='ry') rx = tvm.reduce_axis((0, kernel_w), name='rx') - block_w = 1 - block_h = 1 if stride_h == 2: if num_filter + kernel_h == 515: block_h = 4 @@ -640,7 +519,7 @@ def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype=' pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w] - temp = pad(data, pad_before, pad_after, name="pad_temp") + temp = nn.pad(data, pad_before, pad_after, name="pad_temp") nv = 16 if num_filter % nv != 0: @@ -667,13 +546,12 @@ def _decl_cl_spatialpack(cfg, data, kernel, stride, padding, layout, out_dtype=' oshape, lambda nn, ff, yy, xx: conv[nn][ff//nv][yy][xx][ff%nv], - name='output_unpack', tag='conv2d', - attrs={'workload': conv_arg_to_workload(data, kernel, stride, padding, - layout, out_dtype)}) + name='output_unpack', tag='conv2d') return output -def _schedule_cl_spatialpack(cfg, s, op): + +def _schedule_cl_spatialpack(s, op): output = op.output(0) _, _, out_height, out_width = [util.get_const_int(x) for x in output.shape] @@ -742,7 +620,7 @@ def _schedule_cl_spatialpack(cfg, s, op): 
s[kernel_vec].compute_inline() # schedule kernel_L - if "2_14" in s[conv].op.tag: + if OUTPUT_BLOCK_HEIGHT == 2 and OUTPUT_BLOCK_WIDTH == 14: s[kernel_L].compute_at(s[conv_L], ry) else: s[kernel_L].compute_at(s[conv_L], rx) diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py new file mode 100644 index 000000000000..e95e59f4c6d7 --- /dev/null +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Conv2D alter op functions for Intel Graphics""" + +import tvm +from tvm import relay +from tvm import autotvm + +from ..util import get_const_tuple +from ..nn import conv2d_alter_layout, conv2d_infer_layout +from .conv2d import _get_default_config + + +@conv2d_alter_layout.register(["intel_graphics"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): + cfg = dispatch_ctx.query(target, None) + workload = cfg.workload + else: + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op.
+ return None + cfg = dispatch_ctx.query(target, workload) + + topi_tmpl = workload[0] + new_attrs = {k : attrs[k] for k in attrs.keys()} + + padding = attrs.get_int_tuple("padding") + strides = attrs.get_int_tuple("strides") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data_tensor, kernel_tensor = tinfos + data_dtype = data_tensor.dtype + kernel_dtype = kernel_tensor.dtype + out_dtype = out_type.dtype + + if topi_tmpl == "conv2d_NCHWc.intel_graphics": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False) + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] + oc_bn = cfg["tile_oc"].val if hasattr(cfg["tile_oc"], "val") else cfg["tile_oc"].size[-1] + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], + new_attrs["out_layout"], out_dtype], "conv2d_NCHWc.intel_graphics") + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) + + return None + + +@conv2d_infer_layout.register("intel_graphics") +def _conv2d_infer_layout(workload, cfg): + _, data, kernel, strides, padding, dilation, layout, dtype = workload + batch_size, in_channel, in_height, in_width = data[1] + out_channel, _, k_height, k_width = kernel[1] + out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 + out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 + tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + in_shape = (batch_size, in_channel // tile_ic, in_height, in_width, tile_ic) + in_layout = "NCHW%dc" % tile_ic + out_shape = (batch_size, out_channel // tile_oc, out_height, out_width, tile_oc) + out_layout = "NCHW%dc" % tile_oc + return ((in_shape, in_layout),), ((out_shape, out_layout),) diff --git a/topi/python/topi/intel_graphics/depthwise_conv2d.py b/topi/python/topi/intel_graphics/depthwise_conv2d.py index 97b7376933de..17f19435b62f 100644 --- a/topi/python/topi/intel_graphics/depthwise_conv2d.py +++ b/topi/python/topi/intel_graphics/depthwise_conv2d.py @@ -20,16 +20,17 @@ from tvm import autotvm from ..util import traverse_inline from .. import tag -from .. import generic, nn +from .. 
import nn from ..nn.depthwise_conv2d import depthwise_conv2d_infer_layout # register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['intel_graphics'], 'direct', - nn.depthwise_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("depthwise_conv2d_nchw.intel_graphics") +def depthwise_conv2d_nchw(_, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, \ - ['intel_graphics'], 'direct') -def schedule_depthwise_conv2d_nchw_intel_graphics(cfg, outs): + +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.intel_graphics") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule for depthwise_conv2d nchw forward. Parameters @@ -68,7 +69,7 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct') + target.target_name, target.model, 'depthwise_conv2d_nchw.intel_graphics') cfg.fallback_with_reference_log(ref_log) cfg['unroll_explicit'].val = 0 ##### space definition end ##### @@ -132,7 +133,7 @@ def _callback(op): traverse_inline(s, outs[0].op, _callback) return s -@generic.schedule_depthwise_conv2d_nhwc.register(["intel_graphics"]) + def schedule_depthwise_conv2d_nhwc(outs): """Schedule for depthwise_conv2d nhwc forward. @@ -331,8 +332,8 @@ def _depthwise_conv2d_infer_layout(workload, _): Input shapes and layouts, and output shapes and layouts """ _, data, kernel, strides, padding, _, _ = workload - batch_size, in_channel, in_height, in_width = data[:-1] - filter_channel, channel_multiplier, k_height, k_width = kernel[:-1] + batch_size, in_channel, in_height, in_width = data[1] + filter_channel, channel_multiplier, k_height, k_width = kernel[1] out_channel = filter_channel * channel_multiplier out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index 35a86e991c23..f774e76c0ccd 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -17,22 +17,20 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali GPU""" import tvm +from tvm import relay from tvm import autotvm from tvm.autotvm.task.space import get_factors -from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform from ..util import traverse_inline, get_const_int, get_const_tuple -from ..nn import conv2d, conv2d_winograd_without_weight_transform, \ - get_pad_tuple, pad, conv2d_alter_layout +from .. 
import nn from ..nn.winograd_util import winograd_transform_matrices # reuse some compute declarations from ARM CPU -from ..arm_cpu.conv2d import _alter_conv2d_layout_arm from ..arm_cpu.conv2d_spatial_pack import conv2d_spatial_pack_nchw -@autotvm.register_topi_compute(conv2d, 'mali', ['direct']) -def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +@autotvm.register_topi_compute("conv2d_nchw_spatial_pack.mali") +def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for conv2d Parameters @@ -57,9 +55,6 @@ def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype dilation : list of two ints [dilation_height, dilation_width] - layout : str - layout of data - out_dtype: str The output type. This is used for mixed precision. @@ -68,14 +63,11 @@ def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype output : tvm.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ - if layout == 'NCHW': - return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, - dilation, out_dtype, num_tile=3) - else: - raise ValueError("Unsupported layout {}".format(layout)) + return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, + dilation, out_dtype, num_tile=3) -@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'mali', ['direct', 'winograd']) -def schedule_conv2d_nchw_mali(cfg, outs): +@autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.mali") +def schedule_conv2d_nchw_spatial_pack(cfg, outs): """TOPI schedule callback for conv2d Parameters @@ -113,9 +105,6 @@ def _callback(op): _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec) - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) - traverse_inline(s, outs[0].op, _callback) return s @@ -200,13 +189,27 @@ def _pick_tile_size(data, kernel): else: return 2 -@autotvm.register_topi_compute(conv2d, 'mali', ['winograd']) -def conv2d_mali_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): + +@autotvm.register_topi_compute("conv2d_nchw_winograd.mali") +def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype): tile_size = _pick_tile_size(data, kernel) - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, + return _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size) -def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): + +@autotvm.register_topi_schedule("conv2d_nchw_winograd.mali") +def schedule_conv2d_nchw_winograd(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'winograd_conv2d_output' in op.tag: + _schedule_winograd(cfg, s, op) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -214,9 +217,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dt dilation_h, dilation_w = dilation if len(kernel.shape) == 4: - if dilation_h != 1 or dilation_w != 1: - kernel = dilate(kernel, (1, 1, dilation_h, dilation_w)) + kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: @@ -226,11 +228,10 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, 
layout, out_dt CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW)) - assert layout == 'NCHW' assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 - data_pad = pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") + data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") r = KW m = tile_size @@ -420,34 +421,85 @@ def _schedule_winograd(cfg, s, op): s[Y].compute_at(s[output], tt) -##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM ##### -@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'mali', ['winograd']) -def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size): - """TOPI compute callback""" - return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, - tile_size) +##### REGISTER ALTER OP LAYOUT ##### +@nn.conv2d_alter_layout.register(["mali"]) +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + if cfg.is_fallback: # if is fallback, clear query cache and return None + autotvm.task.clear_fallback_cache(target, workload) + return None -@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform, - 'mali', ['winograd']) -def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): - """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) + topi_tmpl = workload[0] + new_attrs = {k: attrs[k] for k in attrs.keys()} - def _callback(op): - if 'winograd_conv2d_output' in op.tag: - _schedule_winograd(cfg, s, op) + strides = attrs.get_int_tuple("strides") + padding = attrs.get_int_tuple("padding") + dilation = attrs.get_int_tuple("dilation") + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data, kernel = tinfos + out_dtype = out_type.dtype - traverse_inline(s, outs[0].op, _callback) - return s + idxd = tvm.indexdiv + if topi_tmpl == "conv2d_nchw_spatial_pack.mali": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + VC = cfg['tile_co'].size[-1] -##### REGISTER ALTER OP LAYOUT ##### -@conv2d_alter_layout.register(["mali"]) -def _alter_conv2d_layout(attrs, inputs, tinfos, F): - try: - return _alter_conv2d_layout_arm(attrs, inputs, tinfos, F) - except KeyError: # to filter out fallback opencl templates + new_attrs['kernel_layout'] = 'OIHW%do' % VC + + new_data = data + new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + "conv2d_nchw_spatial_pack.mali") + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.conv2d(*inputs, **new_attrs) + elif topi_tmpl == "conv2d_nchw_winograd.mali": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + N, CI, H, W = 
get_const_tuple(data.shape) + CO, _, KH, KW = get_const_tuple(kernel.shape) + tile_size = _pick_tile_size(data, kernel) + VC = cfg['tile_bna'].val + + weight_expr = inputs[1] + weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform( + weight_expr, tile_size=tile_size) + weight_expr = relay.reshape(weight_expr, + newshape=(KH + tile_size - 1, + KW + tile_size - 1, + idxd(CO, VC), VC, CI)) + weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3]) + + new_attrs['tile_size'] = tile_size + + new_data = data + new_kernel = tvm.placeholder((KH + tile_size - 1, + KW + tile_size -1, + idxd(CO, VC), CI, VC), + kernel.dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, out_dtype], + 'conv2d_nchw_winograd.mali') + dispatch_ctx.update(target, new_workload, cfg) + + return relay.nn.contrib_conv2d_winograd_without_weight_transform( + inputs[0], weight_expr, **new_attrs) + else: return None diff --git a/topi/python/topi/mali/dense.py b/topi/python/topi/mali/dense.py index 6096a99c97c2..3b233e92ba8a 100644 --- a/topi/python/topi/mali/dense.py +++ b/topi/python/topi/mali/dense.py @@ -22,12 +22,18 @@ import tvm from tvm import autotvm -from .. import generic, nn +from .. import nn from ..util import traverse_inline -autotvm.register_topi_compute(nn.dense, 'mali', 'direct', nn.dense.fdefault) -@autotvm.register_topi_schedule(generic.schedule_dense, 'mali', 'direct') + +@autotvm.register_topi_compute('dense.mali') +def dense(_, data, weight, bias=None, out_dtype=None): + """Dense operator on Mali""" + return nn.dense(data, weight, bias, out_dtype) + + +@autotvm.register_topi_schedule('dense.mali') def schedule_dense(cfg, outs): """Schedule for dense operator. @@ -52,11 +58,11 @@ def _callback(op): vec_size = [1, 2, 4, 8, 16] max_unroll = 32 - dense = op.output(0) + dense_out = op.output(0) output = outs[0] y, x = s[output].op.axis - c = s[dense].op.reduce_axis[0] + c = s[dense_out].op.reduce_axis[0] ##### space definition begin ##### cfg.define_split('tile_y', y, num_outputs=3) @@ -66,12 +72,12 @@ def _callback(op): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'mali', 'rk3399', 'dense', 'direct') + 'mali', 'rk3399', 'dense.mali') cfg.fallback_with_reference_log(ref_log) ##### space definition end ##### - if dense.op in s.outputs: - dense = s.cache_write(output, 'local') + if dense_out.op in s.outputs: + dense_out = s.cache_write(output, 'local') by, ty, yi = cfg['tile_y'].apply(s, output, y) bx, tx, xi = cfg['tile_x'].apply(s, output, x) @@ -85,23 +91,25 @@ def _callback(op): s[output].unroll(yi) if cfg['tile_x'].size[-1] in vec_size: s[output].vectorize(xi) - s[dense].compute_at(s[output], tx) + s[dense_out].compute_at(s[output], tx) - k = s[dense].op.reduce_axis[0] - y, x = s[dense].op.axis - k, k_unroll = cfg['c_unroll'].apply(s, dense, k) - s[dense].reorder(k, k_unroll, y, x) - s[dense].unroll(k_unroll) + k = s[dense_out].op.reduce_axis[0] + y, x = s[dense_out].op.axis + k, k_unroll = cfg['c_unroll'].apply(s, dense_out, k) + s[dense_out].reorder(k, k_unroll, y, x) + s[dense_out].unroll(k_unroll) if cfg['tile_y'].size[-1] < max_unroll: - s[dense].unroll(y) + s[dense_out].unroll(y) if cfg['tile_x'].size[-1] in vec_size: - s[dense].vectorize(x) + s[dense_out].vectorize(x) traverse_inline(s, outs[0].op, _callback) return s + def fuse_and_bind(s, tensor, axis=None, num_thread=None): """ fuse all the axis and bind to GPU threads """ + # TODO(@comaniac): figure out where this function is used. 
axis = axis or s[tensor].op.axis fused = s[tensor].fuse(*axis) bx, tx = s[tensor].split(fused, num_thread) diff --git a/topi/python/topi/mali/depthwise_conv2d.py b/topi/python/topi/mali/depthwise_conv2d.py index 274b2944e4d9..4ff17e534feb 100644 --- a/topi/python/topi/mali/depthwise_conv2d.py +++ b/topi/python/topi/mali/depthwise_conv2d.py @@ -20,17 +20,18 @@ import tvm from tvm import autotvm -from ..generic import schedule_depthwise_conv2d_nchw -from ..nn import depthwise_conv2d_nchw +from .. import nn from ..util import traverse_inline # register original implementation of depthwise_conv2d_nchw since we don't need to change this part -autotvm.register_topi_compute(depthwise_conv2d_nchw, 'mali', 'direct', - depthwise_conv2d_nchw.fdefault) +@autotvm.register_topi_compute("depthwise_conv2d_nchw.mali") +def depthwise_conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype): + return nn.depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) + # register customized schedule for arm cpu. -@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'mali', 'direct') -def schedule_depthwise_conv2d_nchw_mali(cfg, outs): +@autotvm.register_topi_schedule("depthwise_conv2d_nchw.mali") +def schedule_depthwise_conv2d_nchw(cfg, outs): """Schedule depthwise conv2d Parameters @@ -64,7 +65,7 @@ def _schedule(pad_data, kernel, conv): # fallback support if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( - 'mali', 'rk3399', 'depthwise_conv2d_nchw', 'direct') + 'mali', 'rk3399', 'depthwise_conv2d_nchw.mali') cfg.fallback_with_reference_log(ref_log) ###### space definition end ###### diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py index 7b872ceacf29..d69562c4daf6 100644 --- a/topi/python/topi/nn/batch_matmul.py +++ b/topi/python/topi/nn/batch_matmul.py @@ -20,7 +20,7 @@ import tvm from ..util import get_const_tuple -def batch_matmul_default(x, y): +def batch_matmul(x, y): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. @@ -48,23 +48,3 @@ def batch_matmul_default(x, y): return tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), tag='batch_matmul') - -@tvm.target.generic_func -def batch_matmul(x, y): - """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are - data in batch. - - Parameters - ---------- - x : tvm.Tensor - 3-D with shape [batch, M, K] - - y : tvm.Tensor - 3-D with shape [batch, N, K] - - Returns - ------- - output : tvm.Tensor - 3-D with shape [batch, M, N] - """ - return batch_matmul_default(x, y) diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index e1f8f819968f..f18a5aae7eed 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -19,13 +19,11 @@ """Bitserial Conv2D operators""" from __future__ import absolute_import as _abs import tvm -from tvm import autotvm from .pad import pad from .util import get_pad_tuple -from .bitserial_util import bitpack, binary_op_multiplier +from .bitserial_util import bitpack from ..util import get_const_tuple -@tvm.target.generic_func def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """Bitserial Conv2D operator. 
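For context on the compute being relocated in the bitserial hunks around here: the deleted spatial-pack implementations evaluate each pair of activation/weight bit planes with popcounts, and in unipolar mode a plane of {-1, +1} weights against 0/1 activation bits reduces to popcount(x & w) - popcount(x & ~w), exactly the expression visible in the removed _conv bodies. A one-bit-plane sanity check in plain Python (illustrative values):

    # one 8-lane bit plane for each operand
    x = 0b10110100  # activation bits (0/1)
    w = 0b11010101  # weight bits: set bit -> +1, clear bit -> -1
    acts = [(x >> i) & 1 for i in range(8)]
    wts = [1 if (w >> i) & 1 else -1 for i in range(8)]
    ref = sum(a * b for a, b in zip(acts, wts))

    popcount = lambda v: bin(v & 0xFF).count("1")
    assert ref == popcount(x & w) - popcount(x & ~w)  # unipolar identity

With multi-bit operands, the contribution of planes b1 and b2 is additionally shifted left by b1 + b2, which is the << b1b2 term in the deleted code.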
@@ -117,7 +115,6 @@ def _conv(nn, ff, yy, xx): return tvm.compute((batch, out_channel, out_height, out_width), _conv, name="Conv2dOutput", tag="bitserial_conv2d_nchw") -@tvm.target.generic_func def bitserial_conv2d_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """Bitserial Conv2D operator. @@ -213,222 +210,6 @@ def _conv(nn, yy, xx, ff): return conv -@autotvm.register_topi_compute(bitserial_conv2d_nchw, ['cpu', 'arm_cpu'], 'direct') -def spatial_pack_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits, - pack_dtype='uint32', out_dtype='int16', unipolar=True): - """ Compute convolution with pack on spatial axes. """ - assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) - # Check if kernel is already bitpacked - if len(kernel.shape) == 4: - kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) - KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) - else: - kernel_vec = kernel - OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape) - CO = OCO * VC - - IB, N, CI, H, W = get_const_tuple(data_q.shape) - KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) - - if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): - TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) - else: - TPAD, LPAD, DPAD, RPAD = padding - pad_before = [0, 0, 0, TPAD, LPAD] - pad_after = [0, 0, 0, DPAD, RPAD] - - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - HCAT, WCAT = KH-1, KW-1 - - TH = H + TPAD + DPAD - TW = W + LPAD + RPAD - OH = (H + TPAD + DPAD - KH) // HSTR + 1 - OW = (W + LPAD + RPAD - KW) // WSTR + 1 - - # ==================== define configuration space ==================== - n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) - ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) - ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) - - co, vc = cfg.define_split('tile_co', co, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') - - cfg.define_reorder("reorder_0", - [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci], - policy='interval_all', interval=(6, 11)) - # binary ops - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) - # ==================== - - VC = cfg["tile_co"].size[-1] - VH = cfg["tile_oh"].size[-1] - VW = cfg["tile_ow"].size[-1] - - dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB) - kvshape = (CO//VC, CI, KH, KW, KB, VC) - ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC) - oshape = (1, CO, OH, OW) - - if (TPAD != 0 and RPAD != 0): - data_pad = pad(data_q, pad_before, pad_after, name="data_pad") - else: - data_pad = data_q - - data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ - data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') - - if len(kernel.shape) == 4: - kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ - kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') - - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), 
name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - b1 = tvm.reduce_axis((0, IB), name='ib') - b2 = tvm.reduce_axis((0, KB), name='kb') - - def _conv(n, co, h, w, vh, vw, vc): - b1b2 = (b1+b2).astype(out_dtype) - if unipolar: - return tvm.sum((tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & - kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) - & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, - axis=[ci, dh, dw, b1, b2]) - - return tvm.sum((tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & - kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) - - conv = tvm.compute(ovshape, _conv, name='conv_out') - idxd = tvm.indexdiv - idxm = tvm.indexmod - - return tvm.compute( - oshape, lambda n, co, h, w: - conv[n, - idxd(co, VC), idxd(h, VH), idxd(w, VW), - idxm(h, VH), idxm(w, VW), idxm(co, VC)], - name='conv_vec', tag='spatial_bitserial_conv_nchw') - -@autotvm.register_topi_compute(bitserial_conv2d_nhwc, 'cpu', 'direct') -def spatial_pack_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, - pack_dtype='uint32', out_dtype='int16', unipolar=True): - """ Compute convolution with pack on spatial axes. """ - assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" - data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) - pack_kernel = len(kernel.shape) == 4 - - if pack_kernel: - kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) - else: - kernel_q = kernel - - KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape) - N, H, W, CI, IB = get_const_tuple(data_q.shape) - - if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): - TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) - else: - TPAD, LPAD, DPAD, RPAD = padding - pad_before = [0, TPAD, LPAD, 0, 0] - pad_after = [0, DPAD, RPAD, 0, 0] - - if isinstance(stride, (tuple, list)): - HSTR, WSTR = stride - else: - HSTR, WSTR = stride, stride - HCAT, WCAT = KH-1, KW-1 - - PAD_H = H + (TPAD + DPAD) - PAD_W = W + (LPAD + RPAD) - OH = (PAD_H - KH) // HSTR + 1 - OW = (PAD_W - KW) // WSTR + 1 - oshape = (1, OH, OW, CO) - - # ==================== define configuration space ==================== - n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO) - ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) - ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) - - co, vc = cfg.define_split('tile_co', co, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, - filter=lambda x: max(x.size[1:]) <= 16) - cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') - cfg.define_reorder("reorder_0", - [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], - policy='interval_all', interval=(3, 7)) - # binary ops - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) - # ==================== - - VC = cfg["tile_co"].size[-1] - VH = cfg["tile_oh"].size[-1] - VW = cfg["tile_ow"].size[-1] - - dvshape = (1, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, CI, IB) - kvshape = (CO, KH, KW, CI, VC, KB) - ovshape = (1, OH, OW, CO, VH, VW, VC) - oshape = (1, OH, OW, CO) - - if (DPAD != 
0 and RPAD != 0): - data_pad = pad(data_q, pad_before, pad_after, name="data_pad") - else: - data_pad = data_q - - data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ - data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') - - kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ - kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') - - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - b1 = tvm.reduce_axis((0, IB), name='ib') - b2 = tvm.reduce_axis((0, KB), name='kb') - - def _conv(n, h, w, co, vh, vw, vc): - b1b2 = (b1+b2).astype(out_dtype) - if unipolar: - return tvm.sum( - ((tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1]& - ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), - axis=[dh, dw, ci, b1, b2]) - - return tvm.sum(tvm.popcount( - data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, - axis=[dh, dw, ci, b1, b2]) - - conv = tvm.compute(ovshape, _conv, name='conv') - - idxd = tvm.indexdiv - idxm = tvm.indexmod - return tvm.compute( - oshape, lambda n, h, w, co: - conv[n, - idxd(h, VH), idxd(w, VW), idxd(co, VC), - idxm(h, VH), idxm(w, VW), idxm(co, VC)], - name='output_unpack', tag='spatial_bitserial_conv_nhwc') - @tvm.target.generic_func def bitserial_conv2d_legalize(attrs, inputs, types): """Legalizes Bitserial Conv2D op. diff --git a/topi/python/topi/nn/bitserial_dense.py b/topi/python/topi/nn/bitserial_dense.py index d77a1b7b0fc2..fa1b5df7d066 100644 --- a/topi/python/topi/nn/bitserial_dense.py +++ b/topi/python/topi/nn/bitserial_dense.py @@ -18,11 +18,9 @@ """Bitserial Dense operator.""" from __future__ import absolute_import import tvm -from tvm import autotvm from topi.util import get_const_tuple -from .bitserial_util import bitpack, binary_op_multiplier +from .bitserial_util import bitpack -@tvm.target.generic_func def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): """The default implementation of bitserial dense in topi. @@ -66,78 +64,3 @@ def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', if unipolar: return matmul_unipolar return matmul - - -@autotvm.register_topi_compute(bitserial_dense, ['cpu'], 'direct') -def bitserial_dense_default(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint32', - out_dtype='int16', unipolar=True): - """Bitserial dense implementation. 
TODO: Why are these separate - - Parameters - ---------- - data : tvm.Tensor - 2-D with shape [batch, in_dim] - weight : tvm.Tensor - 2-D with shape [out_dim, in_dim] or - 3-D with shape [out_dim, weight_bits, in_dim] - Returns - ------- - output : tvm.Tensor - 2-D with shape [batch, out_dim] - """ - data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) - if len(weight.shape) == 2: - weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) - else: - weight_packed = weight - Y, DB, K = get_const_tuple(data_packed.shape) - X, WB, _ = get_const_tuple(weight_packed.shape) - ######## Search space - x, y = cfg.axis(X), cfg.axis(Y) - db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K) - ko, ki = cfg.define_split('tile_k', k, num_outputs=2) - yo, yi = cfg.define_split('tile_y', y, num_outputs=2) - xo, xi = cfg.define_split('tile_x', x, num_outputs=2) - - cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi], - policy='candidate', candidate=[ - [yo, xo, ko, yi, wb, db, ki, xi], - [yo, xo, yi, ko, wb, db, ki, xi]]) - - cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll') - cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec') - - ###### Compute rule - VX = cfg['tile_x'].size[-1] - - wvshape = (X//VX, WB, VX, K) - oshape = (Y, X) - - k = tvm.reduce_axis((0, K), name='k') - db = tvm.reduce_axis((0, DB), name='db') - wb = tvm.reduce_axis((0, WB), name='wb') - - # Tile data and weights - weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k: - weight_packed[xo*VX+vx][wb][k], name='weight_vec') - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - - matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( - (tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - - tvm.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - ).astype(out_dtype) - << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') - - matmul = tvm.compute(oshape, lambda i, j: tvm.sum( - tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k] - ).astype(out_dtype) - << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') - - # binary ops - cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype)) - - if unipolar: - return matmul_unipolar - return matmul diff --git a/topi/python/topi/nn/conv1d.py b/topi/python/topi/nn/conv1d.py index 98fa2e3d7001..4565fd2f5a46 100644 --- a/topi/python/topi/nn/conv1d.py +++ b/topi/python/topi/nn/conv1d.py @@ -23,7 +23,6 @@ from .util import get_pad_tuple1d -@tvm.target.generic_func def conv1d(data, kernel, strides=1, @@ -101,6 +100,13 @@ def conv1d_ncw(data, out_dtype : str The output data type. If None then output is same type as input. """ + if out_dtype is None: + out_dtype = data.dtype + if isinstance(strides, (tuple, list)): + strides = strides[0] + if isinstance(dilation, (tuple, list)): + dilation = dilation[0] + batch, in_channels, data_width = data.shape out_channels, _, kernel_size = kernel.shape @@ -158,6 +164,13 @@ def conv1d_nwc(data, out_dtype : str The output data type. If None then output is same type as input. 
""" + if out_dtype is None: + out_dtype = data.dtype + if isinstance(strides, (tuple, list)): + strides = strides[0] + if isinstance(dilation, (tuple, list)): + dilation = dilation[0] + batch, data_width, in_channels = data.shape kernel_size, _, out_channels = kernel.shape diff --git a/topi/python/topi/nn/conv1d_transpose.py b/topi/python/topi/nn/conv1d_transpose.py index 39918e90c317..8d224247db01 100644 --- a/topi/python/topi/nn/conv1d_transpose.py +++ b/topi/python/topi/nn/conv1d_transpose.py @@ -24,7 +24,6 @@ from .util import get_pad_tuple1d -@tvm.target.generic_func def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype): """Transposed 1D convolution ncw forward operator. diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 52f4b12a1d2d..a7a75ed0ef0c 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -23,7 +23,7 @@ from .pad import pad from .util import get_pad_tuple -from ..util import simplify, get_const_tuple, get_const_int +from ..util import simplify, get_const_tuple, get_const_int, tag from .winograd_util import winograd_transform_matrices # workload description of conv2d @@ -31,7 +31,6 @@ ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'groups', 'out_filter', 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) -@tvm.target.generic_func def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=None): """Conv2D operator. @@ -96,7 +95,7 @@ def conv2d_legalize(attrs, inputs, types): @tvm.target.generic_func -def conv2d_alter_layout(attrs, inputs, tinfos, F): +def conv2d_alter_layout(attrs, inputs, tinfos, out_type): """Change Conv2D layout. Parameters @@ -107,13 +106,12 @@ def conv2d_alter_layout(attrs, inputs, tinfos, F): Grouped input symbols tinfos : list Input shape and dtype - F: symbol - The context, can be either relay.op + out_type: type + The output type Note ---- - Unlike other TOPI functions, this function operates on both graph level and operator level, - so we have to pass 'F' to make it support our two versions of graph IR, Relay. + Unlike other TOPI functions, this function operates on both graph level and operator level. """ # not to change by default return None @@ -368,7 +366,6 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): return Output -@tvm.target.generic_func def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'): """Conv2D operator for nChw[x]c layout. @@ -408,58 +405,9 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - return conv2d_NCHWc_compute(data, - kernel, - stride, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -def conv2d_NCHWc_compute(data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): - """Conv2D operator compute for nChw[x]c layout. 
- - Parameters - ---------- - data : tvm.Tensor - 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - - kernel : tvm.Tensor - 6-D with shape - [num_filter_chunk, in_channel_chunk, filter_height, filter_width, - in_channel_block, num_filter_block] - - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - layout : str - Input data layout - - out_layout : str - Output data layout - - out_dtype : str - output data type - - Returns - ------- - output : tvm.Tensor - 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] - """ - # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload - HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) + HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) dilation_h, dilation_w = dilation if isinstance(dilation, (tuple, list)) \ else (dilation, dilation) @@ -516,8 +464,7 @@ def conv2d_NCHWc_compute(data, kernel, strides, padding, dilation, layout, out_l name='conv2d_NCHWc', tag="conv2d_NCHWc") -@tvm.target.generic_func -def conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, layout, out_layout, +def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='int32'): """Conv2D operator for nChw[x]c layout. @@ -557,59 +504,9 @@ def conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, layout, out_layo 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ - return conv2d_NCHWc_int8_compute(data, - kernel, - strides, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -def conv2d_NCHWc_int8_compute(data, kernel, strides, padding, dilation, layout, out_layout, - out_dtype='int32'): - """Conv2D operator for nChw[x]c layout. 
- - Parameters - ---------- - data : tvm.Tensor - 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - - kernel : tvm.Tensor - 7-D with shape - [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block/4, - num_filter_block, 4] - - stride : int or a list/tuple of two ints - stride size, or [stride_height, stride_width] - - padding : int or a list/tuple of 2 or 4 ints - padding size, or - [pad_height, pad_width] for 2 ints, or - [pad_top, pad_left, pad_bottom, pad_right] for 4 ints - - dilation: int or a list/tuple of two ints - dilation size, or [dilation_height, dilation_width] - - layout : str - Input data layout - - out_layout : str - Output data layout - - out_dtype : str - output data type - - Returns - ------- - output : tvm.Tensor - 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] - """ - # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload - HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) + HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) dilation_h, dilation_w = dilation if isinstance(dilation, (tuple, list)) \ else (dilation, dilation) @@ -724,33 +621,6 @@ def conv2d_winograd_weight_transform(kernel, tile_size): axis=[r_kh, r_kw]), name='transform_weight') -@tvm.target.generic_func -def conv2d_winograd_without_weight_transform(input, filter, strides, padding, dilation, - layout, out_dtype, tile_size): - """Compute convolution in winograd algorithm. The filter is supposed to be transformed - in advance. - - Parameters - ---------- - input : tvm.Tensor - 4-D with shape [batch, in_height, in_width, in_channel] - filter : tvm.Tensor - 4-D with shape [filter_height, filter_width, in_channel, num_filter] - strides : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - padding : int or str - Padding size, or ['VALID', 'SAME'] - tile_size: int - Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) - - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_height, out_width, out_channel] - """ - raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") - - def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype): """Weight transformation for winograd Parameters @@ -769,32 +639,7 @@ def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_d return nnpack.convolution_inference_weight_transform( kernel, algorithm=convolution_algorithm, dtype=out_dtype) -@tvm.target.generic_func -def conv2d_winograd_nnpack_without_weight_transform( - input, filter, bias, strides, padding, dilation, layout, out_dtype): - """Compute convolution in winograd algorithm. The filter is supposed to be transformed - in advance. 
- Parameters - ---------- - input : tvm.Tensor - 4-D with shape [batch, in_height, in_width, in_channel] - filter : tvm.Tensor - 4-D with shape [num_filter, in_channel, 8, 8] - bias : tvm.Tensor - 1-D with shape [num_filter] - strides : int or a list/tuple of two ints - Stride size, or [stride_height, stride_width] - padding : int or str - Padding size, or ['VALID', 'SAME'] - Returns - ------- - output : tvm.Tensor - 4-D with shape [batch, out_height, out_width, out_channel] - """ - raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform") - -@tvm.target.generic_func def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None): """Group convolution operator in NCHW layout. @@ -871,3 +716,35 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[ff, rc, ry, rx].astype(out_dtype), axis=[rc, ry, rx]), tag='group_conv2d_nchw') + + +def unpack_NCHWc_to_nchw(packed_out, out_dtype): + """Unpack conv2d_NCHWc output from layout NCHWc to NCHW + + Parameters + ----------- + packed_out : tvm.Tensor + The output tensor of conv2d_NCHWc. + + out_dtype : str + The output dtype. + + Returns + ------- + unpacked_out : tvm.Tensor + The unpacked output tensor in NCHW layout. + """ + n, oc_chunk, oh, ow, oc_bn = get_const_tuple(packed_out.shape) + + idxmod = tvm.indexmod + idxdiv = tvm.indexdiv + + oshape = (n, oc_chunk * oc_bn, oh, ow) + unpacked_out = \ + tvm.compute(oshape, + lambda n, c, h, w: + packed_out[n, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)] + .astype(out_dtype), + name='output_unpack', + tag=tag.INJECTIVE+",unpack_nchwc") + return unpacked_out diff --git a/topi/python/topi/nn/conv2d_transpose.py b/topi/python/topi/nn/conv2d_transpose.py index e635f43cdbc4..db132fc81f13 100644 --- a/topi/python/topi/nn/conv2d_transpose.py +++ b/topi/python/topi/nn/conv2d_transpose.py @@ -25,7 +25,6 @@ from ..util import simplify -@tvm.target.generic_func def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype): """Transposed 2D convolution nchw forward operator. diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index 83c16dae7ac4..88c7c6a3ed90 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -25,46 +25,8 @@ from ..util import simplify -@tvm.target.generic_func -def conv3d(input, filter, strides, padding, dilation, layout='NCDHW', out_dtype=None): - """Conv3D operator. 
- - Parameters - ---------- - input : tvm.Tensor - 5-D with shape [batch, in_depth, in_channel, in_height, in_width] - - filter : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] - - strides : int or a list/tuple of three ints - stride size, or [stride_depth, stride_height, stride_width] - - padding : int or a list/tuple of three ints - padding size, or [pad_depth, pad_height, pad_width] - - dilation: int or a list/tuple of three ints - dilation size, or [dilation_depth, dilation_height, dilation_width] - - layout : str - layout of data - - Returns - ------- - output : tvm.Tensor - 5-D with shape [batch, out_depth, out_channel, out_height, out_width] - """ - # search platform specific declaration first - # default declaration - if layout == 'NCDHW': - return conv3d_ncdhw(input, filter, strides, padding, dilation, out_dtype) - elif layout == 'NDHWC': - return conv3d_ndhwc(input, filter, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) - - def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): - """Convolution operator in NCDHW layout. + """Conv3D operator in NCDHW layout. Parameters ---------- @@ -138,13 +100,13 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Parameters ---------- Input : tvm.Tensor - 5-D with shape [batch, in_channel, in_depth, in_height, in_width] + 5-D with shape [batch, in_depth, in_height, in_width, in_channel] Filter : tvm.Tensor - 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] + 5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of three ints - Stride size, or [strid_depth, stride_height, stride_width] + Stride size, or [stride_depth, stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] @@ -155,7 +117,7 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Returns ------- Output : tvm.Tensor - 5-D with shape [batch, out_channel, out_depth, out_height, out_width] + 5-D with shape [batch, out_depth, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 3 assert isinstance(dilation, int) or len(dilation) == 3 diff --git a/topi/python/topi/nn/deformable_conv2d.py b/topi/python/topi/nn/deformable_conv2d.py index 2417411efc37..251f68aa8c25 100644 --- a/topi/python/topi/nn/deformable_conv2d.py +++ b/topi/python/topi/nn/deformable_conv2d.py @@ -22,7 +22,6 @@ from ..util import get_const_tuple from ..cpp.util import bilinear_sample_nchw -@tvm.target.generic_func def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, deformable_groups, groups, out_dtype): """Deformable conv2D operator in NCHW layout. diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index 671b602edc30..fe21e7417bda 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -19,7 +19,7 @@ import tvm from .. import tag -def dense_default(data, weight, bias=None, out_dtype=None): +def dense(data, weight, bias=None, out_dtype=None): """The default implementation of dense in topi. 
Parameters @@ -59,29 +59,3 @@ def dense_default(data, weight, bias=None, out_dtype=None): lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ tag=tag.BROADCAST) return matmul - - -@tvm.target.override_native_generic_func("dense") -def dense(data, weight, bias=None, out_dtype=None): - """Applies a linear transformation: :math:`Y = XW^T + b`. - - Parameters - ---------- - data : tvm.Tensor - 2-D with shape [batch, in_dim] - - weight : tvm.Tensor - 2-D with shape [out_dim, in_dim] - - bias : tvm.Tensor, optional - 1-D with shape [out_dim] - - out_dtype : str - The output type. This is used for mixed precision. - - Returns - ------- - output : tvm.Tensor - 2-D with shape [batch, out_dim] - """ - return dense_default(data, weight, bias, out_dtype) diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py index f50e357a3bb8..49aaace0f833 100644 --- a/topi/python/topi/nn/depthwise_conv2d.py +++ b/topi/python/topi/nn/depthwise_conv2d.py @@ -47,7 +47,6 @@ def _get_workload(data, kernel, stride, padding, out_dtype): out_channel, kh, kw, HPAD, WPAD, HSTR, WSTR) -@tvm.target.generic_func def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): """Depthwise convolution nchw forward operator. @@ -121,7 +120,6 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No return Output -@tvm.target.generic_func def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None): """Depthwise convolution nhwc forward operator. @@ -307,7 +305,6 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid return Weight_grad -@tvm.target.generic_func def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation, layout, out_layout, out_dtype=None): """Depthwise convolution NCHW[x]c forward operator. diff --git a/topi/python/topi/nn/local_response_norm.py b/topi/python/topi/nn/local_response_norm.py index de002bfffbe6..1b41c7dbfb5e 100644 --- a/topi/python/topi/nn/local_response_norm.py +++ b/topi/python/topi/nn/local_response_norm.py @@ -17,10 +17,8 @@ # pylint: disable=invalid-name """TVM operator for local response norm compute.""" from __future__ import absolute_import -import tvm from .. import cpp -@tvm.target.generic_func def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2): """Perform the across channels local response normalisation on the input data. 
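Note on the pattern above: across the nn/ modules in this patch, compute functions stop being `tvm.target.generic_func` dispatch points and become plain functions, while AutoTVM registration moves from (generic_func, target, variant) triples to plain string keys such as "batch_matmul.x86" (both forms are visible verbatim in the x86 hunks later in this patch). A schematic sketch of the before/after shape, using a hypothetical op `myop`; the two registration APIs belong to different points in this series, so the halves are illustrative and not meant to run in one module:

    import tvm
    from tvm import autotvm

    # Before: the op itself is a dispatch point, overridden per target.
    @tvm.target.generic_func
    def myop(data):
        return tvm.compute(data.shape, lambda *i: data(*i), name="myop")

    @autotvm.register_topi_compute(myop, "cpu", "direct")
    def _myop_cpu(cfg, data):
        return tvm.compute(data.shape, lambda *i: data(*i), name="myop_cpu")

    # After: a plain function registered under a string key; the caller (or a
    # dispatch table, as in topi/testing/common.py later in this patch)
    # selects the implementation explicitly.
    @autotvm.register_topi_compute("myop.x86")
    def myop_x86(cfg, data):
        return tvm.compute(data.shape, lambda *i: data(*i), name="myop_x86")
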
diff --git a/topi/python/topi/nn/sparse.py b/topi/python/topi/nn/sparse.py index 584126ea2015..6974ff4a13ab 100644 --- a/topi/python/topi/nn/sparse.py +++ b/topi/python/topi/nn/sparse.py @@ -22,7 +22,6 @@ from ..util import get_const_tuple -@tvm.target.generic_func def sparse_dense(data, weight_data, weight_indices, weight_indptr): """ Computes sparse-dense matrix multiplication of `data` and @@ -105,7 +104,7 @@ def _compute_block(i, nb_j, j): lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)], tag="sparse_dense_bsrmm") -@tvm.target.generic_func + def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): """ Transpose a square sparse matrix, @@ -148,14 +147,15 @@ def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): shape=output_shape, inputs=[sparse_data, sparse_indices, sparse_indptr], fcompute=lambda ins, outs: - csr_transpose_ir(ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]), + _csr_transpose_ir(ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]), tag="sparse_transpose_csr", dtype=['float32', 'int32', 'int32'], name='out') return [output_data, output_indices, output_indptr] -def csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): + +def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): """define ir for csr_transpose""" irb = tvm.ir_builder.create() diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index aa73e849427b..f0cdd9a0d3c2 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -143,7 +143,7 @@ def get_pad_tuple(padding, kernel): pad_h = padding[0] * 2 pad_w = padding[1] * 2 elif len(padding) == 4: - return padding[0], padding[1], padding[2], padding[3] + return padding[0], padding[1], padding[2], padding[3] else: raise ValueError("Size of padding can only be 2 or 4") elif isinstance(padding, int): diff --git a/topi/python/topi/opengl/conv2d_nchw.py b/topi/python/topi/opengl/conv2d_nchw.py index e39d1ad805b0..52ed11972e6f 100644 --- a/topi/python/topi/opengl/conv2d_nchw.py +++ b/topi/python/topi/opengl/conv2d_nchw.py @@ -18,9 +18,7 @@ """Schedule for conv2d_nchw with auto fusion""" import tvm from .. import tag -from .. import generic -@generic.schedule_conv2d_nchw.register(["opengl"]) def schedule_conv2d_nchw(outs): """Schedule for conv2d_nchw. diff --git a/topi/python/topi/opengl/dense.py b/topi/python/topi/opengl/dense.py index c93dfccbeece..db2c4a677904 100644 --- a/topi/python/topi/opengl/dense.py +++ b/topi/python/topi/opengl/dense.py @@ -19,9 +19,7 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic -@generic.schedule_dense.register(["opengl"]) def schedule_dense(outs): """Schedule for dense operator. diff --git a/topi/python/topi/opengl/injective.py b/topi/python/topi/opengl/injective.py index d3ebc943b962..28dc87d1a5fb 100644 --- a/topi/python/topi/opengl/injective.py +++ b/topi/python/topi/opengl/injective.py @@ -17,9 +17,7 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm -from .. import generic -@generic.schedule_injective_from_existing.register(["opengl"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -38,7 +36,6 @@ def schedule_injective_from_existing(sch, out): sch[out].opengl() return sch -@generic.schedule_injective.register(["opengl"]) def schedule_injective(outs): """Schedule for injective op. 
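For reference, the get_pad_tuple hunk above appears to be an indentation-only fix; the function's contract is unchanged. A small usage sketch of the assumed semantics, derived from the code shown (the "SAME"/int branches are not part of this hunk and are an assumption about the surrounding function):

    from topi.nn.util import get_pad_tuple

    # Normalizes every accepted padding form to
    # (pad_top, pad_left, pad_down, pad_right) for a given kernel size.
    get_pad_tuple(1, (3, 3))             # int            -> (1, 1, 1, 1)
    get_pad_tuple((1, 2), (3, 3))        # [pad_h, pad_w] -> (1, 2, 1, 2)
    get_pad_tuple((0, 1, 2, 3), (3, 3))  # 4 ints are returned as-is
    get_pad_tuple("SAME", (3, 3))        # from kernel    -> (1, 1, 1, 1)
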
diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 04c7b0cd0002..3226422048e5 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -18,9 +18,7 @@ """Schedule for pooling operators""" import tvm from .. import tag -from .. import generic -@generic.schedule_adaptive_pool.register(["opengl"]) def schedule_adaptive_pool(outs): """Schedule for adaptive pool. @@ -69,7 +67,6 @@ def traverse(OP): return s -@generic.schedule_pool.register(["opengl"]) def schedule_pool(outs, layout): """Schedule for pool. diff --git a/topi/python/topi/opengl/softmax.py b/topi/python/topi/opengl/softmax.py index e343d4513241..ff218d13c2b1 100644 --- a/topi/python/topi/opengl/softmax.py +++ b/topi/python/topi/opengl/softmax.py @@ -17,9 +17,7 @@ # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" import tvm -from .. import generic -@generic.schedule_softmax.register(["opengl"]) def schedule_softmax(outs): """Schedule for softmax op. diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py index be29c6f6b0cc..ce56dc4e0847 100644 --- a/topi/python/topi/rocm/conv2d.py +++ b/topi/python/topi/rocm/conv2d.py @@ -14,19 +14,17 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name +# pylint: disable=invalid-name, unused-argument """Compute definition for conv2d with rocm backend""" -import tvm from tvm import autotvm from tvm.contrib import miopen -from .. import nn, generic +from .. import generic from ..util import get_const_tuple -from ..cuda.conv2d import conv2d_cuda, schedule_conv2d_nchw_cuda from ..nn.util import get_pad_tuple -@autotvm.register_topi_compute(nn.conv2d, 'rocm', ['direct', 'winograd']) -def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'): +@autotvm.register_topi_compute("conv2d_nchw_miopen.rocm") +def conv2d_nchw_miopen(cfg, data, kernel, strides, padding, dilation, out_dtype='float32'): """Conv2D operator for rocm backend. Parameters @@ -57,39 +55,34 @@ def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou 4-D with shape [batch, out_channel, out_height, out_width] """ - target = tvm.target.Target.current() - if "miopen" in target.libs: - assert layout == 'NCHW', "Only NCHW layout is supported." 
- CO, CI, KH, KW = get_const_tuple(kernel.shape) - N, _, H, W = get_const_tuple(data.shape) - - # handle dilation - stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides - pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) - pad_h, pad_w = pt + pb, pl + pr - dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation - - OH = (H + 2 * pad_h - KH) // stride_h + 1 - OW = (W + 2 * pad_w - KW) // stride_w + 1 - cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ - ((KW - 1) * dilation_w + 1)) - - return miopen.conv2d_forward(data, - kernel, - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - conv_mode=0, - data_type=1) - - return conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'rocm', ["direct", 'winograd']) -def schedule_conv2d_nchw_rocm(cfg, outs): + CO, CI, KH, KW = get_const_tuple(kernel.shape) + N, _, H, W = get_const_tuple(data.shape) + + # handle dilation + stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides + pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) + pad_h, pad_w = pt + pb, pl + pr + dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation + + OH = (H + 2 * pad_h - KH) // stride_h + 1 + OW = (W + 2 * pad_w - KW) // stride_w + 1 + cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ + ((KW - 1) * dilation_w + 1)) + + return miopen.conv2d_forward(data, + kernel, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + conv_mode=0, + data_type=1) + + +@autotvm.register_topi_schedule("conv2d_nchw_miopen.rocm") +def schedule_conv2d_nchw_miopen(cfg, outs): """TOPI schedule callback of conv2d for rocm Parameters @@ -106,8 +99,4 @@ def schedule_conv2d_nchw_rocm(cfg, outs): s: Schedule The computation schedule for conv2d. """ - target = tvm.target.Target.current() - if target and "miopen" in target.libs: - return generic.schedule_extern(outs) - - return schedule_conv2d_nchw_cuda(cfg, outs) + return generic.schedule_extern(outs) diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py index f2adeaabef61..8729a62bd677 100644 --- a/topi/python/topi/rocm/dense.py +++ b/topi/python/topi/rocm/dense.py @@ -20,13 +20,12 @@ import tvm from tvm import autotvm from tvm.contrib import rocblas -import topi -from ..nn.dense import dense, dense_default +from .. import generic, nn from .. import tag -from .. import generic +from ..util import traverse_inline -@autotvm.register_topi_compute(dense, "rocm", "direct") -def dense_rocm(cfg, data, weight, bias=None, out_dtype=None): +@autotvm.register_topi_compute('dense.rocm') +def dense(cfg, data, weight, bias=None, out_dtype=None): """Dense operator for rocm backend. Parameters @@ -54,21 +53,10 @@ def dense_rocm(cfg, data, weight, bias=None, out_dtype=None): assert len(bias.shape) == 1 if out_dtype is None: out_dtype = data.dtype - batch, in_dim = data.shape - out_dim, _ = weight.shape - target = tvm.target.Target.current() - if "rocblas" in target.libs: - assert out_dtype == data.dtype, "Mixed precision not supported." 
-        matmul = rocblas.matmul(data, weight, False, True)
-        if bias is not None:
-            matmul = tvm.compute((batch, out_dim), \
-                                 lambda i, j: matmul[i, j] + bias[j], \
-                                 tag=tag.BROADCAST)
-        return matmul
-    return dense_default(data, weight, bias, out_dtype)
-
-
-@autotvm.register_topi_schedule(generic.schedule_dense, "rocm", "direct")
+    return nn.dense(data, weight, bias, out_dtype)
+
+
+@autotvm.register_topi_schedule('dense.rocm')
 def schedule_dense(cfg, outs):
     """Schedule for dense operator.
 
@@ -83,7 +71,72 @@
     s: Schedule
         The computation schedule for dense.
     """
-    target = tvm.target.Target.current()
-    if target.target_name == "rocm" and "rocblas" in target.libs:
-        return generic.schedule_extern(outs)
-    return topi.cuda.schedule_dense(cfg, outs)
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == 'dense':
+            Dense = op.output(0)
+            num_thread = 64
+            k = Dense.op.reduce_axis[0]
+            ko, kf = s[Dense].split(k, factor=num_thread)
+            DenseF = s.rfactor(Dense, kf)
+
+            if Dense.op in s.outputs:
+                Out = Dense
+            else:
+                Out = outs[0].op.output(0)
+                s[Dense].compute_at(s[Out], s[Out].op.axis[1])
+            s[Out].bind(s[Out].op.axis[0], tvm.thread_axis("blockIdx.y"))
+            s[Out].bind(s[Out].op.axis[1], tvm.thread_axis("blockIdx.x"))
+
+            tx = s[Dense].op.reduce_axis[0]
+            thread_x = tvm.thread_axis("threadIdx.x")
+            s[Dense].bind(tx, thread_x)
+            s[DenseF].compute_at(s[Dense], tx)
+            s[Dense].set_store_predicate(thread_x.var.equal(0))
+            s[Out].set_store_predicate(thread_x.var.equal(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+@autotvm.register_topi_compute('dense_rocblas.rocm')
+def dense_rocblas(cfg, data, weight, bias=None, out_dtype=None):
+    """Dense operator for rocm backend with rocBLAS.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        2-D with shape [batch, in_dim]
+
+    weight : tvm.Tensor
+        2-D with shape [out_dim, in_dim]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [out_dim]
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [batch, out_dim]
+    """
+    assert out_dtype == data.dtype, "Mixed precision not supported."
+    matmul = rocblas.matmul(data, weight, False, True)
+    batch, in_dim = data.shape
+    out_dim, _ = weight.shape
+    cfg.add_flop(batch * in_dim * out_dim * 2)
+    if bias is not None:
+        matmul = tvm.compute((batch, out_dim),
+                             lambda i, j: matmul[i, j] + bias[j],
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+@autotvm.register_topi_schedule('dense_rocblas.rocm')
+def schedule_dense_rocblas(_, outs):
+    """Schedule for dense operator with rocBLAS"""
+    return generic.schedule_extern(outs)
diff --git a/topi/python/topi/rocm/nn.py b/topi/python/topi/rocm/nn.py
index 8a9c8c393da6..5f134cb32c98 100644
--- a/topi/python/topi/rocm/nn.py
+++ b/topi/python/topi/rocm/nn.py
@@ -17,12 +17,7 @@
 """scheduler for normalization functions on rocm backend"""
 from __future__ import absolute_import as _abs
 
-import tvm
-from .. import generic
 from ..
import cpp -@generic.schedule_lrn.register(["rocm", "gpu"]) def schedule_lrn(outs): - target = tvm.target.Target.current(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.rocm.schedule_lrn(cpp_target, outs) + return cpp.rocm.schedule_lrn(outs) diff --git a/topi/python/topi/sort.py b/topi/python/topi/sort.py index 22899c4232f7..96a088923d2d 100644 --- a/topi/python/topi/sort.py +++ b/topi/python/topi/sort.py @@ -20,7 +20,6 @@ from tvm import api from .util import get_const_tuple -@tvm.target.generic_func def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): """Performs sorting along the given axis and returns an array of indices having the same shape as an input array that index @@ -99,7 +98,6 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): return out -@tvm.target.generic_func def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): """Get the top k elements in an input tensor along the given axis. diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 91b7dc5bc60c..b0f4752ad492 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -53,3 +53,5 @@ from .depth_to_space import depth_to_space_python from .space_to_depth import space_to_depth_python from .crop_and_resize_python import crop_and_resize_python +from .common import get_injective_schedule, get_reduce_schedule, get_broadcast_schedule, \ + get_elemwise_schedule, get_conv2d_nchw_implement, dispatch diff --git a/topi/python/topi/testing/common.py b/topi/python/topi/testing/common.py new file mode 100644 index 000000000000..5817513f7f65 --- /dev/null +++ b/topi/python/topi/testing/common.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name +"""Common utility for topi test""" + +import tvm +import topi + +_injective_schedule = { + "generic": topi.generic.schedule_injective, + "cpu": topi.x86.schedule_injective, + "arm_cpu": topi.arm_cpu.schedule_injective, + "gpu": topi.cuda.schedule_injective, + "hls": topi.hls.schedule_injective, + "opengl": topi.opengl.schedule_injective +} + +_reduce_schedule = { + "generic": topi.generic.schedule_reduce, + "cpu": topi.x86.schedule_reduce, + "gpu": topi.cuda.schedule_reduce, + "hls": topi.cuda.schedule_reduce +} + +def dispatch(target, dispatch_map): + if isinstance(target, str): + target = tvm.target.create(target) + assert isinstance(target, tvm.target.Target) + for key in target.keys: + if key in dispatch_map: + return dispatch_map[key] + return dispatch_map["generic"] + +def get_injective_schedule(target): + return dispatch(target, _injective_schedule) + +def get_reduce_schedule(target): + return dispatch(target, _reduce_schedule) + +get_broadcast_schedule = get_injective_schedule +get_elemwise_schedule = get_injective_schedule + +_conv2d_nchw_implement = { + "generic": (topi.nn.conv2d_nchw, topi.generic.schedule_conv2d_nchw), + "cpu": (topi.x86.conv2d_nchw, topi.x86.schedule_conv2d_nchw), + "arm_cpu": (topi.arm_cpu.conv2d_nchw_spatial_pack, + topi.arm_cpu.schedule_conv2d_nchw_spatial_pack), + "gpu": (topi.cuda.conv2d_nchw, topi.cuda.schedule_conv2d_nchw), + "mali": (topi.mali.conv2d_nchw_spatial_pack, + topi.mali.schedule_conv2d_nchw_spatial_pack), + "bifrost": (topi.bifrost.conv2d_nchw_spatial_pack, + topi.bifrost.schedule_conv2d_nchw_spatial_pack), + "opengl": (topi.nn.conv2d_nchw, topi.opengl.schedule_conv2d_nchw), + "intel_graphics": (topi.intel_graphics.conv2d_nchw, + topi.intel_graphics.schedule_conv2d_nchw), + "hls": (topi.nn.conv2d_nchw, topi.hls.schedule_conv2d_nchw) +} + +def get_conv2d_nchw_implement(target): + return dispatch(target, _conv2d_nchw_implement) diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index 5bb36f7dfa74..c171f8ca5fe3 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -116,7 +116,7 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): out_tensor[i, j, k] = -one return valid_count, out_tensor -@tvm.target.generic_func + def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): """Get valid count of bounding boxes given a score threshold. Also moves valid boxes to the top of input data. @@ -289,7 +289,6 @@ def hybrid_nms(data, sorted_index, valid_count, return output, box_indices -@tvm.target.generic_func def non_max_suppression(data, valid_count, max_output_size=-1, iou_threshold=0.5, force_suppress=False, top_k=-1, coord_start=2, score_index=1, id_index=0, diff --git a/topi/python/topi/vision/rcnn/proposal.py b/topi/python/topi/vision/rcnn/proposal.py index d48c89078ec0..5de4998c066c 100644 --- a/topi/python/topi/vision/rcnn/proposal.py +++ b/topi/python/topi/vision/rcnn/proposal.py @@ -317,7 +317,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): body = ib.get() return body -@tvm.target.generic_func + def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss): """Proposal operator. 
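The new topi/testing/common.py above replaces the removed implicit generic_func dispatch with explicit lookup in the tests. A sketch of the intended test-side usage; the "llvm" target and the relu workload are placeholders, not taken from this patch:

    import tvm
    import topi
    import topi.testing

    A = tvm.placeholder((1, 64), name="A")
    B = topi.nn.relu(A)

    target = "llvm"
    with tvm.target.create(target):
        # dispatch() walks target.keys ("cpu" for llvm) and falls back
        # to the "generic" entry when no key matches.
        s = topi.testing.get_injective_schedule(target)(B)
        fcompute, fschedule = topi.testing.get_conv2d_nchw_implement(target)
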
diff --git a/topi/python/topi/vision/rcnn/roi_align.py b/topi/python/topi/vision/rcnn/roi_align.py index a6540b3666a5..a0bc5e291597 100644 --- a/topi/python/topi/vision/rcnn/roi_align.py +++ b/topi/python/topi/vision/rcnn/roi_align.py @@ -21,7 +21,6 @@ from ...cpp.util import bilinear_sample_nchw -@tvm.target.generic_func def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): """ROI align operator in NCHW layout. diff --git a/topi/python/topi/vision/rcnn/roi_pool.py b/topi/python/topi/vision/rcnn/roi_pool.py index 53ffe35e7e1b..f346f580b3ba 100644 --- a/topi/python/topi/vision/rcnn/roi_pool.py +++ b/topi/python/topi/vision/rcnn/roi_pool.py @@ -19,7 +19,6 @@ import tvm from ...util import get_const_tuple -@tvm.target.generic_func def roi_pool_nchw(data, rois, pooled_size, spatial_scale): """ROI pool operator in NCHW layout. diff --git a/topi/python/topi/vision/reorg.py b/topi/python/topi/vision/reorg.py index 7adfc73d9be1..3ba5e8495a22 100644 --- a/topi/python/topi/vision/reorg.py +++ b/topi/python/topi/vision/reorg.py @@ -20,10 +20,8 @@ Reorg operator, used in darknet. """ from __future__ import absolute_import as _abs -import tvm from .. import cpp -@tvm.target.generic_func def reorg(data, stride): """Reorg forward operators. diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 8c31f823cbe4..4309af4303f1 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -89,7 +89,6 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): return output -@tvm.target.generic_func def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. @@ -233,7 +232,6 @@ def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, return out_loc, valid_count -@tvm.target.generic_func def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection @@ -267,7 +265,6 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 tvm.const(threshold, "float32"), tvm.convert(variances)) -@tvm.target.generic_func def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): """Convert multibox detection predictions. 
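The three multibox ops above keep their roles after the decorator removal; they compose into the usual SSD detection head. A shape-level sketch, with all sizes hypothetical and chosen only so the anchor count 4*4*(2+3-1) = 64 stays consistent across the three tensors:

    import tvm
    import topi

    feature = tvm.placeholder((1, 32, 4, 4), name="feature")
    # one anchor per (size, ratio) combination: len(sizes) + len(ratios) - 1 = 4
    anchor = topi.vision.ssd.multibox_prior(feature, sizes=(0.5, 0.25),
                                            ratios=(1, 2, 0.5))  # (1, 64, 4)

    cls_prob = tvm.placeholder((1, 21, 64), name="cls_prob")  # 21 classes
    loc_pred = tvm.placeholder((1, 256), name="loc_pred")     # 64 boxes * 4
    # decode the box regressions against the anchors, then apply NMS
    out = topi.vision.ssd.multibox_detection(cls_prob, loc_pred, anchor)
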
diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py index d1c728d7b75c..ce07c194268a 100644 --- a/topi/python/topi/x86/__init__.py +++ b/topi/python/topi/x86/__init__.py @@ -19,9 +19,9 @@ """x86 specific declaration and schedules.""" from __future__ import absolute_import as _abs -from .conv1d import schedule_conv1d_nwc -from .conv2d import schedule_conv2d, schedule_conv2d_nhwc -from .conv3d import schedule_conv3d_ndhwc +from .conv1d import * +from .conv2d import * +from .conv3d import * from .binarize_pack import schedule_binarize_pack from .binary_dense import schedule_binary_dense from .nn import * @@ -29,12 +29,12 @@ from .injective import * from .reduction import * from .pooling import schedule_pool, schedule_adaptive_pool -from .bitserial_conv2d import schedule_bitserial_conv2d -from .bitserial_dense import schedule_bitserial_dense -from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc -from .dense import _schedule_dense, _schedule_dense_pack, _schedule_dense_nopack -from .batch_matmul import schedule_batch_matmul +from .bitserial_conv2d import * +from .bitserial_dense import * +from .depthwise_conv2d import * +from .dense import * +from .batch_matmul import * from .roi_align import roi_align_nchw -from .conv2d_transpose import _schedule_conv2d_transpose_nchw +from .conv2d_transpose import * from .sparse import * from .conv2d_alter_op import * diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py index fef6c48d6bed..a7cb9e98f11f 100644 --- a/topi/python/topi/x86/batch_matmul.py +++ b/topi/python/topi/x86/batch_matmul.py @@ -21,12 +21,12 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cblas -from .. import generic, nn +from .. import generic from ..util import traverse_inline, get_const_tuple, get_max_power2_factor -@autotvm.register_topi_compute(nn.batch_matmul, "cpu", "direct") -def _declaration_batch_matmul_nopack(cfg, x, y): +@autotvm.register_topi_compute("batch_matmul.x86") +def batch_matmul(cfg, x, y): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. @@ -43,10 +43,6 @@ def _declaration_batch_matmul_nopack(cfg, x, y): output : tvm.Tensor 3-D with shape [batch, M, N] """ - target = tvm.target.Target.current() - if "cblas" in target.libs: - return cblas.batch_matmul(x, y, False, True) - assert len(x.shape) == 3 and len( y.shape) == 3, "only support 3-dim batch_matmul" XB, M, XK = get_const_tuple(x.shape) @@ -56,7 +52,7 @@ def _declaration_batch_matmul_nopack(cfg, x, y): B = XB K = XK if cfg.is_fallback: - _default_batch_matmul_nopack_config(cfg, M, N, K) + _default_batch_matmul_config(cfg, M, N, K) k = tvm.reduce_axis((0, K), name='k') C = tvm.compute( @@ -66,7 +62,7 @@ def _declaration_batch_matmul_nopack(cfg, x, y): return C -@autotvm.register_topi_schedule(generic.schedule_batch_matmul, "cpu", "direct") +@autotvm.register_topi_schedule("batch_matmul.x86") def schedule_batch_matmul(cfg, outs): """Schedule for batch_matmul @@ -83,10 +79,6 @@ def schedule_batch_matmul(cfg, outs): sch: Schedule The computation schedule for the op. 
""" - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - s = tvm.create_schedule([x.op for x in outs]) def _callback(op): @@ -131,9 +123,42 @@ def _callback(op): return s -def _default_batch_matmul_nopack_config(cfg, M, N, K): +def _default_batch_matmul_config(cfg, M, N, K): cfg["tile_k"] = SplitEntity([K // 16, 16]) x_bn = get_max_power2_factor(N, 8) cfg["tile_x"] = SplitEntity([N // x_bn, x_bn]) y_bn = get_max_power2_factor(M, 8) cfg["tile_y"] = SplitEntity([M // y_bn, y_bn]) + + +@autotvm.register_topi_compute("batch_matmul_cblas.x86") +def batch_matmul_cblas(cfg, x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + cfg : ConfigSpace + Autotvm tuning space config file + x : tvm.Tensor + 3-D with shape [batch, M, K] + y : tvm.Tensor + 3-D with shape [batch, N, K] + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ + assert len(x.shape) == 3 and len( + y.shape) == 3, "only support 3-dim batch_matmul" + XB, M, XK = get_const_tuple(x.shape) + YB, N, YK = get_const_tuple(y.shape) + assert XB == YB, "batch dimension doesn't match" + assert XK == YK, "shapes of x and y is inconsistant" + cfg.add_flop(XB * M * N * XK * 2) + return cblas.batch_matmul(x, y, False, True) + + +@autotvm.register_topi_schedule("batch_matmul_cblas.x86") +def schedule_batch_matmul_cblas(_, outs): + return generic.schedule_extern(outs) diff --git a/topi/python/topi/x86/binarize_pack.py b/topi/python/topi/x86/binarize_pack.py index ea2bbed7345e..bab91a940edc 100644 --- a/topi/python/topi/x86/binarize_pack.py +++ b/topi/python/topi/x86/binarize_pack.py @@ -18,10 +18,8 @@ """Schedule for binarization and bit-packing.""" from __future__ import absolute_import as _abs import tvm -from .. import generic -@generic.schedule_binarize_pack.register(["cpu"]) def schedule_binarize_pack(outs): """Schedule for binarize_pack. diff --git a/topi/python/topi/x86/binary_dense.py b/topi/python/topi/x86/binary_dense.py index abf090889ec3..ccf74e7bd230 100644 --- a/topi/python/topi/x86/binary_dense.py +++ b/topi/python/topi/x86/binary_dense.py @@ -19,10 +19,8 @@ from __future__ import absolute_import as _abs import tvm from .. import tag -from .. import generic -@generic.schedule_binary_dense.register(["cpu"]) def schedule_binary_dense(outs): """Schedule for binary_dense. diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 97d0dc0eefaa..2ec565375654 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -18,12 +18,237 @@ """Bitserial conv2d schedule on x86""" import tvm from tvm import autotvm -from topi.util import get_const_int -from .. import generic, tag +from .. import tag +from ..util import get_const_int, get_const_tuple +from ..nn.pad import pad +from ..nn.util import get_pad_tuple +from ..nn.bitserial_util import bitpack, binary_op_multiplier + +@autotvm.register_topi_compute("bitserial_conv2d_nchw.x86") +def bitserial_conv2d_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype='uint32', out_dtype='int16', unipolar=True): + """ Compute convolution with pack on spatial axes. 
""" + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + # Check if kernel is already bitpacked + if len(kernel.shape) == 4: + kernel_q = bitpack(kernel, weight_bits, pack_axis=1, bit_axis=0, pack_type=pack_dtype) + KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) + else: + kernel_vec = kernel + OCO, _, KH, KW, KB, VC = get_const_tuple(kernel_vec.shape) + CO = OCO * VC + + IB, N, CI, H, W = get_const_tuple(data_q.shape) + KB, CO, _, KH, KW = get_const_tuple(kernel_q.shape) + + if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): + TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) + else: + TPAD, LPAD, DPAD, RPAD = padding + pad_before = [0, 0, 0, TPAD, LPAD] + pad_after = [0, 0, 0, DPAD, RPAD] -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nchw, ['cpu'], 'direct') -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_conv2d_nhwc, ['cpu'], 'direct') -def schedule_bitserial_conv2d(cfg, outs): + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + TH = H + TPAD + DPAD + TW = W + LPAD + RPAD + OH = (H + TPAD + DPAD - KH) // HSTR + 1 + OW = (W + LPAD + RPAD - KW) // WSTR + 1 + + # ==================== define configuration space ==================== + n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) + ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) + ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) + + co, vc = cfg.define_split('tile_co', co, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') + + cfg.define_reorder("reorder_0", + [n, co, oh, ow, vc, vh, vw, kh, kw, kb, ib, ci], + policy='interval_all', interval=(6, 11)) + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) + # ==================== + + VC = cfg["tile_co"].size[-1] + VH = cfg["tile_oh"].size[-1] + VW = cfg["tile_ow"].size[-1] + + dvshape = (1, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT, IB) + kvshape = (CO//VC, CI, KH, KW, KB, VC) + ovshape = (1, CO//VC, OH//VH, OW//VW, VH, VW, VC) + oshape = (1, CO, OH, OW) + + if (TPAD != 0 and RPAD != 0): + data_pad = pad(data_q, pad_before, pad_after, name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ + data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') + + if len(kernel.shape) == 4: + kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ + kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, co, h, w, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if unipolar: + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & + kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - + tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) 
+ & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, + axis=[ci, dh, dw, b1, b2]) + + return tvm.sum((tvm.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & + kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, + axis=[ci, dh, dw, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv_out') + idxd = tvm.indexdiv + idxm = tvm.indexmod + + return tvm.compute( + oshape, lambda n, co, h, w: + conv[n, + idxd(co, VC), idxd(h, VH), idxd(w, VW), + idxm(h, VH), idxm(w, VW), idxm(co, VC)], + name='conv_vec', tag='spatial_bitserial_conv_nchw') + +@autotvm.register_topi_compute("bitserial_conv2d_nhwc.x86") +def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bits, + pack_dtype='uint32', out_dtype='int16', unipolar=True): + """ Compute convolution with pack on spatial axes. """ + assert data.shape[0].value == 1, "spatial pack convolution only support batch size=1" + data_q = bitpack(data, in_bits, pack_axis=3, bit_axis=4, pack_type=pack_dtype) + pack_kernel = len(kernel.shape) == 4 + + if pack_kernel: + kernel_q = bitpack(kernel, weight_bits, pack_axis=2, bit_axis=4, pack_type=pack_dtype) + else: + kernel_q = kernel + + KH, KW, _, CO, KB = get_const_tuple(kernel_q.shape) + N, H, W, CI, IB = get_const_tuple(data_q.shape) + + if isinstance(padding, int) or (isinstance(padding, (tuple, list)) and len(padding) == 2): + TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel) + else: + TPAD, LPAD, DPAD, RPAD = padding + pad_before = [0, TPAD, LPAD, 0, 0] + pad_after = [0, DPAD, RPAD, 0, 0] + + if isinstance(stride, (tuple, list)): + HSTR, WSTR = stride + else: + HSTR, WSTR = stride, stride + HCAT, WCAT = KH-1, KW-1 + + PAD_H = H + (TPAD + DPAD) + PAD_W = W + (LPAD + RPAD) + OH = (PAD_H - KH) // HSTR + 1 + OW = (PAD_W - KW) // WSTR + 1 + oshape = (1, OH, OW, CO) + + # ==================== define configuration space ==================== + n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO) + ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) + ib, kb = cfg.reduce_axis(in_bits), cfg.reduce_axis(weight_bits) + + co, vc = cfg.define_split('tile_co', co, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2, + filter=lambda x: max(x.size[1:]) <= 16) + cfg.define_annotate('ann_reduce', [ib, kb, kh, kw], policy='try_unroll') + cfg.define_reorder("reorder_0", + [n, oh, ow, co, vh, vw, kh, kw, kb, ib, vc, ci], + policy='interval_all', interval=(3, 7)) + # binary ops + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW * binary_op_multiplier(pack_dtype)) + # ==================== + + VC = cfg["tile_co"].size[-1] + VH = cfg["tile_oh"].size[-1] + VW = cfg["tile_ow"].size[-1] + + dvshape = (1, PAD_H//(VH*HSTR), PAD_W//(VW*WSTR), VH*HSTR+HCAT, VW*WSTR+WCAT, CI, IB) + kvshape = (CO, KH, KW, CI, VC, KB) + ovshape = (1, OH, OW, CO, VH, VW, VC) + oshape = (1, OH, OW, CO) + + if (DPAD != 0 and RPAD != 0): + data_pad = pad(data_q, pad_before, pad_after, name="data_pad") + else: + data_pad = data_q + + data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ + kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') + + ci = tvm.reduce_axis((0, CI), name='ci') + dh = tvm.reduce_axis((0, KH), name='dh') + dw = 
tvm.reduce_axis((0, KW), name='dw') + b1 = tvm.reduce_axis((0, IB), name='ib') + b2 = tvm.reduce_axis((0, KB), name='kb') + + def _conv(n, h, w, co, vh, vw, vc): + b1b2 = (b1+b2).astype(out_dtype) + if unipolar: + return tvm.sum( + ((tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - + tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1]& + ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), + axis=[dh, dw, ci, b1, b2]) + + return tvm.sum(tvm.popcount( + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, + axis=[dh, dw, ci, b1, b2]) + + conv = tvm.compute(ovshape, _conv, name='conv') + + idxd = tvm.indexdiv + idxm = tvm.indexmod + return tvm.compute( + oshape, lambda n, h, w, co: + conv[n, + idxd(h, VH), idxd(w, VW), idxd(co, VC), + idxm(h, VH), idxm(w, VW), idxm(co, VC)], + name='output_unpack', tag='spatial_bitserial_conv_nhwc') + +@autotvm.register_topi_schedule("bitserial_conv2d_nchw.x86") +def schedule_bitserial_conv2d_nchw(cfg, outs): + return _schedule_bitserial_conv2d(cfg, outs) + +@autotvm.register_topi_schedule("bitserial_conv2d_nhwc.x86") +def schedule_bitserial_conv2d_nhwc(cfg, outs): + return _schedule_bitserial_conv2d(cfg, outs) + +def _schedule_bitserial_conv2d(cfg, outs): """CPU schedule for bitserial convolutions NCHW and NHWC""" s = tvm.create_schedule([x.op for x in outs]) scheduled_ops = [] diff --git a/topi/python/topi/x86/bitserial_dense.py b/topi/python/topi/x86/bitserial_dense.py index 47b972fa1319..d464cae951b3 100644 --- a/topi/python/topi/x86/bitserial_dense.py +++ b/topi/python/topi/x86/bitserial_dense.py @@ -19,11 +19,85 @@ from __future__ import absolute_import as _abs import tvm from tvm import autotvm -from topi.util import get_const_int +from topi.util import get_const_int, get_const_tuple from .. import tag -from .. import generic +from ..nn.bitserial_util import bitpack, binary_op_multiplier -@autotvm.register_topi_schedule(generic.nn.schedule_bitserial_dense, ['cpu'], 'direct') +@autotvm.register_topi_compute('bitserial_dense.x86') +def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint32', + out_dtype='int16', unipolar=True): + """Bitserial dense implementation. 
TODO: Why are these separate
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        2-D with shape [batch, in_dim]
+    weight : tvm.Tensor
+        2-D with shape [out_dim, in_dim] or
+        3-D with shape [out_dim, weight_bits, in_dim]
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [batch, out_dim]
+    """
+    data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
+    if len(weight.shape) == 2:
+        weight_packed = bitpack(weight, weight_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype)
+    else:
+        weight_packed = weight
+    Y, DB, K = get_const_tuple(data_packed.shape)
+    X, WB, _ = get_const_tuple(weight_packed.shape)
+    ######## Search space
+    x, y = cfg.axis(X), cfg.axis(Y)
+    db, wb, k = cfg.reduce_axis(DB), cfg.reduce_axis(WB), cfg.reduce_axis(K)
+    ko, ki = cfg.define_split('tile_k', k, num_outputs=2)
+    yo, yi = cfg.define_split('tile_y', y, num_outputs=2)
+    xo, xi = cfg.define_split('tile_x', x, num_outputs=2)
+
+    cfg.define_reorder('reorder_0', [yo, xo, ko, yi, wb, db, ki, xi],
+                       policy='candidate', candidate=[
+                           [yo, xo, ko, yi, wb, db, ki, xi],
+                           [yo, xo, yi, ko, wb, db, ki, xi]])
+
+    cfg.define_annotate('ann_reduce', [db, wb], policy='try_unroll')
+    cfg.define_annotate('ann_spatial', [yi, xi], policy='try_unroll_vec')
+
+    ###### Compute rule
+    VX = cfg['tile_x'].size[-1]
+
+    wvshape = (X//VX, WB, VX, K)
+    oshape = (Y, X)
+
+    k = tvm.reduce_axis((0, K), name='k')
+    db = tvm.reduce_axis((0, DB), name='db')
+    wb = tvm.reduce_axis((0, WB), name='wb')
+
+    # Tile data and weights
+    weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k:
+                             weight_packed[xo*VX+vx][wb][k], name='weight_vec')
+
+    idxdiv = tvm.indexdiv
+    idxmod = tvm.indexmod
+
+    matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum(
+        (tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) -
+         tvm.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k])
+        ).astype(out_dtype)
+        << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar')
+
+    matmul = tvm.compute(oshape, lambda i, j: tvm.sum(
+        tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]
+                    ).astype(out_dtype)
+        << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense')
+
+    # binary ops
+    cfg.add_flop(2 * Y * X * K * binary_op_multiplier(pack_dtype))
+
+    if unipolar:
+        return matmul_unipolar
+    return matmul
+
+@autotvm.register_topi_schedule('bitserial_dense.x86')
 def schedule_bitserial_dense(cfg, outs):
     """Schedule for bitserial_dense.
diff --git a/topi/python/topi/x86/conv1d.py b/topi/python/topi/x86/conv1d.py
index 95fd159acd47..70c2a6881dbf 100644
--- a/topi/python/topi/x86/conv1d.py
+++ b/topi/python/topi/x86/conv1d.py
@@ -18,10 +18,9 @@
 """Conv1D schedule on for Intel CPU"""
 from __future__ import absolute_import as _abs
 import tvm
-from .. import generic, tag
+from .. import tag
 
-@generic.schedule_conv1d_ncw.register(["cpu"])
 def schedule_conv1d_ncw(outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -76,7 +75,6 @@ def traverse(op):
     return s
 
 
-@generic.schedule_conv1d_nwc.register(["cpu"])
 def schedule_conv1d_nwc(outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 95ce3376ac3a..2403b01b7453 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -14,25 +14,20 @@
 # KIND, either express or implied.
See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member,import-outside-toplevel +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Conv2D schedule on x86""" import logging -import re import tvm from tvm import autotvm -from tvm.autotvm.task.topi_integration import deserialize_args -from tvm.autotvm.task import get_config -from .. import generic, tag from .. import nn -from ..nn.conv2d import conv2d, conv2d_NCHWc, \ - conv2d_infer_layout, _get_workload as _get_conv2d_workload +from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload +from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload -from ..nn.pad import pad from ..nn.util import get_pad_tuple -from ..util import get_const_tuple - +from ..util import get_const_tuple, traverse_inline from . import conv2d_avx_1x1, conv2d_avx_common logger = logging.getLogger('topi') @@ -61,219 +56,30 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth else: conv2d_avx_common._fallback_schedule(cfg, wkl) -def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout): - """Create schedule configuration from input arguments""" - dshape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - pat = re.compile(r'NCHW.+(\d+)c') - if layout == 'NCHW': - n, ic, h, w = dshape - oc, _, kh, kw = kshape - elif layout == 'NHWC': - n, h, w, ic = dshape - kh, kw, oc, _ = kshape - elif pat.match(layout) is not None: - n, ic_chunk, h, w, ic_bn = dshape - target = tvm.target.Target.current(allow_none=False) - oc_chunk, k_ic_chunk, kh, kw, k_ic_bn, oc_bn = kshape - assert ic_chunk == k_ic_chunk - assert ic_bn == k_ic_bn - ic = ic_chunk*ic_bn - oc = oc_chunk*oc_bn - else: - raise ValueError("Not support this layout {} with " - "schedule template.".format(layout)) - - is_kernel_1x1 = kh == 1 and kw == 1 - pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw)) - sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (h - kh + pt + pb) // sh + 1 - ow = (w - kw + pl + pr) // sw + 1 - - # Create schedule config - cfg.define_split("tile_ic", ic, num_outputs=2) - cfg.define_split("tile_oc", oc, num_outputs=2) - cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) - if is_kernel_1x1: - cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) - else: - cfg.define_knob("unroll_kw", [True, False]) - - -@autotvm.register_topi_compute(conv2d, 'cpu', ['direct']) -def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - out_dtype = data.dtype if out_dtype is None else out_dtype - strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) - dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) - - if layout == 'NCHW': - _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) - if cfg.is_fallback: - _get_default_config(cfg, data, kernel, strides, padding, out_dtype) - return _declaration_conv_impl(cfg, data, kernel, strides, - padding, dilation, layout, out_dtype) - - # HWOI kernel layout is for NHWC and HWCN - kh, kw, _, _ = get_const_tuple(kernel.shape) - if layout == 'HWCN': - return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - # FIXME - 
https://github.com/apache/incubator-tvm/issues/4122 - # _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO - # layout. Commenting until we have clarity about the nhwc_pack implementation from the author. - # elif layout == 'NHWC' and kh == 1 and kw == 1 and kernel.dtype == "int8": - # if cfg.is_fallback: - # _get_default_config(cfg, data, kernel, strides, padding, out_dtype, False, layout) - # # specialize for INT8 1X1 conv on X86 - # return conv2d_avx_1x1._declaration_conv_nhwc_pack(cfg, data, kernel, strides, - # padding, dilation, out_dtype) - if layout == 'NHWC': - return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("not support this layout {} yet".format(layout)) - - -def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - out_dtype = data.dtype if out_dtype is None else out_dtype - assert layout == 'NCHW', "only support NCHW convolution for AVX" - - assert isinstance(dilation, int) or len(dilation) == 2 - if isinstance(dilation, int): - dilation_h, dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - HSTR, WSTR = strides - batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape) - num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) - - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_height, kernel_width)) - pad_h = pad_top + pad_down - pad_w = pad_left + pad_right - - pad_height = in_height + pad_h - pad_width = in_width + pad_w - - dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 - out_height = (in_height + pad_h - dilated_kernel_h) // HSTR + 1 - out_width = (in_width + pad_w - dilated_kernel_w) // WSTR + 1 - - # pack data - DOPAD = (pad_h != 0 or pad_w != 0) - if DOPAD: - data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), \ - name="data_pad") - else: - data_pad = data - - # fetch schedule - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - - shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width) - data_vec = tvm.compute(shape, - lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w], - name='data_vec') - - # pack kernel - shape = (num_filter//oc_bn, in_channel//ic_bn, - kernel_height, kernel_width, ic_bn, oc_bn) - kernel_vec = tvm.compute(shape, - lambda CO, CI, h, w, ci, co: - kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w], - name='kernel_vec') - - # convolution - oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn) - unpack_shape = (batch_size, num_filter, out_height, out_width) - - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - idxmod = tvm.indexmod +@conv2d_infer_layout.register("cpu") +def _conv2d_infer_layout(workload, cfg): + _, data, kernel, strides, padding, dilation, layout, _, dtype = workload + batch_size, in_channel, in_height, in_width = data[1] + out_channel, _, k_height, k_width = kernel[1] idxdiv = tvm.indexdiv - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_vec[n, idxdiv(ic, ic_bn), oh*HSTR+kh*dilation_h, - idxmod(ic, ic_bn), - ow*WSTR+kw*dilation_w].astype(out_dtype) * - kernel_vec[oc_chunk, idxdiv(ic, ic_bn), kh, kw, - idxmod(ic, ic_bn), - oc_block].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') - - unpack = tvm.compute(unpack_shape, - lambda n, c, h, w: conv[n, idxdiv(c, 
oc_bn), h, w, idxmod(c, oc_bn)] - .astype(out_dtype), - name='output_unpack', - tag='conv2d_nchw') - return unpack - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct']) -def schedule_conv2d(cfg, outs): - """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) - - if 'conv2d_nchw' in op.tag: - output = op.output(0) - conv_out = op.input_tensors[0] - kernel_vec = conv_out.op.input_tensors[1] - kernel = kernel_vec.op.input_tensors[0] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: - s[kernel].compute_inline() - data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] - data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - - _, _, kh, kw = get_const_tuple(kernel.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - if is_kernel_1x1: - conv2d_avx_1x1._schedule_conv(*args) - else: - conv2d_avx_common._schedule_conv(*args) - - scheduled_ops.append(op) - - traverse(outs[0].op) - return s + pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) + out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 + out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 + tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) + in_layout = "NCHW%dc" % tile_ic + out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc) + out_layout = "NCHW%dc" % tile_oc + return ((in_shape, in_layout),), ((out_shape, out_layout),) -@generic.schedule_conv2d_nhwc.register("cpu") def schedule_conv2d_nhwc(outs): - """Create schedule for tensors""" + """Create schedule for conv2d_nhwc""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) output_op = outs[0].op - scheduled_ops = [] - - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - else: # inject custom schedule - if len(op.axis) == 4: # schedule bias + bn + relu - n, h, w, c = op.axis - fused = s[op].fuse(n, h, w) - s[op].parallel(fused) - s[op].vectorize(c) - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) + def _callback(op): if 'conv2d_nhwc' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] @@ -291,146 +97,133 @@ def traverse(op): s[data_pad].parallel(pad_fused) C = conv n, h, w, c = C.op.axis - ry, rx, rc = C.op.reduce_axis - n_out, h_out, w_out, c_out = output_op.axis s[C].vectorize(c) - if op != output_op: # fuse bias + bn + relu into conv - s[C].compute_at(s[output_op], c_out) - else: - fused = s[C].fuse(n, h, w) - s[C].parallel(fused) - - scheduled_ops.append(op) - traverse(output_op) + O = output_op.output(0) + if len(O.op.axis) 
== 4: # schedule bias + bn + relu + n, h, w, c = O.op.axis + fused = s[O].fuse(n, h, w) + s[O].parallel(fused) + channels = int(O.shape[-1]) + if channels % 64 == 0: + c, ci = s[O].split(c, 64) + s[O].vectorize(ci) + if C != O: + s[C].compute_at(s[O], c) + + traverse_inline(s, output_op, _callback) return s +def conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype): + layout = "NCHW" + packed_out = conv2d_NCHWc(data, kernel, strides, padding, dilation, + layout, layout, out_dtype) + return unpack_NCHWc_to_nchw(packed_out, out_dtype) -# Define template function for autotvm task -# We define schedule template in this function instead of -# declaration function since actual input arguments need -# to be altered by the schedule selected. -@autotvm.task.register("topi_x86_conv2d_NCHWc") -def _topi_nn_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - - if len(args) == 7: - data, kernel, strides, padding, dilation, origin_layout, dtype = args - else: - assert len(args) == 8 - data, kernel, strides, padding, dilation, origin_layout, out_layout, dtype = args - - raw_data_shape = get_const_tuple(data.shape) - raw_kernel_shape = get_const_tuple(kernel.shape) - - # get config here - cfg = get_config() - _create_tuning_space(cfg, data, kernel, strides, padding, dilation, origin_layout) - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - # change shape with the value in config - ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1]) - new_data_shape = (raw_data_shape[0], idxdiv(raw_data_shape[1], ic_bn), - raw_data_shape[2], raw_data_shape[3], ic_bn) - data_layout = "NCHW%dc" % ic_bn - out_layout = "NCHW%dc" % oc_bn - new_kernel_shape = (idxdiv(raw_kernel_shape[0], oc_bn), - idxdiv(raw_kernel_shape[1], ic_bn), - raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn) - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) +def schedule_conv2d_nchw(outs): + """Create schedule for tensors""" + return schedule_conv2d_NCHWc(outs) - C = _declaration_conv_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation, - data_layout, out_layout, dtype) - s = _schedule_conv2d_NCHWc(cfg, [C]) - return s, [new_data, new_kernel, C] +def _pack_data(cfg, data, kernel): + n, _, ih, iw = get_const_tuple(data.shape) + oc, ic, kh, kw = get_const_tuple(kernel.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn -@conv2d_infer_layout.register("cpu") -def _conv2d_infer_layout(workload, cfg): - _, data, kernel, strides, padding, dilation, layout, dtype = workload - batch_size, in_channel, in_height, in_width = data[:-1] - out_channel, _, k_height, k_width = kernel[:-1] - idxdiv = tvm.indexdiv + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 - out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 - tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) - in_layout = "NCHW%dc" % tile_ic - out_shape = (batch_size, idxdiv(out_channel, tile_oc), out_height, out_width, tile_oc) - out_layout = "NCHW%dc" % tile_oc - return ((in_shape, in_layout),), 
((out_shape, out_layout),) + kernel = tvm.compute( + (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), + lambda occ, icc, k_h, k_w, icb, ocb: + kernel[occ * oc_bn + ocb, icc * ic_bn + icb, k_h, k_w], + name="kernel_vec") + return data, kernel -@autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct') -def _declaration_conv_NCHWc(cfg, data, kernel, strides, - padding, dilation, layout, out_layout, out_dtype): +@autotvm.register_topi_compute("conv2d_NCHWc.x86") +def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layout, out_dtype): + """Compute conv2d with NCHWc layout.""" # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload - n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) - in_channel = ic_chunk * ic_bn - oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = \ + if len(data.shape) == 5: + n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = \ get_const_tuple(kernel.shape) - num_filter = oc_chunk * oc_bn + in_channel = ic_chunk * ic_bn + num_filter = oc_chunk * oc_bn + else: + n, in_channel, ih, iw = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape) + + # Define autotvm tuning space + is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 + pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) + sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) + oh = (ih - kernel_height + pt + pb) // sh + 1 + ow = (iw - kernel_width + pl + pr) // sw + 1 + + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", num_filter, num_outputs=2) + cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + if is_kernel_1x1: + cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) + else: + cfg.define_knob("unroll_kw", [True, False]) - # If no config was set, we can fallback to NCHW config. + # If no config was set, we can fallback to default config. if cfg.is_fallback: _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), dtype=kernel.dtype), strides, padding, out_dtype) - return nn.conv2d_NCHWc_compute(data, - kernel, - strides, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'cpu', ['direct']) -def _schedule_conv2d_NCHWc(cfg, outs): + # Pack data if raw 4-D data is provided. + # This can only happen when autotuning. 
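
The _pack_data helper above expresses the NCHW -> NCHW[x]c data blocking and the OIHW -> OIHW[x]i[x]o kernel blocking as tvm computes. As a rough sanity model (not part of the patch; the sizes and tile widths below are made up, since ic_bn/oc_bn really come from the tuned config), the same transforms in numpy are just a reshape plus a transpose:

    import numpy as np

    n, ic, ih, iw = 1, 32, 14, 14
    oc, kh, kw = 64, 3, 3
    ic_bn, oc_bn = 16, 16  # hypothetical tile sizes

    data = np.random.rand(n, ic, ih, iw).astype("float32")
    kernel = np.random.rand(oc, ic, kh, kw).astype("float32")

    # data_vec[bs, c, h, w, vc] == data[bs, c*ic_bn + vc, h, w]
    data_vec = data.reshape(n, ic // ic_bn, ic_bn, ih, iw).transpose(0, 1, 3, 4, 2)

    # kernel_vec[occ, icc, h, w, icb, ocb] == kernel[occ*oc_bn + ocb, icc*ic_bn + icb, h, w]
    kernel_vec = kernel.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn, kh, kw) \
                       .transpose(0, 2, 4, 5, 3, 1)

    assert data_vec.shape == (n, ic // ic_bn, ih, iw, ic_bn)
    assert kernel_vec.shape == (oc // oc_bn, ic // ic_bn, kh, kw, ic_bn, oc_bn)
    assert data_vec[0, 1, 2, 3, 4] == data[0, 1 * ic_bn + 4, 2, 3]

Keeping the blocked channel axis innermost is what lets the schedules vectorize over oc_block later on.
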
+ if len(data.shape) == 4: + data, kernel = _pack_data(cfg, data, kernel) + + return nn.conv2d_NCHWc(data, + kernel, + strides, + padding, + dilation, + layout, + out_layout, + out_dtype) + +@autotvm.register_topi_schedule("conv2d_NCHWc.x86") +def schedule_conv2d_NCHWc(cfg, outs): """Create schedule for tensors""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def traverse(op): - """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) + def _callback(op): if 'conv2d_NCHWc' in op.tag: conv_out = op.output(0) - kernel = conv_out.op.input_tensors[1] + kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] \ - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ - else data_vec - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - args = [s, cfg, data_vec, conv_out, outs[0]] - target = tvm.target.Target.current(allow_none=False) - _, _, kh, kw, _, _, = get_const_tuple(kernel.shape) + args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]] + _, _, kh, kw, _, _, = get_const_tuple(kernel_vec.shape) if kh == 1 and kw == 1: conv2d_avx_1x1._schedule_conv_NCHWc(*args) else: conv2d_avx_common._schedule_conv_NCHWc(*args) - scheduled_ops.append(op) - - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s + + +# FIXME - https://github.com/apache/incubator-tvm/issues/4122 +# _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO +# layout. Commenting until we have clarity about the nhwc_pack implementation from the author. 
+# elif layout == 'NHWC' and kh == 1 and kw == 1 and kernel.dtype == "int8": +# if cfg.is_fallback: +# _get_default_config(cfg, data, kernel, strides, padding, out_dtype, False, layout) +# # specialize for INT8 1X1 conv on X86 +# return conv2d_avx_1x1._declaration_conv_nhwc_pack(cfg, data, kernel, strides, +# padding, dilation, out_dtype) diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 8b0c13c2c0bb..377d81539b7c 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -23,117 +23,103 @@ from tvm import relay from tvm import autotvm from .conv2d import _get_default_config -from .conv2d_int8 import _is_int8_hw_support, _get_default_config_int8 -from ..util import get_const_tuple, get_shape -from ..nn import conv2d_legalize -from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_NCHWc_int8, conv2d_alter_layout -from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw +from .conv2d_int8 import is_int8_hw_support, _get_default_config_int8 +from ..util import get_const_tuple +from ..nn import conv2d_legalize, conv2d_alter_layout from ..nn.util import get_pad_tuple logger = logging.getLogger('topi') @conv2d_alter_layout.register("cpu") -def _alter_conv2d_layout(attrs, inputs, tinfo, F): +def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest): + cfg = dispatch_ctx.query(target, None) + workload = cfg.workload + else: + _, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target) + workload = autotvm.task.get_workload(outs) + if workload is None: + # The best implementation is not an AutoTVM template, + # we then assume it's not necessary to alter this op. + return None + cfg = dispatch_ctx.query(target, workload) + + topi_tmpl = workload[0] + new_attrs = {k : attrs[k] for k in attrs.keys()} + # Parse the attributes. - groups = attrs.get_int("groups") padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") - out_dtype = attrs["out_dtype"] - layout_name = 'data_layout' - data_layout = attrs[layout_name] - kh, kw = attrs.get_int_tuple("kernel_size") - - data_tensor, kernel_tensor = tinfo[0], tinfo[1] - if attrs[layout_name] == 'NHWC' and attrs['kernel_layout'] == 'HWIO': - batch_size, height, width, in_channel = get_const_tuple(data_tensor.shape) - kh, kw, _, out_channel = get_const_tuple(kernel_tensor.shape) - elif attrs[layout_name] == 'NCHW' and attrs['kernel_layout'] == 'OIHW': - batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) - out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) - else: - return None - + data_layout = attrs["data_layout"] + kernel_layout = attrs["kernel_layout"] + data_tensor, kernel_tensor = tinfos data_dtype = data_tensor.dtype kernel_dtype = kernel_tensor.dtype - out_dtype = data_dtype if out_dtype in ("same", "") else out_dtype - - # Check if depthwise. - kshape = get_shape(kernel_tensor.shape, attrs["kernel_layout"], "OIHW") - is_depthwise = groups == kshape[0] and kshape[1] == 1 - - # Save the input exprs. - copy_inputs = list(inputs) - - # Set the new attrs - new_attrs = {k : attrs[k] for k in attrs.keys()} - new_attrs['channels'] = out_channel - - # Return if the groups is not 1 and depthwise. 
- if groups != 1 and not is_depthwise: - return None - - # Set workload. Config update. - dispatch_ctx = autotvm.task.DispatchContext.current - target = tvm.target.Target.current() - - if is_depthwise: - workload = autotvm.task.args_to_workload( - [data_tensor, kernel_tensor, strides, padding, dilation, out_dtype], - depthwise_conv2d_nchw) - else: - workload = autotvm.task.args_to_workload( - [data_tensor, kernel_tensor, strides, padding, dilation, data_layout, out_dtype], - conv2d) - - cfg = dispatch_ctx.query(target, workload) - if cfg.is_fallback: - if _is_int8_hw_support(data_dtype, kernel_dtype): - _get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, - is_depthwise, data_layout) - else: - _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, - is_depthwise, data_layout) + out_dtype = out_type.dtype + + if topi_tmpl == "conv2d_NCHWc.x86": + # we only convert conv2d_NCHW to conv2d_NCHWc for x86 + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False, data_layout) + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - # Get the tiling parameters to set the layout names. - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - new_attrs[layout_name] = 'NCHW%dc' % ic_bn - new_attrs['out_layout'] = 'NCHW%dc' % oc_bn - new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data_dtype) + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) + new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn - if is_depthwise and data_layout == 'NCHW' and attrs['kernel_layout'] == 'OIHW': - new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn # Store altered operator's config - new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], - new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc) + [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], + new_attrs["out_layout"], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs) - return F.nn.contrib_depthwise_conv2d_nchwc(*copy_inputs, **new_attrs) + if topi_tmpl == "conv2d_NCHWc_int8.x86": + # TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, False, data_layout) - if _is_int8_hw_support(data_dtype, kernel_dtype): - # Convert kernel data layout from 4D to 7D + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], 
cfg["tile_oc"].size[-1] n_elems = 4 - data_expr, kernel_expr = inputs - if attrs['kernel_layout'] == 'HWIO': - kernel_IHWO = F.transpose(kernel_expr, axes=(2, 0, 1, 3)) - elif attrs['kernel_layout'] == 'OIHW': - kernel_IHWO = F.transpose(kernel_expr, axes=(1, 2, 3, 0)) - else: - return None - - kernel_IHWOo = F.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn)) - kernel_OHWoI = F.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0)) - kernel_OHWoIi = F.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn, - in_channel//ic_bn, ic_bn)) - kernel_OHWoIie = F.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn, - in_channel//ic_bn, ic_bn//n_elems, n_elems)) - kernel_OIHWioe = F.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6)) - copy_inputs = [data_expr, kernel_OIHWioe] - # Store altered operator's config. New kernel layout OIHWio4 + # convert kernel data layout from 4D to 7D + data_expr, kernel_expr = inputs + kernel_IHWO = relay.transpose(kernel_expr, axes=(1, 2, 3, 0)) + kernel_IHWOo = relay.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn)) + kernel_OHWoI = relay.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0)) + kernel_OHWoIi = relay.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn, + in_channel//ic_bn, ic_bn)) + kernel_OHWoIie = relay.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn, + in_channel//ic_bn, ic_bn//n_elems, n_elems)) + kernel_OIHWioe = relay.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6)) + + # update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn + + # Store altered operator's config. + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) new_kernel = tvm.placeholder((out_channel // oc_bn, in_channel // ic_bn, kh, @@ -141,30 +127,41 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): ic_bn // n_elems, oc_bn, n_elems), dtype=kernel_dtype) - - new_workload = autotvm.task.args_to_workload([new_data, - new_kernel, - strides, - padding, - dilation, - new_attrs[layout_name], - new_attrs['out_layout'], - out_dtype], - conv2d_NCHWc_int8) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], + new_attrs['out_layout'], out_dtype], topi_tmpl) dispatch_ctx.update(target, new_workload, cfg) - return F.nn.contrib_conv2d_nchwc_int8(*copy_inputs, **new_attrs) - # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc) - new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn) - # Store altered operator's config - new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, - kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) - new_workload = autotvm.task.args_to_workload( - [new_data, new_kernel, strides, padding, dilation, new_attrs[layout_name], - new_attrs['out_layout'], out_dtype], conv2d_NCHWc) - dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_nchwc(data_expr, kernel_OIHWioe, **new_attrs) + + if topi_tmpl == "depthwise_conv2d_NCHWc.x86": + assert data_layout == "NCHW" and kernel_layout == "OIHW" + if cfg.is_fallback: + _get_default_config(cfg, data_tensor, kernel_tensor, strides, padding, + out_dtype, True, data_layout) + + batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) + out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + assert channel_multiplier == 1 + + # 
update new attrs + new_attrs['channels'] = out_channel + new_attrs['data_layout'] = 'NCHW%dc' % ic_bn + new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn + new_attrs['out_layout'] = 'NCHW%dc' % oc_bn - return F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) + # Store altered operator's config. + new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) + new_workload = autotvm.task.args_to_workload( + [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], + new_attrs['out_layout'], out_dtype], topi_tmpl) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) + + return None @conv2d_legalize.register("cpu") @@ -254,7 +251,7 @@ def _conv2d_legalize(attrs, inputs, arg_types): # input channel to be a multiple of 4 and output channels to be a multiple of 16. For input # channels, we pad both the inputs and weights input channels. For output channels, we pad the # weight and stride_slice the output. - if _is_int8_hw_support(data_dtype, kernel_dtype): + if is_int8_hw_support(data_dtype, kernel_dtype): # Flags to remember if the expr is modified ic_modified = False oc_modified = False diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 9726f3d8d4f9..083fff48d774 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -18,10 +18,11 @@ """1x1 Conv2D schedule on for Intel CPU""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..nn.pad import pad -from ..nn.util import infer_pad, get_pad_tuple +from ..nn.util import get_pad_tuple from ..generic import conv2d as conv2d_generic from ..util import get_const_tuple, simplify from .tensor_intrin import dot_16x1x16_uint8_int8_int32 @@ -58,84 +59,41 @@ def _fallback_schedule(cfg, wkl): raise ValueError("cannot decide default schedule for workload: {}".format(wkl)) -def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): - # fetch schedule - ic_bn, oc_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_oh"].val, cfg["tile_ow"].size[-1]) - - # no stride and padding info here - padding = infer_pad(data, data_pad) - HPAD, WPAD = padding - DOPAD = (HPAD != 0 or WPAD != 0) - - A, W = data, kernel_vec - A0, A1 = data_pad, data_vec - # schedule data - if DOPAD: - s[A0].compute_inline() - batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis - parallel_axis = s[A1].fuse(batch, ic_chunk, ih) - s[A1].parallel(parallel_axis) - - # schedule kernel pack - oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis - s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) - if oc_bn > 1: - s[W].vectorize(oc_block) - parallel_axis = s[W].fuse(oc_chunk, oh) - s[W].parallel(parallel_axis) - - C, O0, O = conv_out, output, last - CC = s.cache_write(C, 'global') - - batch, oc_chunk, oh, ow, oc_block = s[C].op.axis - oh_outer, oh_inner = s[C].split(oh, factor=oh_factor) - s[C].vectorize(oc_block) - - s[CC].compute_at(s[C], oh_outer) - _, oc_chunk, oh, ow, oc_block = s[CC].op.axis - ic, _, _ = s[CC].op.reduce_axis - - ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - - oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor) - ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor) - - 
s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block) - s[CC].vectorize(oc_block) - - s[CC].unroll(ow_inner) - s[CC].unroll(oh_inner) - - if O0 != O: - s[O0].compute_inline() - batch, oc, oh, ow = s[O].op.axis - - oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) - oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) - ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) - s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) - - parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - - s[O].parallel(parallel_axis) - - return s - - -def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): +def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): # fetch schedule oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1] - _, _, _, _, ic_bn = get_const_tuple(data.shape) - - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) + + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. 
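
The branch below handles the testing-only conv2d_nchw path, where the layout transforms are not pre-computed: it fuses the outer axes of each transform for parallelism and vectorizes the innermost blocked axis. A minimal standalone sketch of that recurring fuse/parallel/vectorize pattern (toy shapes, same 0.6-era tvm API as this patch, purely illustrative):

    import tvm

    n, c_chunk, h, w, c_block = 1, 4, 8, 8, 16
    A = tvm.placeholder((n, c_chunk, h, w, c_block), name="A")
    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name="B")
    s = tvm.create_schedule(B.op)

    batch, chunk, hh, ww, block = s[B].op.axis
    fused = s[B].fuse(batch, chunk, hh)  # coarse-grained parallelism over fused outer axes
    s[B].parallel(fused)
    s[B].vectorize(block)                # SIMD over the innermost blocked axis
    print(tvm.lower(s, [A, B], simple_mode=True))
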
+ batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) C, O = conv_out, last CC = s.cache_write(C, 'global') @@ -167,22 +125,36 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): s[CC].unroll(oh_inner) if C != O: - batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) - ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) - s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) - - parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - s[O].parallel(parallel_axis) + out_ndim = len(s[O].op.axis) + if out_ndim == 5: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + + parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + elif out_ndim == 4: + batch, oc, oh, ow = s[O].op.axis + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + oh_outer, oh_inner = s[O].split(oh, factor=oh_factor) + ow_outer, ow_inner = s[O].split(ow, factor=ow_factor) + s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block) + parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + else: + raise ValueError("Unsupported output ndim: %s" % out_ndim) return s -def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last): - return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data, conv_out, last, - int32_lanes=16, +def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last): + return conv2d_generic.schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data_vec, kernel_vec, + conv_out, last, int32_lanes=16, intrin=dot_16x1x16_uint8_int8_int32()) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 7c5096dc2c1a..085d0aeb67c3 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -18,9 +18,9 @@ """Conv2D schedule on for Intel CPU""" from __future__ import absolute_import as _abs import tvm +from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from ..nn.util import infer_pad from ..generic import conv2d as conv2d_generic from ..util import get_const_tuple from .tensor_intrin import dot_16x1x16_uint8_int8_int32 @@ -83,88 +83,42 @@ def _fallback_schedule_int8(cfg, wkl): cfg["unroll_kw"] = OtherOptionEntity(False) -def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last): - # fetch schedule - ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1], cfg["unroll_kw"].val) - - # no stride and padding info here - padding = infer_pad(data, data_pad) - HPAD, WPAD = padding - DOPAD = (HPAD != 0 or WPAD != 0) - - A, W = data, kernel_vec - A0, A1 = data_pad, 
data_vec - - # schedule data - if DOPAD: - s[A0].compute_inline() - batch, ic_chunk, ih, ic_block, iw = s[A1].op.axis - parallel_axis = s[A1].fuse(batch, ic_chunk, ih) - s[A1].parallel(parallel_axis) - - # schedule kernel pack - oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis - s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) - if oc_bn > 1: - s[W].vectorize(oc_block) - parallel_axis = s[W].fuse(oc_chunk, oh) - s[W].parallel(parallel_axis) - - # schedule conv - C, O0, O = conv_out, output, last - CC = s.cache_write(C, 'global') - - _, oc_chunk, oh, ow, oc_block = s[C].op.axis - ow_chunk, ow_block = s[C].split(ow, factor=reg_n) - s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - s[C].fuse(oc_chunk, oh) - s[C].vectorize(oc_block) - - s[CC].compute_at(s[C], ow_chunk) - _, oc_chunk, oh, ow, oc_block = s[CC].op.axis - ic, kh, kw = s[CC].op.reduce_axis - - ow_chunk, ow_block = s[CC].split(ow, factor=reg_n) - ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn) - - if unroll_kw: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block) - s[CC].unroll(kw) - else: - s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block) - - s[CC].fuse(oc_chunk, oh) - s[CC].vectorize(oc_block) - s[CC].unroll(ow_block) - - if O0 != O: - s[O0].compute_inline() - - batch, oc, oh, ow = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=reg_n) - oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) - s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - parallel_axis = s[O].fuse(batch, oc_chunk, oh) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - - s[O].parallel(parallel_axis) - - return s - - -def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): +def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): # fetch schedule reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val - _, _, _, _, ic_bn = get_const_tuple(data.shape) + _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) + + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") + elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + kernel_vec.name == 'kernel_vec': + # data and kernel are not pre-computed, schedule layout transform here. + # this should only be used by x86 conv2d_nchw, which is for + # testing purpose. 
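
Same idea as in conv2d_avx_1x1.py: knobs such as unroll_kw are defined once in the compute template and read back here through cfg[...].val. A toy AutoTVM template sketching that define/consume round trip (the add_one op and its tile factor are hypothetical, illustration only):

    import tvm
    from tvm import autotvm

    @autotvm.template
    def add_one(n):
        A = tvm.placeholder((n,), name="A")
        B = tvm.compute((n,), lambda i: A[i] + 1, name="B")
        s = tvm.create_schedule(B.op)

        cfg = autotvm.get_config()
        cfg.define_knob("unroll", [True, False])  # searched by the tuner
        xo, xi = s[B].split(B.op.axis[0], factor=8)
        if cfg["unroll"].val:                     # consumed at schedule time
            s[B].unroll(xi)
        return s, [A, B]

    task = autotvm.task.create(add_one, args=(1024,), target="llvm")
    print(task.config_space)
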
+ batch, ic_chunk, ih, ic_block, iw = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + + oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[kernel_vec].op.axis + s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block) + oc_bn = cfg["tile_oc"].size[-1] + if oc_bn > 1: + s[kernel_vec].vectorize(oc_block) + parallel_axis = s[kernel_vec].fuse(oc_chunk, oh) + s[kernel_vec].parallel(parallel_axis) - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - parallel_axis = s[A].fuse(batch, ic_chunk, ih) - s[A].parallel(parallel_axis) # schedule 5-D NCHW[x]c conv C, O = conv_out, last @@ -195,18 +149,31 @@ def _schedule_conv_NCHWc(s, cfg, data, conv_out, last): s[CC].unroll(ow_block) if C != O: - batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=reg_n) - s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - parallel_axis = s[O].fuse(batch, oc_chunk, oh) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - s[O].parallel(parallel_axis) + out_ndim = len(s[O].op.axis) + if out_ndim == 5: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(batch, oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + elif out_ndim == 4: + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=reg_n) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(batch, oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + else: + raise ValueError("Unsupported output ndim: %s" % out_ndim) return s -def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last): - return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(s, cfg, data, conv_out, last, - int32_lanes=16, +def _schedule_conv_NCHWc_int8(s, cfg, data_vec, kernel_vec, conv_out, last): + return conv2d_generic.schedule_conv_NCHWc_cpu_common_int8(s, cfg, data_vec, kernel_vec, + conv_out, last, int32_lanes=16, intrin=dot_16x1x16_uint8_int8_int32()) diff --git a/topi/python/topi/x86/conv2d_int8.py b/topi/python/topi/x86/conv2d_int8.py index 20712d2f6f4f..64fe92bbaaa4 100644 --- a/topi/python/topi/x86/conv2d_int8.py +++ b/topi/python/topi/x86/conv2d_int8.py @@ -14,21 +14,19 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=invalid-name,unused-variable,unused-argument,no-member, import-outside-toplevel +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter,import-outside-toplevel """Conv2D int8 schedule on x86""" -import re import tvm from tvm import autotvm -from tvm.autotvm.task import get_config -from tvm.autotvm.task.topi_integration import deserialize_args from ..nn.conv2d import _get_workload as _get_conv2d_workload -from .. import generic, tag +from .. 
import tag from ..generic import conv2d as conv2d_generic from ..nn.util import get_pad_tuple -from ..util import get_const_tuple -from ..nn.conv2d import conv2d_NCHWc_int8 +from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload +from ..util import get_const_tuple, traverse_inline from .. import nn from . import conv2d_avx_1x1, conv2d_avx_common @@ -53,7 +51,7 @@ def _get_default_config_int8(cfg, data, kernel, strides, padding, out_dtype, is_ cfg, wkl, int32_lanes=16, num_int8_elements=4) -def _is_int8_hw_support(data_dtype, kernel_dtype): +def is_int8_hw_support(data_dtype, kernel_dtype): """ Checks to ensure that we can use Intel DLBoost instructions 1) The datatypes are correct. @@ -76,150 +74,123 @@ def _is_int8_hw_support(data_dtype, kernel_dtype): return is_dtype_support and is_llvm_support and is_target_support -def _create_tuning_space_int8(cfg, data, kernel, strides, padding, dilation, layout): - """Create schedule configuration from input arguments""" - dshape = get_const_tuple(data.shape) - kshape = get_const_tuple(kernel.shape) - pat = re.compile(r'NCHW.+(\d+)c') - if layout == 'NCHW': - n, ic, h, w = dshape - oc, _, kh, kw = kshape - elif layout == 'NHWC': - n, h, w, ic = dshape - kh, kw, oc, _ = kshape - elif pat.match(layout) is not None: - n, ic_chunk, h, w, ic_bn = dshape - target = tvm.target.Target.current(allow_none=False) - oc_chunk, k_ic, kh, kw, k_ic_f, oc_bn, k_ic_s = kshape - ic = ic_chunk * ic_bn - assert ic == k_ic * k_ic_f * k_ic_s - oc = oc_chunk*oc_bn +def conv2d_nchw_int8(data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d with NCHW layout and int8 dtype""" + layout = "NCHW" + packed_out = conv2d_NCHWc_int8(data, kernel, strides, padding, dilation, + layout, layout, out_dtype) + return unpack_NCHWc_to_nchw(packed_out, out_dtype) + + +def schedule_conv2d_nchw_int8(outs): + """Create the schedule for conv2d_nchw_int8""" + return schedule_conv2d_NCHWc_int8(outs) + + +def _pack_data(cfg, data, kernel): + n_elems = 4 + n, _, ih, iw = get_const_tuple(data.shape) + oc, ic, kh, kw = get_const_tuple(kernel.shape) + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn + + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") + + kernel = tvm.compute( + (oc_chunk, ic_chunk, kh, kw, ic_bn//n_elems, oc_bn, n_elems), + lambda occ, icc, k_h, k_w, icbc, ocb, icbb: + kernel[occ * oc_bn + ocb, + icc * ic_bn + icbc * ic_bn//n_elems + icbb, k_h, k_w], + name="kernel_vec") + + return data, kernel + + +@autotvm.register_topi_compute("conv2d_NCHWc_int8.x86") +def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, + dilation, layout, out_layout, out_dtype): + """Compute conv2d with NCHWc layout and int8 dtype""" + if len(data.shape) == 5: + n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) + in_channel = ic_chunk * ic_bn + oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ \ + = get_const_tuple(kernel.shape) + num_filter = oc_chunk * oc_bn else: - raise ValueError("Not support this layout {} with " - "schedule template.".format(layout)) + n, in_channel, ih, iw = get_const_tuple(data.shape) + num_filter, _, kernel_height, kernel_width = \ + get_const_tuple(kernel.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - pt, pl, pb, pr = get_pad_tuple(padding, kernel) + # Define autotvm tuning space + is_kernel_1x1 = kernel_height == 1 and 
kernel_width == 1 + pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (h - kh + pt + pb) // sh + 1 - ow = (w - kw + pl + pr) // sw + 1 + oh = (ih - kernel_height + pt + pb) // sh + 1 + ow = (iw - kernel_width + pl + pr) // sw + 1 - # Create schedule config - cfg.define_split('tile_ic', ic, num_outputs=2, filter=lambda y: y.size[-1] % 4 == 0) - cfg.define_split('tile_oc', oc, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0) + cfg.define_split('tile_ic', in_channel, num_outputs=2, + filter=lambda y: y.size[-1] % 4 == 0) + cfg.define_split('tile_oc', num_filter, num_outputs=2, + filter=lambda y: y.size[-1] % 16 == 0) cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64) if is_kernel_1x1: cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1]) else: cfg.define_knob("unroll_kw", [True, False]) - -# Define template function for autotvm task -# We define schedule template in this function instead of -# declaration function since actual input arguments need -# to be altered by the schedule selected. -@autotvm.task.register("topi_x86_conv2d_NCHWc_int8") -def _topi_nn_conv2d_NCHWc_int8(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - - if len(args) == 7: - data, kernel, strides, padding, dilation, origin_layout, dtype = args - else: - assert len(args) == 8 - data, kernel, strides, padding, dilation, origin_layout, out_layout, dtype = args - - raw_data_shape = get_const_tuple(data.shape) - raw_kernel_shape = get_const_tuple(kernel.shape) - - # get config here - cfg = get_config() - _create_tuning_space_int8(cfg, data, kernel, strides, padding, dilation, origin_layout) - - # change shape with the value in config - ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1], - cfg["tile_ow"].size[-1]) - - data_layout = "NCHW%dc" % ic_bn - out_layout = "NCHW%dc" % oc_bn - - # Set up the new shape for data and kernel - new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn, - raw_data_shape[2], raw_data_shape[3], ic_bn) - n_elems = 4 - new_kernel_shape = (raw_kernel_shape[0] // oc_bn, - raw_kernel_shape[1] // ic_bn, - raw_kernel_shape[2], - raw_kernel_shape[3], - ic_bn // n_elems, - oc_bn, - n_elems) - - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - C = _declaration_conv_NCHWc_int8(cfg, new_data, new_kernel, strides, padding, dilation, - data_layout, out_layout, dtype) - s = _schedule_conv2d_NCHWc_int8(cfg, [C]) - return s, [new_data, new_kernel, C] - - -@autotvm.register_topi_compute(conv2d_NCHWc_int8, 'cpu', 'direct') -def _declaration_conv_NCHWc_int8(cfg, data, kernel, strides, - padding, dilation, layout, out_layout, out_dtype): - return nn.conv2d_NCHWc_int8_compute(data, - kernel, - strides, - padding, - dilation, - layout, - out_layout, - out_dtype) - - -@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc_int8, 'cpu', ['direct']) -def _schedule_conv2d_NCHWc_int8(cfg, outs): + # If no config was set, we can fallback to default config. + if cfg.is_fallback: + _get_default_config_int8( + cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), + dtype=kernel.dtype), + strides, padding, out_dtype) + + # Pack data if raw 4-D data is provided. + # This can only happen when autotuning. 
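
As in the fp32 path, packing only happens when raw 4-D tensors reach this compute during tuning. A rough numpy model of the 7-D kernel layout that _pack_data builds above, with groups of n_elems = 4 int8 values innermost so the dot_16x1x16_uint8_int8_int32 intrinsic can consume them (hypothetical sizes; note that for the ic_bn = 16 used here, ic_bn//n_elems equals n_elems):

    import numpy as np

    oc, ic, kh, kw = 32, 32, 3, 3
    ic_bn, oc_bn, n_elems = 16, 16, 4  # hypothetical tile sizes

    kernel = np.random.randint(-8, 8, (oc, ic, kh, kw)).astype("int8")
    kernel_vec = np.empty((oc // oc_bn, ic // ic_bn, kh, kw,
                           ic_bn // n_elems, oc_bn, n_elems), "int8")
    for occ in range(oc // oc_bn):
        for icc in range(ic // ic_bn):
            for icbc in range(ic_bn // n_elems):
                for ocb in range(oc_bn):
                    for icbb in range(n_elems):
                        # mirrors the lambda in _pack_data above
                        kernel_vec[occ, icc, :, :, icbc, ocb, icbb] = \
                            kernel[occ * oc_bn + ocb,
                                   icc * ic_bn + icbc * (ic_bn // n_elems) + icbb, :, :]
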
+ if len(data.shape) == 4: + data, kernel = _pack_data(cfg, data, kernel) + + return nn.conv2d_NCHWc_int8(data, + kernel, + strides, + padding, + dilation, + layout, + out_layout, + out_dtype) + + +@autotvm.register_topi_schedule("conv2d_NCHWc_int8.x86") +def schedule_conv2d_NCHWc_int8(cfg, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - def traverse(op): + def _callback(op): """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) - if 'conv2d_NCHWc_int8' in op.tag: conv_out = op.output(0) - kernel = conv_out.op.input_tensors[1] + kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] - data = data_vec.op.input_tensors[0] \ - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ - else data_vec - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: - data_pad = data - data = data_pad.op.input_tensors[0] - args = [s, cfg, data_vec, conv_out, outs[0]] - target = tvm.target.Target.current(allow_none=False) + args = [s, cfg, data_vec, kernel_vec, conv_out, outs[0]] # int8 conv kernel is 7-dim - _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape) + _, _, kh, kw, _, _, _ = get_const_tuple(kernel_vec.shape) if kh == 1 and kw == 1: conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args) else: conv2d_avx_common._schedule_conv_NCHWc_int8(*args) - scheduled_ops.append(op) - - traverse(outs[0].op) + traverse_inline(s, outs[0].op, _callback) return s -@autotvm.register_topi_schedule(generic.schedule_conv2d_nhwc_pack, 'cpu', ['direct']) -def schedule_conv2d_nhwc_pack(cfg, outs): + +@autotvm.register_topi_schedule("conv2d_nhwc_pack_int8.x86") +def schedule_conv2d_nhwc_pack_int8(cfg, outs): """Create schedule for tensors""" s = tvm.create_schedule([x.op for x in outs]) output_op = outs[0].op diff --git a/topi/python/topi/x86/conv2d_transpose.py b/topi/python/topi/x86/conv2d_transpose.py index 27fc0afce999..71f47d6c037b 100644 --- a/topi/python/topi/x86/conv2d_transpose.py +++ b/topi/python/topi/x86/conv2d_transpose.py @@ -17,59 +17,34 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D Transpose schedule on x86""" import tvm -from tvm import autotvm -from .. import generic -from ..util import get_const_tuple, traverse_inline -from ..nn import conv2d_transpose_nchw_preprocess, conv2d_transpose_nchw -from . import conv2d_avx_1x1, conv2d_avx_common -from .conv2d import _declaration_conv_impl, \ - _create_tuning_space as _create_tuning_space_conv2d, \ - _get_default_config as _get_default_config_conv2d +from ..util import traverse_inline +from .. 
import nn +from .conv2d import conv2d_nchw, schedule_conv2d_nchw - -@autotvm.register_topi_compute(conv2d_transpose_nchw, 'cpu', ['direct']) -def _conv2d_transpose_nchw(cfg, data, kernel, strides, padding, out_dtype): +def conv2d_transpose_nchw(data, kernel, strides, padding, out_dtype): data_pad, kernel_transform = \ - conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype) - # reuse conv2d implementation - _create_tuning_space_conv2d(cfg, data_pad, kernel_transform, strides=(1, 1), \ - padding=(0, 0), dilation=(1, 1), layout="NCHW") - if cfg.is_fallback: - _get_default_config_conv2d(cfg, data_pad, kernel_transform, strides=(1, 1), \ - padding=(0, 0), out_dtype=out_dtype, layout='NCHW') - return _declaration_conv_impl(cfg, data_pad, kernel_transform, strides=(1, 1), \ - padding=(0, 0), dilation=(1, 1), layout="NCHW", \ - out_dtype=out_dtype) - + nn.conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype) + # reuse conv2d_nchw implementation + return conv2d_nchw(data_pad, kernel_transform, strides=(1, 1), + padding=(0, 0), dilation=(1, 1), out_dtype=out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv2d_transpose_nchw, 'cpu', ['direct']) -def _schedule_conv2d_transpose_nchw(cfg, outs): +def schedule_conv2d_transpose_nchw(outs): """Create schedule for tensors""" outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - + s = schedule_conv2d_nchw(outs) def _callback(op): - # reuse conv2d schedule - if 'conv2d_nchw' in op.tag: - output = op.output(0) + if 'unpack_nchwc' in op.tag: conv_out = op.input_tensors[0] # retrieve data data_vec = conv_out.op.input_tensors[0] data_pad = data_vec.op.input_tensors[0] data_dilate = data_pad.op.input_tensors[0] s[data_dilate].compute_inline() + s[data_pad].compute_inline() # retrieve kernel kernel_vec = conv_out.op.input_tensors[1] kernel_transform = kernel_vec.op.input_tensors[0] s[kernel_transform].compute_inline() - # call conv2d schedule - _, _, kh, kw = get_const_tuple(kernel_transform.shape) - is_kernel_1x1 = kh == 1 and kw == 1 - args = [s, cfg, data_dilate, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]] - if is_kernel_1x1: - conv2d_avx_1x1._schedule_conv(*args) - else: - conv2d_avx_common._schedule_conv(*args) traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/x86/conv3d.py b/topi/python/topi/x86/conv3d.py index 4a6664eba0e4..1e156509c0a8 100644 --- a/topi/python/topi/x86/conv3d.py +++ b/topi/python/topi/x86/conv3d.py @@ -21,9 +21,7 @@ import tvm from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import generic from ..util import traverse_inline -from ..nn.conv3d import conv3d, conv3d_ncdhw from ..nn.util import get_pad_tuple3d, infer_pad3d from ..nn.pad import pad from ..util import get_const_tuple, simplify, get_const_int @@ -35,9 +33,8 @@ 'hkernel', 'wkernel', 'dpad', 'hpad', 'wpad', 'dstride', 'hstride', 'wstride']) -@autotvm.register_topi_compute(conv3d, 'cpu', ['direct']) -def _declaration_conv3d(cfg, data, kernel, strides, padding, dilation, - layout, out_dtype): +@autotvm.register_topi_compute("conv3d_ndhwc.x86") +def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): """3D convolution forward operator. 
Parameters @@ -59,30 +56,24 @@ def _declaration_conv3d(cfg, data, kernel, strides, padding, dilation, dilation: int or a list/tuple of three ints dilation size, or [dilation_depth, dilation_height, dilation_width] - layout : str - layout of data - Returns ------- output : tvm.Tensor 5-D with shape [batch, out_depth, out_height, out_width, out_channel] for NDHWC layout 5-D with shape [batch, out_channel, out_depth, out_height, out_width] for NCDHW layout """ + layout = "NDHWC" out_dtype = data.dtype if out_dtype is None else out_dtype strides = strides if isinstance(strides, (tuple, list)) else (strides, strides, strides) dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation, dilation) - if layout == 'NDHWC': - _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) - if cfg.is_fallback: - _get_default_config(cfg, data, kernel, strides, padding, out_dtype, layout) - return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - elif layout == 'NCDHW': - return conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) - raise ValueError("Layout {} is not supported".format(layout)) + _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout) + if cfg.is_fallback: + _get_default_config(cfg, data, kernel, strides, padding, out_dtype, layout) + return _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule(generic.schedule_conv3d_ndhwc, 'cpu', ['direct']) +@autotvm.register_topi_schedule("conv3d_ndhwc.x86") def schedule_conv3d_ndhwc(cfg, outs): """TOPI schedule callback for conv3d Parameters @@ -120,7 +111,7 @@ def _traverse(op): return s -def _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): +def _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): out_dtype = data.dtype if out_dtype is None else out_dtype assert isinstance(dilation, int) or len(dilation) == 3 diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index c6c3d5e667ac..ea89cf4779b0 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -23,147 +23,9 @@ from tvm.contrib import cblas from .util import get_fp32_len -from .. import generic, tag, nn +from .. import generic, tag from ..util import traverse_inline, get_const_tuple -@autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None, out_dtype=None): - target = tvm.target.Target.current() - if "cblas" in target.libs: - C = cblas.matmul(data, weight, False, True) - if bias is not None: - C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - return C - - M, _ = get_const_tuple(data.shape) - # Always use dense_nopack for dynamic input. - # This is a temporary for CV models. - # TODO(kevinthesun): use kernel dispatcher instead. 
- if isinstance(M, tvm.expr.Var): - return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype) - - # For small batch sizes, don't pack weight into cache-friendly layout - # because of overhead in packing and limited reuse from batch dimension - # TODO(icemelon9): use a more systematic way to determine which schedule to use - if M <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias, out_dtype) - return _declaration_dense_pack(cfg, data, weight, bias, out_dtype) - - -# Declare dense compute with packing weight into cache-friendly layout -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None, out_dtype=None): - if out_dtype is None: - out_dtype = data.dtype - M, K = get_const_tuple(data.shape) # batch, in_dim - N, _ = get_const_tuple(weight.shape) # out_dim - # create tuning space - cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=3) - cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=3) - cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) - if cfg.is_fallback: - _default_dense_pack_config(cfg, M, N, K) - - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (N // packw_bn, K, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - k = tvm.reduce_axis((0, K), name="k") - C = tvm.compute((M, N), - lambda y, x: tvm.sum( - data[y, k].astype(out_dtype) * - packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), - axis=k), - tag="dense_pack") - if bias is not None: - C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) - return C - - -# Declare dense compute without packing weight -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None, out_dtype=None): - if out_dtype is None: - out_dtype = data.dtype - M, K = get_const_tuple(data.shape) - N, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=2) - cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=2) - cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) - if cfg.is_fallback: - _default_dense_nopack_config(cfg, M, N, K) - - vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, K // vec), "k") - CC = tvm.compute((M, N, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x].astype(out_dtype) * - weight[y, k * vec + x].astype(out_dtype), axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((M, N), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") - if bias is not None: - C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) - - return C - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") -def _schedule_dense(cfg, outs): - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - elif 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", 
"direct_pack") -def _schedule_dense_pack(cfg, outs): - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") -def _schedule_dense_nopack(cfg, outs): - target = tvm.target.Target.current() - if "cblas" in target.libs: - return generic.schedule_extern(outs) - - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - def _schedule_dense_pack_template(cfg, s, C): A, packedB = s[C].op.input_tensors @@ -270,3 +132,106 @@ def _default_dense_nopack_config(cfg, M, N, K): cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) cfg["tile_x"] = SplitEntity([N, 1]) cfg["tile_y"] = SplitEntity([1, M]) + +@autotvm.register_topi_compute("dense_nopack.x86") +def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): + """Compute dense without packing""" + if out_dtype is None: + out_dtype = data.dtype + M, K = get_const_tuple(data.shape) + N, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=2) + cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=2) + cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) + if cfg.is_fallback: + _default_dense_nopack_config(cfg, M, N, K) + + vec = cfg["tile_k"].size[-1] + k = tvm.reduce_axis((0, K // vec), "k") + CC = tvm.compute((M, N, vec), + lambda z, y, x: tvm.sum( + data[z, k * vec + x].astype(out_dtype) * + weight[y, k * vec + x].astype(out_dtype), axis=k)) + + kk = tvm.reduce_axis((0, vec), "kk") + C = tvm.compute((M, N), + lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") + if bias is not None: + C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + return C + + +@autotvm.register_topi_schedule("dense_nopack.x86") +def schedule_dense_nopack(cfg, outs): + """Create the schedule for dense_nopack""" + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + +@autotvm.register_topi_compute("dense_pack.x86") +def dense_pack(cfg, data, weight, bias=None, out_dtype=None): + """Compute dense with packing""" + if out_dtype is None: + out_dtype = data.dtype + M, K = get_const_tuple(data.shape) # batch, in_dim + N, _ = get_const_tuple(weight.shape) # out_dim + # create tuning space + cfg.define_split("tile_y", M, num_outputs=3) + cfg.define_split("tile_x", N, num_outputs=3) + cfg.define_split("tile_k", K, num_outputs=2) + if cfg.is_fallback: + _default_dense_pack_config(cfg, M, N, K) + + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (N // packw_bn, K, packw_bn) + packw = tvm.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + idxdiv = tvm.indexdiv + idxmod = tvm.indexmod + k = tvm.reduce_axis((0, K), name="k") + C = tvm.compute((M, N), + lambda y, x: tvm.sum( + data[y, k].astype(out_dtype) * + packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), + 
axis=k), + tag="dense_pack") + if bias is not None: + C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + return C + +@autotvm.register_topi_schedule("dense_pack.x86") +def schedule_dense_pack(cfg, outs): + """Create the schedule for dense_pack""" + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + +@autotvm.register_topi_compute("dense_cblas.x86") +def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): + """Compute dense using cblas library""" + M, K = get_const_tuple(data.shape) + N, _ = get_const_tuple(weight.shape) + cfg.add_flop(M * K * N * 2) + C = cblas.matmul(data, weight, False, True) + if bias is not None: + C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) + return C + +@autotvm.register_topi_schedule("dense_cblas.x86") +def schedule_dense_cblas(_, outs): + """Create schedule for dense_cblas""" + return generic.schedule_extern(outs) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 385537b95e4d..2aa5e748e5c7 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -15,20 +15,17 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-member +# pylint: disable=no-value-for-parameter """Depthwise Conv2D schedule on x86""" import tvm from tvm import autotvm -from tvm.autotvm.task import get_config from tvm.autotvm.task.space import SplitEntity -from tvm.autotvm.task.topi_integration import deserialize_args -from .. 
import generic, tag -from ..generic import schedule_depthwise_conv2d_nchw from ..nn.pad import pad from ..util import get_const_tuple from ..nn.util import get_pad_tuple -from ..nn.depthwise_conv2d import depthwise_conv2d_nchw, depthwise_conv2d_NCHWc, \ - _get_workload, depthwise_conv2d_infer_layout - +from ..nn.depthwise_conv2d import _get_workload, depthwise_conv2d_infer_layout +from ..nn.conv2d import unpack_NCHWc_to_nchw +from ..util import traverse_inline from .util import get_fp32_len def _fallback_schedule(cfg, wkl): @@ -70,20 +67,57 @@ def _fallback_schedule(cfg, wkl): cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn]) cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n]) +def depthwise_conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype): + """Compute depthwise conv2d with NCHW layout.""" + layout = "NCHW" + packed_out = depthwise_conv2d_NCHWc(data, kernel, strides, padding, dilation, + layout, layout, out_dtype) + return unpack_NCHWc_to_nchw(packed_out, out_dtype) + +def schedule_depthwise_conv2d_nchw(outs): + """Create schedule for depthwise_conv2d_nchw.""" + return schedule_depthwise_conv2d_NCHWc(outs) + +def _pack_data(cfg, data, kernel): + n, ic, ih, iw = get_const_tuple(data.shape) + filters, cm, kh, kw = get_const_tuple(kernel.shape) + oc = filters * cm + ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] + + ic_chunk = ic // ic_bn + oc_chunk = oc // oc_bn + + data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") -autotvm.register_topi_compute(depthwise_conv2d_nchw, 'cpu', 'direct', - depthwise_conv2d_nchw.fdefault) -autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'cpu', 'direct', - schedule_depthwise_conv2d_nchw.fdefault) + kernel = tvm.compute( + (oc_chunk, 1, kh, kw, 1, oc_bn), + lambda occ, icc, k_h, k_w, icb, ocb: + kernel[(occ * oc_bn + ocb) // cm, + (occ * oc_bn + ocb) % cm, k_h, k_w], + name="kernel_vec") + return data, kernel -@autotvm.register_topi_compute(depthwise_conv2d_NCHWc, 'cpu', 'direct') -def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, - layout, out_layout, out_dtype=None): +@autotvm.register_topi_compute("depthwise_conv2d_NCHWc.x86") +def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, + layout, out_layout, out_dtype=None): + """Compute depthwise conv2d with NCHWc layout""" out_dtype = data.dtype if out_dtype is None else out_dtype - batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape) - out_channel_chunk, _, filter_height, filter_width, __, out_channel_block \ - = get_const_tuple(kernel.shape) + + if len(data.shape) == 5: + batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape) + out_channel_chunk, cm_chunk, filter_height, filter_width, cm_block, out_channel_block \ + = get_const_tuple(kernel.shape) + in_channel = in_channel_chunk * in_channel_block + out_channel = out_channel_chunk * out_channel_block + channel_multiplier = cm_chunk * cm_block + assert channel_multiplier * in_channel == out_channel + else: + batch, in_channel, in_height, in_width = get_const_tuple(data.shape) + out_channel, channel_multiplier, filter_height, filter_width = get_const_tuple(kernel.shape) + assert channel_multiplier == 1 strides = strides if isinstance(strides, (tuple, list)) else (strides, strides) HSTR, WSTR = strides @@ -92,21 +126,30 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, 
dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) assert (dh, dw) == (1, 1), "Does not support dilation" - in_channel = in_channel_chunk * in_channel_block - out_channel = out_channel_chunk * out_channel_block - channel_multiplier = out_channel // in_channel - out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1 out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1 + cfg.define_split("tile_ic", in_channel, num_outputs=2) + cfg.define_split("tile_oc", out_channel, num_outputs=2) + cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + # get workload and related schedule config - wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), - tvm.placeholder((out_channel, in_channel, filter_height, filter_width), - dtype=kernel.dtype), - strides, padding, out_dtype) + wkl = _get_workload( + tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), + tvm.placeholder((out_channel, channel_multiplier, filter_height, filter_width), + dtype=kernel.dtype), + strides, padding, out_dtype) if cfg.is_fallback: _fallback_schedule(cfg, wkl) + # Pack data if raw 4-D data is provided. + # This can only happen when autotuning. + if len(data.shape) == 4: + data, kernel = _pack_data(cfg, data, kernel) + _, _, _, _, in_channel_block = get_const_tuple(data.shape) + out_channel_chunk, _, _, _, _, out_channel_block \ + = get_const_tuple(kernel.shape) + # padding stage DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0) if DOPAD: @@ -136,38 +179,39 @@ def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation, name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc") return Output - -@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_NCHWc, 'cpu', ['direct']) +@autotvm.register_topi_schedule("depthwise_conv2d_NCHWc.x86") def schedule_depthwise_conv2d_NCHWc(cfg, outs): """CPU schedule for depthwise conv2d in NCHW[x]c layout""" + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - def traverse(op): + + def _callback(op): """Traverse operators from computation graph""" - # inline all one-to-one-mapping operators except the last stage (output) - if tag.is_broadcast(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: - traverse(tensor.op) if 'depthwise_conv2d_NCHWc' in op.tag: conv_out = op.output(0) data = conv_out.op.input_tensors[0] kernel = conv_out.op.input_tensors[1] _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, outs[0]) - scheduled_ops.append(op) - traverse(outs[0].op) + + traverse_inline(s, outs[0].op, _callback) return s -def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output): - tile_ow = cfg["tile_ow"].size[-1] - # schedule data - A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): - batch, ic_chunk, ih, iw, ic_block = s[A].op.axis - p = s[A].fuse(ic_chunk, ih) - s[A].parallel(p) +def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out, output): + tile_ow, oc_bn = cfg["tile_ow"].size[-1], cfg["tile_oc"].size[-1] + # schedule pad + if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + and "pad" in data_vec.op.tag: + batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis + parallel_axis = s[data_vec].fuse(batch, 
ic_chunk, ih) + s[data_vec].parallel(parallel_axis) + data_vec = data_vec.op.input_tensors[0] + + if autotvm.GLOBAL_SCOPE.in_tuning: + # only in autotuning, input data of conv2d_NCHWc will be 4-D. + # skip this part during tuning to make records accurate. + # this part will be folded during the Relay fold_constant pass. + s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") + s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") C, O = conv_out, output CC = s.cache_write(C, 'global') @@ -187,55 +231,34 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output s[CC].unroll(ow_block) if C != O: - batch, oc_chunk, oh, ow, oc_block = s[O].op.axis - ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) - s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) - parallel_axis = s[O].fuse(oc_chunk, oh) - s[C].compute_at(s[O], parallel_axis) - s[O].vectorize(oc_block) - s[O].parallel(parallel_axis) - return s - - -@autotvm.task.register("topi_x86_depthwise_conv2d_NCHWc_from_nchw") -def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs): - assert not kwargs, "Do not support kwargs in template function call" - data, kernel, strides, padding, dilation, dtype = deserialize_args(args) - - batch, in_channel, height, width = get_const_tuple(data.shape) - filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape) - pt, pl, pb, pr = get_pad_tuple(padding, kernel) - sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - out_height = (height - kh + pt + pb) // sh + 1 - out_width = (width - kw + pl + pr) // sw + 1 - out_channel = filter_channel * channel_multiplier - - # get config here - cfg = get_config() - cfg.define_split("tile_ic", in_channel, num_outputs=2) - cfg.define_split("tile_oc", out_channel, num_outputs=2) - cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64) + out_ndim = len(s[O].op.axis) + if out_ndim == 5: + batch, oc_chunk, oh, ow, oc_block = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + elif out_ndim == 4: + batch, oc, oh, ow = s[O].op.axis + ow_chunk, ow_block = s[O].split(ow, factor=tile_ow) + oc_chunk, oc_block = s[O].split(oc, factor=oc_bn) + s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block) + parallel_axis = s[O].fuse(oc_chunk, oh) + s[C].compute_at(s[O], parallel_axis) + s[O].vectorize(oc_block) + s[O].parallel(parallel_axis) + else: + raise ValueError("Unsupported output ndim: %s" % out_ndim) - # change shape with the value in config - ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] - new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn) - new_kernel_shape = (out_channel // oc_bn, 1, kh, kw, 1, oc_bn) - new_data = tvm.placeholder(new_data_shape, data.dtype) - new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype) - - data_layout = "NCHW%dc" % ic_bn - out_layout = "NCHW%dc" % oc_bn - - C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation, - data_layout, out_layout, dtype) - s = schedule_depthwise_conv2d_NCHWc(cfg, [C]) - return s, [new_data, new_kernel, C] + return s @depthwise_conv2d_infer_layout.register("cpu") def _depthwise_conv2d_infer_layout(workload, cfg): _, data, kernel, strides, padding, dilation, dtype = workload - batch_size, in_channel, in_height, in_width =
data[:-1] - filter_channel, channel_multiplier, k_height, k_width = kernel[:-1] + batch_size, in_channel, in_height, in_width = data[1] + filter_channel, channel_multiplier, k_height, k_width = kernel[1] out_channel = filter_channel * channel_multiplier out_height = (in_height + 2 * padding[0] - k_height) // strides[0] + 1 out_width = (in_width + 2 * padding[1] - k_width) // strides[1] + 1 diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py index d6bb7622d640..375827bb271c 100644 --- a/topi/python/topi/x86/injective.py +++ b/topi/python/topi/x86/injective.py @@ -18,10 +18,8 @@ """x86 declaration and schedules.""" from __future__ import absolute_import as _abs import tvm -from .. import generic from ..util import is_empty_shape -@generic.schedule_injective_from_existing.register(["cpu"]) def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -53,7 +51,6 @@ def schedule_injective_from_existing(sch, out): sch[out].vectorize(li) return sch -@generic.schedule_injective.register(["cpu"]) def schedule_injective(outs): """X86 schedule for injective op. @@ -77,7 +74,6 @@ def schedule_injective(outs): schedule_injective_from_existing(s, x) return s -@generic.schedule_concatenate.register(["cpu"]) def schedule_concatenate(outs): """X86 schedule for concatenate op. diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index 45cb17e5c7b3..3d57b6bbf203 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -18,9 +18,7 @@ """x86 nn operators""" from __future__ import absolute_import as _abs import tvm -from .. import generic -@generic.schedule_softmax.register(["cpu"]) def schedule_softmax(outs): """Schedule for softmax diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index ed7d525028e4..a8251dd13ae4 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" import tvm -from .. import generic from .. import tag def _parallel_sch(sch, oshape, do_vectorize=False): @@ -59,7 +58,6 @@ def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64): sch.parallel(fused) -@generic.schedule_pool.register(["cpu"]) def schedule_pool(outs, layout): """Schedule for pool @@ -117,7 +115,6 @@ def traverse(OP): return s -@generic.schedule_adaptive_pool.register(["cpu"]) def schedule_adaptive_pool(outs): """Schedule for adaptive pool diff --git a/topi/python/topi/x86/reduction.py b/topi/python/topi/x86/reduction.py index f704d4961f15..b9dd4d4f1b3c 100644 --- a/topi/python/topi/x86/reduction.py +++ b/topi/python/topi/x86/reduction.py @@ -18,8 +18,8 @@ """x86 declaration and schedules.""" from __future__ import absolute_import as _abs import tvm +from .injective import schedule_injective_from_existing from .. import tag -from .. import generic from ..util import get_const_tuple def _schedule_reduce(sch, op, is_idx_reduce=False): @@ -58,7 +58,6 @@ def _schedule_reduce(sch, op, is_idx_reduce=False): sch[out].parallel(fused) -@generic.schedule_reduce.register(["cpu"]) def schedule_reduce(outs): """X86 schedule for reduction op. 
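A note on the pattern running through these backend files: the `@generic.schedule_*.register(["cpu"])` decorators are removed, schedules become plain functions that callers import directly from `topi.x86`, and AutoTVM templates are registered under explicit string keys such as "conv2d_NCHWc_int8.x86" or "dense_nopack.x86". Test code then picks one implementation per target from a small table via `topi.testing.dispatch`, as the `_*_implement` dicts in the test diffs further down show. Below is a minimal sketch of that table-driven selection, assuming a hypothetical `dispatch_by_target` helper that mirrors what `topi.testing.dispatch` does (walk the target's keys, fall back to the "generic" entry); it is an illustration, not part of the patch.

import tvm
import topi

# (compute, schedule) pairs per target key; same shape as the
# _dense_implement / _conv2d_nhwc_implement tables added in the tests below.
_example_dense_implement = {
    "generic": (topi.nn.dense, topi.generic.schedule_dense),
    "cpu": (topi.x86.dense_nopack, topi.x86.schedule_dense_nopack),
}

def dispatch_by_target(device, table):
    # Hypothetical stand-in for topi.testing.dispatch: check each of the
    # target's keys (e.g. ["cpu"] for an llvm target) against the table
    # and fall back to the "generic" entry when none matches.
    target = tvm.target.create(device)
    for key in target.keys:
        if key in table:
            return table[key]
    return table["generic"]

# Usage inside a test body:
#   fcompute, fschedule = dispatch_by_target("llvm", _example_dense_implement)
#   with tvm.target.create("llvm"):
#       D = fcompute(A, B, None)
#       s = fschedule([D])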
@@ -95,7 +94,7 @@ def traverse_after_reduce(operator): """Internal traverse function""" if tag.is_broadcast(operator.tag): if operator not in scheduled_ops: - generic.schedule_injective_from_existing(sch, operator) + schedule_injective_from_existing(sch, operator) for tensor in operator.input_tensors: traverse_after_reduce(tensor.op) elif operator.tag == 'comm_reduce': diff --git a/topi/python/topi/x86/roi_align.py b/topi/python/topi/x86/roi_align.py index 26b84be9585b..203c3dd1802b 100644 --- a/topi/python/topi/x86/roi_align.py +++ b/topi/python/topi/x86/roi_align.py @@ -20,7 +20,6 @@ import tvm from tvm import hybrid -from ..vision.rcnn import roi_align_nchw from ..tensor import full from ..util import get_const_tuple @@ -185,8 +184,7 @@ def roi_align_nchw_ir(data, rois, w_pc, pos_pc, pooled_size, spatial_scale, samp return output -@roi_align_nchw.register("cpu") -def roi_align_nchw_cpu(data, rois, pooled_size, spatial_scale, sample_ratio=-1): +def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): """ROI align operator in NCHW layout. Parameters diff --git a/topi/python/topi/x86/sparse.py b/topi/python/topi/x86/sparse.py index c9e0e3864a5a..898d0e5ea2c6 100644 --- a/topi/python/topi/x86/sparse.py +++ b/topi/python/topi/x86/sparse.py @@ -18,13 +18,12 @@ """sparse_dense schedule on x86""" import tvm -from .. import generic from ..util import traverse_inline, get_const_int from .util import get_fp32_len -@generic.schedule_sparse_dense.register(["cpu"]) -def _schedule_sparse_dense(outs): +def schedule_sparse_dense(outs): + """Create schedule for sparse dense""" s = tvm.create_schedule([x.op for x in outs]) def _callback(op): diff --git a/topi/src/topi.cc b/topi/src/topi.cc index a7b916093d98..79e223c30975 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -677,7 +677,7 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_softmax") TVM_REGISTER_GLOBAL("topi.rocm.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::rocm::schedule_lrn(args[0], args[1]); + *rv = topi::rocm::schedule_lrn(args[0]); }); /* CUDA schedules */ @@ -723,7 +723,7 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax") TVM_REGISTER_GLOBAL("topi.cuda.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::cuda::schedule_lrn(args[0], args[1]); + *rv = topi::cuda::schedule_lrn(args[0]); }); /* Utility functions */ diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index 4e0a45be0a22..e03708c67f26 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -16,9 +16,10 @@ # under the License. 
"""Common utility for topi test""" +import tvm from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity - +import topi def get_all_backend(): """return all supported target @@ -31,14 +32,12 @@ def get_all_backend(): return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu'] - class Int8Fallback(autotvm.FallbackContext): def _query_inside(self, target, workload): key = (target, workload) if key in self.memory: return self.memory[key] cfg = FallbackConfigEntity() - cfg.template_key = 'int8' self.memory[key] = cfg cfg.is_fallback = False return cfg diff --git a/topi/tests/python/test_fifo_buffer.py b/topi/tests/python/test_fifo_buffer.py index 022272f6c4da..34c389aad6c9 100644 --- a/topi/tests/python/test_fifo_buffer.py +++ b/topi/tests/python/test_fifo_buffer.py @@ -18,10 +18,12 @@ import tvm import topi +import topi.testing import numpy as np -from common import get_all_backend from tvm.contrib.pickle_memoize import memoize +from common import get_all_backend + def verify_fifo_buffer(buffer_shape, data_shape, axis, dtype='float32'): buffer = tvm.placeholder(buffer_shape, name='buffer', dtype=dtype) data = tvm.placeholder(data_shape, name='data', dtype=dtype) @@ -52,7 +54,7 @@ def check_device(device): with tvm.target.create(device): out = topi.nn.fifo_buffer(data, buffer, axis=axis) - s = topi.generic.schedule_injective([out]) + s = topi.testing.get_injective_schedule(device)([out]) buffer_tvm = tvm.nd.array(buffer_np, ctx=ctx) data_tvm = tvm.nd.array(data_np, ctx=ctx) @@ -126,29 +128,29 @@ def check_device(device): return print(' Running on target: {}'.format(device)) + conv2d_nchw, schedule_conv2d_nchw = topi.testing.get_conv2d_nchw_implement(device) + with tvm.target.create(device): out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis) - s = topi.generic.schedule_injective([out]) + s = topi.testing.get_injective_schedule(device)([out]) update_context = tvm.build(s, [inc_input, context, out], device, name='update_context') - out = topi.nn.conv2d(context, kernel, strides=stride, padding=padding, dilation=dilate, - layout='NCHW', out_dtype=dtype) - s = topi.generic.schedule_conv2d_nchw([out]) + out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype) + s = schedule_conv2d_nchw([out]) conv2d_inc = tvm.build(s, [context, kernel, out], device, name='conv2d_inc') out = topi.nn.fifo_buffer(inc_output, output_window, axis=buffer_axis) - s = topi.generic.schedule_injective([out]) + s = topi.testing.get_injective_schedule(device)([out]) update_output_window = tvm.build(s, [inc_output, output_window, out], device, name='update_output_window') out = topi.nn.fifo_buffer(inc_input, input_window, axis=buffer_axis) - s = topi.generic.schedule_injective([out]) + s = topi.testing.get_injective_schedule(device)([out]) update_input_window = tvm.build(s, [inc_input, input_window, out], device, name='update_input_window') - out = topi.nn.conv2d(input_window, kernel, strides=stride, padding=padding, - dilation=dilate, layout='NCHW', out_dtype=dtype) - s = topi.generic.schedule_conv2d_nchw([out]) + out = conv2d_nchw(input_window, kernel, stride, padding, dilate, dtype) + s = schedule_conv2d_nchw([out]) conv2d = tvm.build(s, [input_window, kernel, out], device, name='conv2d') input_window_tvm = tvm.nd.array(input_window_np, ctx=ctx) diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py index d1f50c86464b..1b38e9037fb9 100644 --- 
a/topi/tests/python/test_topi_batch_matmul.py +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -24,6 +24,12 @@ from common import get_all_backend +_batch_matmul_implement = { + "generic": (topi.nn.batch_matmul, topi.generic.schedule_batch_matmul), + "cpu": (topi.x86.batch_matmul, topi.x86.schedule_batch_matmul), + "gpu": (topi.nn.batch_matmul, topi.cuda.schedule_batch_matmul), +} + def verify_batch_matmul(batch, M, N, K): x = tvm.placeholder((batch, M, K), name='x') y = tvm.placeholder((batch, N, K), name='y') @@ -46,8 +52,9 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.nn.batch_matmul(x, y) - s = topi.generic.schedule_batch_matmul([out]) + fcompute, fschedule = topi.testing.dispatch(device, _batch_matmul_implement) + out = fcompute(x, y) + s = fschedule([out]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index eeaeed15df1c..274743d274ae 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -35,9 +35,9 @@ def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, with tvm.target.create('llvm'): A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_dtype, name='A') W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_dtype, name='W') - B = topi.nn.bitserial_conv2d_nchw(A, W, stride, padding, activation_bits, weight_bits, - out_dtype=out_dtype, unipolar=unipolar) - s = topi.generic.schedule_bitserial_conv2d_nchw([B]) + B = topi.x86.bitserial_conv2d_nchw(A, W, stride, padding, activation_bits, weight_bits, + input_dtype, out_dtype, unipolar) + s = topi.x86.schedule_bitserial_conv2d_nchw([B]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @@ -73,9 +73,9 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, with tvm.target.create('llvm'): A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_dtype, name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_dtype, name='W') - B = topi.nn.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, - out_dtype=out_dtype, unipolar=unipolar) - s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + B = topi.x86.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, + input_dtype, out_dtype, unipolar) + s = topi.x86.schedule_bitserial_conv2d_nhwc([B]) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index 1b2f40de1b21..1f87785b4f48 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -39,9 +39,9 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, with tvm.target.create(device): A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') - B = topi.nn.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, - pack_dtype='uint8', out_dtype='int16', unipolar=unipolar) - s = topi.generic.schedule_bitserial_conv2d_nhwc([B]) + B = 
topi.arm_cpu.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, + 'uint8', out_dtype, unipolar) + s = topi.arm_cpu.schedule_bitserial_conv2d_nhwc([B]) func = tvm.build(s, [A, W, B], device) diff --git a/topi/tests/python/test_topi_bitserial_dense.py b/topi/tests/python/test_topi_bitserial_dense.py index f1bd02357796..505ce794312f 100644 --- a/topi/tests/python/test_topi_bitserial_dense.py +++ b/topi/tests/python/test_topi_bitserial_dense.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. """Test code for bitserial_dense operator""" +import os import numpy as np import tvm import topi @@ -22,27 +23,21 @@ from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize +_bitserial_dense_implement = { + "generic": (topi.nn.bitserial_dense, topi.generic.schedule_bitserial_dense), + "cpu": (topi.x86.bitserial_dense, topi.x86.schedule_bitserial_dense), + "arm_cpu": (topi.arm_cpu.bitserial_dense, topi.arm_cpu.schedule_bitserial_dense), +} + def generate_quantized_np(shape, bits, out_dtype): min_val = 0 max_val = 1 << bits return np.random.randint(min_val, max_val, size=shape).astype(out_dtype) def verify_bitserial_dense(batch, in_dim, out_dim, activation_bits, weight_bits, unipolar): - input_dtype = 'uint32' out_dtype = 'int16' - with tvm.target.create('llvm'): - A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') - B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') - C = topi.nn.bitserial_dense(A, B, activation_bits, weight_bits, out_dtype=out_dtype, - unipolar=unipolar) - s = topi.generic.schedule_bitserial_dense([C]) - - a_shape = get_const_tuple(A.shape) - b_shape = get_const_tuple(B.shape) - - @memoize("topi.tests.test_topi_bitseral_dense") - def get_ref_data(): + def get_ref_data(a_shape, b_shape, input_dtype): a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype) b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype) if unipolar: @@ -53,15 +48,30 @@ def get_ref_data(): else: c_np = np.dot(a_np, b_np.T) return a_np, b_np, c_np - a_np, b_np, c_np = get_ref_data() - ctx = tvm.cpu(0) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) - func = tvm.build(s, [A, B, C], "llvm") - func(a, b, c) - tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + for target in ["llvm", "llvm -device=arm_cpu"]: + if "arm_cpu" in target and 'arm' not in os.uname()[4]: + print("Skipped running code, not an ARM device") + continue + input_dtype = 'uint8' if "arm_cpu" in target else "uint32" + A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') + B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') + fcompute, fschedule = topi.testing.dispatch(target, _bitserial_dense_implement) + C = fcompute(A, B, activation_bits, weight_bits, + input_dtype, out_dtype, unipolar) + s = fschedule([C]) + + a_shape = get_const_tuple(A.shape) + b_shape = get_const_tuple(B.shape) + a_np, b_np, c_np = get_ref_data(a_shape, b_shape, input_dtype) + + ctx = tvm.cpu(0) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) + func = tvm.build(s, [A, B, C], target) + func(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) def test_bitserial_dense(): verify_bitserial_dense(1, 1024, 1000, 1, 1, True) diff --git a/topi/tests/python/test_topi_bnn.py
b/topi/tests/python/test_topi_bnn.py index 13da6af9eb02..ce6a28643b58 100644 --- a/topi/tests/python/test_topi_bnn.py +++ b/topi/tests/python/test_topi_bnn.py @@ -33,9 +33,9 @@ def verify_binary_dense(batch, in_dim, out_dim): bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1) # schedule with tvm.target.create('llvm'): - s1 = topi.generic.schedule_binarize_pack(bnn_A) - s2 = topi.generic.schedule_binarize_pack(bnn_B) - s3 = topi.generic.schedule_binary_dense(bnn_C) + s1 = topi.x86.schedule_binarize_pack(bnn_A) + s2 = topi.x86.schedule_binarize_pack(bnn_B) + s3 = topi.x86.schedule_binary_dense(bnn_C) dtype = A.dtype @memoize("topi.tests.test_topi_binary_dense") diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index 5a0a940d3d7b..2bea9b09bbf4 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -15,10 +15,11 @@ # specific language governing permissions and limitations # under the License. """Test code for broadcasting operators.""" -from common import get_all_backend import numpy as np import tvm import topi +import topi.testing +from common import get_all_backend def verify_broadcast_to_ele(in_shape, out_shape, fbcast): @@ -33,7 +34,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="broadcast_to") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.broadcast_to(data_npy, out_shape) @@ -81,7 +82,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__) lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx) @@ -252,7 +253,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=name) data_npy = indata.astype(A.dtype) @@ -293,7 +294,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=name) data_npy = np.random.uniform(size=shape).astype(A.dtype) @@ -335,7 +336,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = topi.testing.get_broadcast_schedule(device)(C) foo = tvm.build(s, [A, B, C], device, name=name) lhs_nd = tvm.nd.array(lhs, ctx) diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index 585374f33a64..74034ce30b0e 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -18,6 +18,7 @@ import numpy as np import tvm import topi +import topi.testing from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize @@ -43,7 +44,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = 
tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_conv1d.py b/topi/tests/python/test_topi_conv1d.py index d54742c01d14..6e55a574de4a 100644 --- a/topi/tests/python/test_topi_conv1d.py +++ b/topi/tests/python/test_topi_conv1d.py @@ -25,6 +25,18 @@ from common import get_all_backend +_conv1d_ncw_implement = { + "generic": (topi.nn.conv1d_ncw, topi.generic.schedule_conv1d_ncw), + "cpu": (topi.nn.conv1d_ncw, topi.x86.schedule_conv1d_ncw), + "gpu": (topi.cuda.conv1d_ncw, topi.cuda.schedule_conv1d_ncw) +} + +_conv1d_nwc_implement = { + "generic": (topi.nn.conv1d_nwc, topi.generic.schedule_conv1d_nwc), + "cpu": (topi.nn.conv1d_nwc, topi.x86.schedule_conv1d_nwc), + "gpu": (topi.cuda.conv1d_nwc, topi.cuda.schedule_conv1d_nwc) +} + def verify_conv1d(batch, in_channels, in_width, @@ -66,12 +78,13 @@ def check_device(device): if not ctx.exist: print("Skip because %s is not enabled" % device) return + if layout == "NCW": + fcompute, fschedule = topi.testing.dispatch(device, _conv1d_ncw_implement) + else: + fcompute, fschedule = topi.testing.dispatch(device, _conv1d_nwc_implement) with tvm.target.create(device): - B = topi.nn.conv1d(A, W, stride, padding, dilation, layout, 'float32') - if layout == 'NCW': - s = topi.generic.schedule_conv1d_ncw([B]) - else: - s = topi.generic.schedule_conv1d_nwc([B]) + B = fcompute(A, W, stride, padding, dilation, 'float32') + s = fschedule([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv1d_transpose_ncw.py b/topi/tests/python/test_topi_conv1d_transpose_ncw.py index 9d6e9db254b5..64af254adc7d 100644 --- a/topi/tests/python/test_topi_conv1d_transpose_ncw.py +++ b/topi/tests/python/test_topi_conv1d_transpose_ncw.py @@ -24,6 +24,11 @@ from topi.util import get_const_tuple from common import get_all_backend +_conv1d_transpose_ncw_implement = { + "generic": (topi.nn.conv1d_transpose_ncw, topi.generic.schedule_conv1d_transpose_ncw), + "gpu": (topi.cuda.conv1d_transpose_ncw, topi.cuda.schedule_conv1d_transpose_ncw) +} + def verify_conv1d_transpose_ncw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_width = in_size A = tvm.placeholder((batch, in_channel, in_width), name='A') @@ -49,10 +54,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return with tvm.target.create(device): - B = topi.nn.conv1d_transpose_ncw(A, W, stride, padding, A.dtype) + fcompute, fschedule = topi.testing.dispatch(device, _conv1d_transpose_ncw_implement) + B = fcompute(A, W, stride, padding, A.dtype) C = topi.nn.relu(B) - s1 = topi.generic.schedule_conv1d_transpose_ncw([B]) - s2 = topi.generic.schedule_conv1d_transpose_ncw([C]) + s1 = fschedule([B]) + s2 = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py index af585904293f..8a74b4f06cd2 100644 --- a/topi/tests/python/test_topi_conv2d_NCHWc.py +++ b/topi/tests/python/test_topi_conv2d_NCHWc.py @@ -98,16 +98,16 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), padding, - (dilation, dilation), - layout='NCHW%dc'%ic_block, - out_layout="NCHW%dc"%oc_block, - out_dtype=dtype) + C = topi.x86.conv2d_NCHWc(A, W, (stride, stride), padding, + (dilation, dilation), + 'NCHW%dc'%ic_block, + 
"NCHW%dc"%oc_block, + dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_NCHWc([C]) + s = topi.x86.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py index 35423a686e8f..086523e46013 100644 --- a/topi/tests/python/test_topi_conv2d_hwcn.py +++ b/topi/tests/python/test_topi_conv2d_hwcn.py @@ -24,6 +24,12 @@ from topi.util import get_const_tuple +_conv2d_hwcn_implement = { + "generic": (topi.nn.conv2d_hwcn, topi.generic.schedule_conv2d_hwcn), + "gpu": (topi.cuda.conv2d_hwcn, topi.cuda.schedule_conv2d_hwcn), + "opencl": (topi.cuda.conv2d_hwcn, topi.cuda.schedule_conv2d_hwcn), +} + def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size @@ -56,12 +62,13 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - t_conv = topi.nn.conv2d(A, W, stride, padding, dilation, layout='HWCN') + fcompute, fschedule = topi.testing.dispatch(device, _conv2d_hwcn_implement) + t_conv = fcompute(A, W, stride, padding, dilation) t_bias = topi.add(t_conv, B) t_relu = topi.nn.relu(t_bias) - s1 = topi.generic.schedule_conv2d_hwcn([t_conv]) - s2 = topi.generic.schedule_conv2d_hwcn([t_bias]) - s3 = topi.generic.schedule_conv2d_hwcn([t_relu]) + s1 = fschedule([t_conv]) + s2 = fschedule([t_bias]) + s3 = fschedule([t_relu]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py index 6cb66d013541..c36bfa331faf 100644 --- a/topi/tests/python/test_topi_conv2d_int8.py +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -82,13 +82,13 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d(A, W, (stride, stride), padding, (dilation, dilation), - layout='NCHW', out_dtype=dtype) + C = topi.cuda.conv2d_NCHWc_int8(A, W, (stride, stride), padding, (dilation, dilation), + 'NCHW', dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + s = topi.cuda.schedule_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index 85d733c5d546..a0258ec93bf2 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -66,18 +66,27 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + + if "cudnn" in device: + fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn + else: + fcompute, fschedule = topi.testing.get_conv2d_nchw_implement(device) + with tvm.target.create(device): - C = topi.nn.conv2d(A, W, (stride, stride), padding, - (dilation, dilation), layout='NCHW', out_dtype=dtype) + if "cudnn" in device: + C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype) + else: + C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) + c = 
tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py index 342f3190b702..2a5915ef0a53 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc.py +++ b/topi/tests/python/test_topi_conv2d_nhwc.py @@ -24,6 +24,16 @@ from topi.util import get_const_tuple + +_conv2d_nhwc_implement = { + "generic": (topi.nn.conv2d_nhwc, topi.generic.schedule_conv2d_nhwc), + "cpu": (topi.nn.conv2d_nhwc, topi.x86.schedule_conv2d_nhwc), + "arm_cpu": (topi.arm_cpu.conv2d_nhwc_spatial_pack, + topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + "hls": (topi.nn.conv2d_nhwc, topi.hls.schedule_conv2d_nhwc) +} + + def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size @@ -60,7 +70,8 @@ def check_device(device): func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm', 'cuda']: + # TODO(@alexgl-github): add cuda back after fixing conv2d_nhwc for cuda + for device in ['llvm']: check_device(device) diff --git a/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py b/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py index 763150ac425f..8267aad382e8 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py +++ b/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. """Example code to do convolution.""" -import os +import pytest import numpy as np + import tvm from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity @@ -56,7 +57,7 @@ def check_device(device): with tvm.target.create(device): B = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NHWC', out_dtype="int32") - s = topi.generic.schedule_conv2d_nhwc_pack([B]) + s = topi.x86.schedule_conv2d_nhwc_pack_int8([B]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) @@ -69,22 +70,12 @@ def check_device(device): check_device(device) -class DefaultFallback(autotvm.FallbackContext): - def _query_inside(self, target, workload): - key = (target, workload) - if key in self.memory: - return self.memory[key] - cfg = FallbackConfigEntity() - cfg.template_key = 'direct' - self.memory[key] = cfg - return cfg - - +# TODO(@llyfacebook): Please fix https://github.com/apache/incubator-tvm/issues/4122 to enable this test.
+@pytest.mark.skip def test_conv2d_nhwc(): - autotvm.DispatchContext.current.silent = True - with DefaultFallback(): - verify_conv2d_1x1_nhwc_pack_int8(1, 256, 32, 256, 1, 1, 0) + verify_conv2d_1x1_nhwc_pack_int8(1, 256, 32, 256, 1, 1, 0) if __name__ == "__main__": - test_conv2d_nhwc() + # test_conv2d_nhwc() + pass diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index fb836d43ccce..e8aabc61a4fa 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -24,6 +24,14 @@ from common import get_all_backend +_conv2d_transpose_nchw_implement = { + "generic": (topi.nn.conv2d_transpose_nchw, topi.generic.schedule_conv2d_transpose_nchw), + "cpu": (topi.x86.conv2d_transpose_nchw, topi.x86.schedule_conv2d_transpose_nchw), + "arm_cpu": (topi.arm_cpu.conv2d_transpose_nchw, topi.arm_cpu.schedule_conv2d_transpose_nchw), + "gpu": (topi.cuda.conv2d_transpose_nchw, topi.cuda.schedule_conv2d_transpose_nchw), + "hls": (topi.nn.conv2d_transpose_nchw, topi.hls.schedule_conv2d_transpose_nchw), +} + def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_height, in_width = in_size kernel_height, kernel_width = kernel @@ -54,13 +62,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - B = topi.nn.conv2d_transpose_nchw(A, W, - [stride_height, stride_width], - [pad_top, pad_left, pad_bottom, pad_right], - A.dtype) + fcompute, fschedule = topi.testing.dispatch(device, _conv2d_transpose_nchw_implement) + B = fcompute(A, W, + [stride_height, stride_width], + [pad_top, pad_left, pad_bottom, pad_right], + A.dtype) C = topi.nn.relu(B) - s1 = topi.generic.schedule_conv2d_transpose_nchw([B]) - s2 = topi.generic.schedule_conv2d_transpose_nchw([C]) + s1 = fschedule([B]) + s2 = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py index 350e62101689..2d12336e771a 100644 --- a/topi/tests/python/test_topi_conv2d_winograd.py +++ b/topi/tests/python/test_topi_conv2d_winograd.py @@ -27,6 +27,13 @@ from topi.util import get_const_tuple +_conv2d_nchw_winograd_implement = { + "arm_cpu": (topi.arm_cpu.conv2d_nchw_winograd, topi.arm_cpu.schedule_conv2d_nchw_winograd), + "cuda": (topi.cuda.conv2d_nchw_winograd, topi.cuda.schedule_conv2d_nchw_winograd), + "mali": (topi.mali.conv2d_nchw_winograd, topi.mali.schedule_conv2d_nchw_winograd), +} + + def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) @@ -67,12 +74,13 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype) + fcompute, fschedule = topi.testing.dispatch(device, _conv2d_nchw_winograd_implement) + C = fcompute(A, W, stride, padding, dilation, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv2d_nchw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) @@ -93,61 +101,45 @@ def check_device(device): 
check_device(device) -class WinogradFallback(autotvm.FallbackContext): - def _query_inside(self, target, workload): - key = (target, workload) - if key in self.memory: - return self.memory[key] - cfg = FallbackConfigEntity() - cfg.template_key = 'winograd' - self.memory[key] = cfg - cfg.is_fallback = False - return cfg - - def test_conv2d_nchw(): - autotvm.DispatchContext.current.silent = True - - with WinogradFallback(): - - # inception v3 workloads - verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=['cuda']) - verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3, devices=['cuda']) - verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3, devices=['cuda']) - - # resnet 18 workloads - verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) - verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) - verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) - verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) - verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=['cuda']) - - # batch size = 2 - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1) - - # relu, bias - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True) - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True) - verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True) - - # werid workloads - verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1) - verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1) - verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1) - - # Asymmetric padding - verify_conv2d_nchw(1, 48, 56, 48, 3, 1, (1, 1, 1, 1)) - verify_conv2d_nchw(1, 64, 28, 64, 3, 1, (1, 1, 1, 1)) - verify_conv2d_nchw(1, 128, 14, 128, 3, 1, (1, 1)) - verify_conv2d_nchw(1, 512, 7, 512, 3, 1, "SAME") - verify_conv2d_nchw(2, 13, 71, 59, 3, 1, (1, 1, 1, 1)) - verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1, 1, 1), add_bias=True) - verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1), add_relu=True) - verify_conv2d_nchw(2, 48, 56, 48, 3, 1, "SAME", add_relu=True, add_bias=True) - verify_conv2d_nchw(1, 64, 17, 192, 7, 1, (3, 1), devices=['cuda']) - verify_conv2d_nchw(1, 64, 17, 64, 7, 1, (3, 3, 2, 2), devices=['cuda']) - verify_conv2d_nchw(1, 160, 17, 160, 7, 1, "SAME", devices=['cuda']) - verify_conv2d_nchw(1, 48, 35, 48, 5, 1, "VALID", devices=['cuda']) + # inception v3 workloads + verify_conv2d_nchw(1, 128, 17, 192, 7, 1, 3, devices=['cuda']) + verify_conv2d_nchw(1, 128, 17, 128, 7, 1, 3, devices=['cuda']) + verify_conv2d_nchw(1, 160, 17, 160, 7, 1, 3, devices=['cuda']) + + # resnet 18 workloads + verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1) + verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1) + verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1) + verify_conv2d_nchw(1, 48, 35, 64, 5, 1, 2, devices=['cuda']) + + # batch size = 2 + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1) + + # relu, bias + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True) + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True) + verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True) + + # weird workloads + verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1) + verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1) + verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1) + + # Asymmetric padding + verify_conv2d_nchw(1, 48, 56, 48, 3, 1, (1, 1, 1, 1)) + verify_conv2d_nchw(1, 64, 28, 64, 3, 1, (1, 1, 1, 1)) + verify_conv2d_nchw(1, 128, 14, 128, 3, 1, (1, 1)) + verify_conv2d_nchw(1, 512, 7, 512, 3, 1, "SAME") + verify_conv2d_nchw(2, 13, 71, 59, 3, 1, (1, 1, 1, 1)) + verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1, 1, 1), add_bias=True) + verify_conv2d_nchw(2, 48, 56, 48, 3, 1, (1, 1), add_relu=True) + verify_conv2d_nchw(2, 48, 56, 48, 3, 1, "SAME", 
add_relu=True, add_bias=True) + verify_conv2d_nchw(1, 64, 17, 192, 7, 1, (3, 1), devices=['cuda']) + verify_conv2d_nchw(1, 64, 17, 64, 7, 1, (3, 3, 2, 2), devices=['cuda']) + verify_conv2d_nchw(1, 160, 17, 160, 7, 1, "SAME", devices=['cuda']) + verify_conv2d_nchw(1, 48, 35, 48, 5, 1, "VALID", devices=['cuda']) if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_conv3d_ncdhw.py b/topi/tests/python/test_topi_conv3d_ncdhw.py index 92b1068a11ec..6c60c27ed426 100644 --- a/topi/tests/python/test_topi_conv3d_ncdhw.py +++ b/topi/tests/python/test_topi_conv3d_ncdhw.py @@ -27,6 +27,11 @@ from common import get_all_backend +_conv3d_ncdhw_implement = { + "generic": (topi.nn.conv3d_ncdhw, topi.generic.schedule_conv3d_ncdhw), + "gpu": (topi.cuda.conv3d_ncdhw, topi.cuda.schedule_conv3d_ncdhw), +} + def verify_conv3d_ncdhw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False): pad_front, pad_top, pad_left, pad_back, pad_bottom, pad_right = get_pad_tuple3d(padding, (kernel, kernel, kernel)) padding_sum = pad_front + pad_back + pad_top + pad_left + pad_bottom + pad_right @@ -65,14 +70,15 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ncdhw_implement) with tvm.target.create(device): - C = topi.nn.conv3d(A, W, (stride, stride, stride), padding, - (dilation, dilation, dilation), layout='NCDHW', out_dtype=dtype) + C = fcompute(A, W, (stride, stride, stride), padding, + (dilation, dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_conv3d_ncdhw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_conv3d_ndhwc.py b/topi/tests/python/test_topi_conv3d_ndhwc.py index c613f68d062e..7e2f02cea20a 100644 --- a/topi/tests/python/test_topi_conv3d_ndhwc.py +++ b/topi/tests/python/test_topi_conv3d_ndhwc.py @@ -23,6 +23,13 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple +from common import get_all_backend + +_conv3d_ndhwc_implement = { + "generic": (topi.nn.conv3d_ndhwc, topi.generic.schedule_conv3d_ndhwc), + "cpu": (topi.x86.conv3d_ndhwc, topi.x86.schedule_conv3d_ndhwc), + "gpu": (topi.cuda.conv3d_ndhwc, topi.cuda.schedule_conv3d_ndhwc), +} def verify_conv3d_ndhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): if isinstance(in_size, tuple): @@ -51,13 +58,15 @@ def get_ref_data(): a_np, w_np, b_np = get_ref_data() def check_device(device): - if not tvm.runtime.enabled(device): + ctx = tvm.context(device, 0) + if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _conv3d_ndhwc_implement) with tvm.target.create(device): - B = topi.nn.conv3d(A, W, stride, padding, dilation, layout="NDHWC") - s = topi.generic.schedule_conv3d_ndhwc([B]) + B = fcompute(A, W, stride, padding, dilation, dtype) + s = fschedule([B]) ctx = tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) @@ -66,7 +75,7 @@ def check_device(device): func(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm']: + for device in get_all_backend(): check_device(device) diff --git a/topi/tests/python/test_topi_deformable_conv2d.py b/topi/tests/python/test_topi_deformable_conv2d.py 
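The conv3d refactors above replace hard-coded `topi.nn` computes and `topi.generic` schedules with per-target dispatch tables such as `_conv3d_ncdhw_implement`, resolved through `topi.testing.dispatch`. A minimal sketch of how such a helper can map a target to a (compute, schedule) pair; the real `topi.testing.dispatch` may differ in details:

    import tvm

    def dispatch(target, dispatch_map):
        """Resolve a target to an entry in dispatch_map (illustrative sketch)."""
        if isinstance(target, str):
            target = tvm.target.create(target)
        # Each target carries keys such as "cpu" or "gpu" (e.g. "cuda" targets
        # also carry "gpu"), which the tables above key on; fall back to the
        # generic implementation when no key matches.
        for key in target.keys:
            if key in dispatch_map:
                return dispatch_map[key]
        return dispatch_map["generic"]

    # Usage mirroring the conv3d tests above:
    #   fcompute, fschedule = dispatch(device, _conv3d_ncdhw_implement)
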
index 45222b6bd489..1b1a0327a3d5 100644 --- a/topi/tests/python/test_topi_deformable_conv2d.py +++ b/topi/tests/python/test_topi_deformable_conv2d.py @@ -25,6 +25,11 @@ from common import get_all_backend +_deformable_conv2d_implement = { + "generic": (topi.nn.deformable_conv2d_nchw, topi.generic.schedule_deformable_conv2d_nchw), + "cuda": (topi.cuda.deformable_conv2d_nchw, topi.cuda.schedule_deformable_conv2d_nchw), +} + def verify_deformable_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, deformable_groups=1, groups=1): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, deformable_groups, groups)) @@ -60,10 +65,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _deformable_conv2d_implement) with tvm.target.create(device): - C = topi.nn.deformable_conv2d_nchw(A, Offset, W, stride, padding, dilation, - deformable_groups, groups, out_dtype=dtype) - s = topi.generic.schedule_deformable_conv2d_nchw([C]) + C = fcompute(A, Offset, W, stride, padding, dilation, + deformable_groups, groups, dtype) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) offset = tvm.nd.array(offset_np, ctx) diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py index 3b747712a173..d729e4330e52 100644 --- a/topi/tests/python/test_topi_dense.py +++ b/topi/tests/python/test_topi_dense.py @@ -24,6 +24,19 @@ from common import get_all_backend, Int8Fallback +_dense_implement = { + "generic": [(topi.nn.dense, topi.generic.schedule_dense)], + "cpu": [(topi.x86.dense_nopack, topi.x86.schedule_dense_nopack), + (topi.x86.dense_pack, topi.x86.schedule_dense_pack)], + "gpu": [(topi.cuda.dense_small_batch, topi.cuda.schedule_dense_small_batch), + (topi.cuda.dense_large_batch, topi.cuda.schedule_dense_large_batch)], + "mali": [(topi.mali.dense, topi.mali.schedule_dense)], + "bifrost": [(topi.bifrost.dense, topi.bifrost.schedule_dense)], + "opengl": [(topi.nn.dense, topi.opengl.schedule_dense)], + "rocm": [(topi.rocm.dense, topi.rocm.schedule_dense)], + "hls": [(topi.nn.dense, topi.hls.schedule_dense)], +} + def verify_dense(batch, in_dim, out_dim, use_bias=True): A = tvm.placeholder((batch, in_dim), name='A') B = tvm.placeholder((out_dim, in_dim), name='B') @@ -50,17 +63,18 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - with tvm.target.create(device): - D = topi.nn.dense(A, B, C if use_bias else None) - D = topi.nn.relu(D) - s = topi.generic.schedule_dense([D]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(b_np, ctx) - c = tvm.nd.array(c_np, ctx) - d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) - f = tvm.build(s, [A, B, C, D], device, name="dense") - f(a, b, c, d) - tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) + for fcompute, fschedule in topi.testing.dispatch(device, _dense_implement): + with tvm.target.create(device): + D = fcompute(A, B, C if use_bias else None) + D = topi.nn.relu(D) + s = fschedule([D]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(c_np, ctx) + d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx) + f = tvm.build(s, [A, B, C, D], device, name="dense") + f(a, b, c, d) + tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5) for device in get_all_backend(): 
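Note that `_dense_implement` above maps each target key to a *list* of (compute, schedule) pairs, so a single verify run exercises every registered strategy, e.g. both `dense_nopack` and `dense_pack` on x86, and both the small-batch and large-batch kernels on CUDA. A sketch of the resulting loop shape (names taken from the diff; the build-and-compare step is elided):

    # Each target may expose several implementations; test them all in turn.
    for fcompute, fschedule in topi.testing.dispatch(device, _dense_implement):
        with tvm.target.create(device):
            D = topi.nn.relu(fcompute(A, B, C if use_bias else None))
            s = fschedule([D])
        # build the function and compare against the numpy reference,
        # exactly as in the dense test body above
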
check_device(device) @@ -99,9 +113,9 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - D = topi.nn.dense(A, B, C if use_bias else None, out_dtype=out_dtype) + D = topi.cuda.dense_int8(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) - s = topi.generic.schedule_dense([D]) + s = topi.cuda.schedule_dense_int8([D]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(c_np, ctx) diff --git a/topi/tests/python/test_topi_depth_to_space.py b/topi/tests/python/test_topi_depth_to_space.py index 4e895cb5db55..693bfb624042 100644 --- a/topi/tests/python/test_topi_depth_to_space.py +++ b/topi/tests/python/test_topi_depth_to_space.py @@ -56,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py index d34d56e7fc86..7efe5a21578c 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d.py +++ b/topi/tests/python/test_topi_depthwise_conv2d.py @@ -25,6 +25,24 @@ from common import get_all_backend +_depthwise_conv2d_nchw_implement = { + "generic": [(topi.nn.depthwise_conv2d_nchw, topi.generic.schedule_depthwise_conv2d_nchw)], + "arm_cpu": [(topi.arm_cpu.depthwise_conv2d_nchw, topi.arm_cpu.schedule_depthwise_conv2d_nchw), + (topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack, + topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack)], + "gpu": [(topi.cuda.depthwise_conv2d_nchw, topi.cuda.schedule_depthwise_conv2d_nchw)], + "mali": [(topi.mali.depthwise_conv2d_nchw, topi.mali.schedule_depthwise_conv2d_nchw)], + "bifrost": [(topi.nn.depthwise_conv2d_nchw, topi.bifrost.schedule_depthwise_conv2d_nchw)], + "intel_graphics": [(topi.intel_graphics.depthwise_conv2d_nchw, + topi.intel_graphics.schedule_depthwise_conv2d_nchw)], +} + +_depthwise_conv2d_nhwc_implement = { + "generic": (topi.nn.depthwise_conv2d_nhwc, topi.generic.schedule_depthwise_conv2d_nhwc), + "gpu": (topi.nn.depthwise_conv2d_nhwc, topi.cuda.schedule_depthwise_conv2d_nhwc), +} + + def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1): in_width = in_height filter_channel = in_channel @@ -53,68 +71,75 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - with tvm.target.create(device): - # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, - (stride_h, stride_w), padding_args, dilation, dtype) - ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) - Relu = topi.nn.relu(ScaleShift) - # schedule - s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) - s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift) - s3 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) - # build the kernels - f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) - f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) - f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) - - # Prepare pod type for test data closure - input_shape = get_const_tuple(Input.shape) - filter_shape = get_const_tuple(Filter.shape) - scale_shape = get_const_tuple(Scale.shape) - shift_shape = 
get_const_tuple(Shift.shape) - scale_shift_shape = get_const_tuple(ScaleShift.shape) - - # Use memoize, pickle the test data for next time use. - @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") - def get_ref_data(): - input_np = np.random.uniform(size=input_shape).astype(dtype) - filter_np = np.random.uniform(size=filter_shape).astype(dtype) - dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation)) - scale_np = np.random.uniform(size=scale_shape).astype(dtype) - shift_np = np.random.uniform(size=shift_shape).astype(dtype) - # correctness with scipy - depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( - input_np, dilated_filter_np, stride, padding) - scale_shift_scipy = np.zeros(shape=scale_shift_shape) - for c in range(in_channel * channel_multiplier): - scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] - relu_scipy = np.maximum(scale_shift_scipy, 0) - return (input_np, filter_np, scale_np, shift_np, - depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) - # Get the test data - (input_np, filter_np, scale_np, shift_np, - depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() - input_tvm = tvm.nd.array(input_np, ctx) - filter_tvm = tvm.nd.array(filter_np, ctx) - scale_tvm = tvm.nd.array(scale_np, ctx) - shift_tvm = tvm.nd.array(shift_np, ctx) - depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) - scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) - relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) - # launch kernel 1 (depthwise_conv2d) - timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) - tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean - # launch kernel 2 (depthwise_conv2d + scale_shift) - timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) - tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean - # launch kernel 3 (depthwise_conv2d + scale_shift + relu) - timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) - tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean - tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) - tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) - tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) + impl_list = topi.testing.dispatch(device, _depthwise_conv2d_nchw_implement)[:] + if device == "llvm" and channel_multiplier == 1 and dilation == 1: + impl_list.append((topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw)) + + for fcompute, fschedule in impl_list: + with tvm.target.create(device): + # declare + DepthwiseConv2d = fcompute(Input, Filter, (stride_h, stride_w), + padding_args, dilation, dtype) + ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) + Relu = topi.nn.relu(ScaleShift) + # schedule + s1 = fschedule(DepthwiseConv2d) + s2 = fschedule(ScaleShift) + s3 = fschedule(Relu) + # build the kernels + f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) + f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) + f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) + + # Prepare pod type for test data closure + input_shape = get_const_tuple(Input.shape) + filter_shape = get_const_tuple(Filter.shape) + scale_shape = 
get_const_tuple(Scale.shape) + shift_shape = get_const_tuple(Shift.shape) + scale_shift_shape = get_const_tuple(ScaleShift.shape) + + # Use memoize, pickle the test data for next time use. + @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") + def get_ref_data(): + input_np = np.random.uniform(size=input_shape).astype(dtype) + filter_np = np.random.uniform(size=filter_shape).astype(dtype) + dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation)) + scale_np = np.random.uniform(size=scale_shape).astype(dtype) + shift_np = np.random.uniform(size=shift_shape).astype(dtype) + # correctness with scipy + depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( + input_np, dilated_filter_np, stride, padding) + scale_shift_scipy = np.zeros(shape=scale_shift_shape) + for c in range(in_channel * channel_multiplier): + scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] + relu_scipy = np.maximum(scale_shift_scipy, 0) + return (input_np, filter_np, scale_np, shift_np, + depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) + + # Get the test data + (input_np, filter_np, scale_np, shift_np, + depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() + + input_tvm = tvm.nd.array(input_np, ctx) + filter_tvm = tvm.nd.array(filter_np, ctx) + scale_tvm = tvm.nd.array(scale_np, ctx) + shift_tvm = tvm.nd.array(shift_np, ctx) + depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) + scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) + relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) + # launch kernel 1 (depthwise_conv2d) + timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) + tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean + # launch kernel 2 (depthwise_conv2d + scale_shift) + timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) + tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean + # launch kernel 3 (depthwise_conv2d + scale_shift + relu) + timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) + tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean + tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) + tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) + tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) for device in get_all_backend(): with autotvm.tophub.context(device): # load tophub pre-tuned parameters @@ -150,16 +175,17 @@ def check_device(device): return print("Running on target: %s" % device) + fcompute, fschedule = topi.testing.dispatch(device, _depthwise_conv2d_nhwc_implement) with tvm.target.create(device): # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter, + DepthwiseConv2d = fcompute(Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # schedule - s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) - s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift) - s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu) + s1 = fschedule(DepthwiseConv2d) + s2 = fschedule(ScaleShift) + s3 = fschedule(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = 
tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) @@ -242,6 +268,7 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m stride_h = stride_w = stride assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation." + assert channel_multiplier == 1, "depthwise_conv2d_NCHWc currently does not support channel multiplier > 1." pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) padding_args = (pad_h, pad_w) @@ -276,17 +303,17 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): # declare - DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, - (stride_h, stride_w), - padding_args, - (dilation, dilation), - in_layout, - out_layout, dtype) + DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc(Input, Filter, + (stride_h, stride_w), + padding_args, + (dilation, dilation), + in_layout, + out_layout, dtype) # TODO: add scale_shift implement for NCHWc and add test here Relu = topi.nn.relu(DepthwiseConv2d) # schedule - s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) - s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) + s1 = topi.x86.schedule_depthwise_conv2d_NCHWc(DepthwiseConv2d) + s2 = topi.x86.schedule_depthwise_conv2d_NCHWc(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Relu], device) @@ -319,7 +346,6 @@ def get_ref_data(): dtype=DepthwiseConv2d.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) - print(filter_tvm.shape) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) f2(input_tvm, filter_tvm, relu_tvm) @@ -363,9 +389,7 @@ def test_depthwise_conv2d(): # NCHW[x]c depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME") - depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "SAME") depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID") - depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "VALID") if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py index 0e176780023d..3904db7d2b23 100644 --- a/topi/tests/python/test_topi_group_conv2d.py +++ b/topi/tests/python/test_topi_group_conv2d.py @@ -28,6 +28,12 @@ from common import get_all_backend, Int8Fallback +_group_conv2d_nchw_implement = { + "generic": (topi.nn.group_conv2d_nchw, topi.generic.schedule_group_conv2d_nchw), + "gpu": (topi.cuda.group_conv2d_nchw, topi.cuda.schedule_group_conv2d_nchw), +} + + def verify_group_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False): print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, @@ -70,12 +76,13 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) + fcompute, fschedule = topi.testing.dispatch(device, _group_conv2d_nchw_implement) + C = fcompute(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_group_conv2d_nchw([C]) + s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) @@ -149,12 +156,12 @@ def check_device(device): print("Running on target: %s" % device) with 
tvm.target.create(device): - C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype) + C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.generic.schedule_group_conv2d_nchw([C]) + s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py b/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py index 3717534b85ff..08f136e5ae23 100644 --- a/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py @@ -81,12 +81,12 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), - (dilation, dilation), - layout='NCHW%dc'%ic_block, - out_layout="NCHW%dc"%oc_block, - out_dtype=dtype) - s = topi.generic.schedule_conv2d_NCHWc([C]) + C = topi.x86.conv2d_NCHWc(A, W, (stride, stride), (padding, padding), + (dilation, dilation), + 'NCHW%dc'%ic_block, + "NCHW%dc"%oc_block, + dtype) + s = topi.x86.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) diff --git a/topi/tests/python/test_topi_image.py b/topi/tests/python/test_topi_image.py index 21935cb911da..4297638b3dfe 100644 --- a/topi/tests/python/test_topi_image.py +++ b/topi/tests/python/test_topi_image.py @@ -52,7 +52,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -116,7 +116,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -176,7 +176,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(out) + s = topi.testing.get_injective_schedule(device)(out) tvm_images = tvm.nd.array(np_images, ctx) tvm_boxes = tvm.nd.array(np_boxes, ctx) tvm_indices = tvm.nd.array(np_box_indices, ctx) diff --git a/topi/tests/python/test_topi_lrn.py b/topi/tests/python/test_topi_lrn.py index 53139cdf10c6..4cb3c7581800 100644 --- a/topi/tests/python/test_topi_lrn.py +++ b/topi/tests/python/test_topi_lrn.py @@ -21,6 +21,16 @@ from topi.util import get_const_tuple import topi.testing +_lrn_schedule = { + "generic": topi.generic.schedule_lrn, + "gpu": topi.cuda.schedule_lrn, + "opencl": topi.cuda.schedule_lrn, + "metal": topi.cuda.schedule_lrn, + "rocm": topi.cuda.schedule_lrn, + "vulkan": topi.cuda.schedule_lrn, + "nvptx": topi.cuda.schedule_lrn, +} + def verify_lrn(shape, size, axis, bias, alpha, beta): A = tvm.placeholder(shape, name='A') B = topi.nn.lrn(A, size, axis, alpha, beta, bias) @@ -35,10 +45,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - if device == 'llvm': - s = topi.generic.schedule_lrn([B]) - else: - s = topi.cuda.schedule_lrn([B]) + s_func = topi.testing.dispatch(device, _lrn_schedule) + s = s_func([B]) ctx 
= tvm.context(device, 0) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index 5bb95ba10e3b..debc3efe0d27 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -62,23 +62,15 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=name) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - check_device('llvm') - check_device('cuda') - check_device('opencl') - check_device('metal') - check_device('rocm') - check_device('vulkan') - check_device('nvptx') - check_device('llvm -device=arm-cpu') - check_device('opencl -device=mali') - check_device('aocl_sw_emu') + for target in get_all_backend(): + check_device(target) def test_isnan( low, @@ -110,23 +102,15 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="isnan") a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros_like(b_np), ctx) foo(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5) - check_device('llvm') - check_device('cuda') - check_device('opencl') - check_device('metal') - check_device('rocm') - check_device('vulkan') - check_device('nvptx') - check_device('llvm -device=arm-cpu') - check_device('opencl -device=mali') - check_device('aocl_sw_emu') + for target in get_all_backend(): + check_device(target) test_apply(topi.floor, "floor", np.floor, -100, 100) test_apply(topi.ceil, "ceil", np.ceil, -100, 100) @@ -168,7 +152,7 @@ def verify(from_dtype, to_dtype, low=-100, high=100): continue print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device) a = tvm.nd.array(a_np, ctx) b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx) diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index cb81814e87f9..084a2c7c7671 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -23,6 +23,25 @@ from topi.util import get_const_tuple from common import get_all_backend +_pool_schedule = { + "generic": topi.generic.schedule_pool, + "cpu": topi.x86.schedule_pool, + "gpu": topi.cuda.schedule_pool, + "hls": topi.hls.schedule_pool, +} + +_adaptive_pool_schedule = { + "generic": topi.generic.schedule_adaptive_pool, + "cpu": topi.x86.schedule_adaptive_pool, + "gpu": topi.cuda.schedule_adaptive_pool, + "hls": topi.hls.schedule_adaptive_pool, +} + +_pool_grad_schedule = { + "generic": topi.generic.schedule_pool_grad, + "gpu": topi.cuda.schedule_pool_grad, +} + def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True): iw = ih kw = kh @@ -74,7 +93,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B, layout) + s_func = topi.testing.dispatch(device, _pool_schedule) + s = s_func(B, layout) a = tvm.nd.array(a_np, ctx) b = 
tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) @@ -129,7 +149,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool_grad(PoolGrad) + s_func = topi.testing.dispatch(device, _pool_grad_schedule) + s = s_func(PoolGrad) a = tvm.nd.array(a_np, ctx) out_grad = tvm.nd.array(out_grad_np, ctx) @@ -201,7 +222,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_adaptive_pool(B) + s_func = topi.testing.dispatch(device, _adaptive_pool_schedule) + s = s_func(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) f = tvm.build(s, [A, B], device) @@ -255,7 +277,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_adaptive_pool(out) + s_func = topi.testing.dispatch(device, _adaptive_pool_schedule) + s = s_func(out) a = tvm.nd.array(np_data, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(oshape), dtype=out.dtype), ctx) f = tvm.build(s, [data, out], device) @@ -298,7 +321,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B, layout) + s_func = topi.testing.dispatch(device, _pool_schedule) + s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) @@ -350,7 +374,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B, layout) + s_func = topi.testing.dispatch(device, _pool_schedule) + s = s_func(B, layout) a = tvm.nd.array(input_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index d266cfc6ceb5..751025bf82b8 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -19,6 +19,7 @@ import numpy as np import tvm import topi +import topi.testing from common import get_all_backend @@ -74,7 +75,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_reduce(B) + s = topi.testing.get_reduce_schedule(device)(B) foo = tvm.build(s, [A, B], device, name=type) # Test diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index 8868d4ebffe3..8ef354907691 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -19,8 +19,10 @@ import numpy as np import tvm import topi +import topi.testing from topi.util import get_const_tuple from tvm.contrib.nvcc import have_fp16 + from common import get_all_backend def verify_relu(m, n, dtype="float32"): @@ -40,7 +42,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_elemwise(B) + s = topi.testing.get_elemwise_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py index 4edb0a195e22..c4cd2b5d0eb8 100644 --- a/topi/tests/python/test_topi_reorg.py +++ b/topi/tests/python/test_topi_reorg.py @@ -21,6 +21,11 @@ import tvm import topi.testing +_reorg_schedule = { + "generic": 
topi.generic.schedule_reorg, + "gpu": topi.cuda.schedule_reorg, +} + def verify_reorg(batch, in_size, in_channel, stride): '''Verify reorg operator by comparing outputs from tvm and numpy implementation''' in_height = in_width = in_size @@ -46,10 +51,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - if device == 'llvm': - s = topi.generic.schedule_reorg([B]) - else: - s = topi.cuda.schedule_reorg([B]) + s_func = topi.testing.dispatch(device, _reorg_schedule) + s = s_func([B]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) func = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py index 4836eef07508..5396b6beef81 100644 --- a/topi/tests/python/test_topi_softmax.py +++ b/topi/tests/python/test_topi_softmax.py @@ -25,6 +25,14 @@ from common import get_all_backend +_softmax_schedule = { + "generic": topi.generic.schedule_softmax, + "cpu": topi.x86.schedule_softmax, + "gpu": topi.cuda.schedule_softmax, + "hls": topi.hls.schedule_softmax, + "opengl": topi.opengl.schedule_softmax, +} + def check_device(A, B, a_np, b_np, device, name): ctx = tvm.context(device, 0) if not ctx.exist: @@ -32,11 +40,12 @@ def check_device(A, B, a_np, b_np, device, name): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_softmax(B) + s_func = topi.testing.dispatch(device, _softmax_schedule) + s = s_func(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - f = tvm.build(s, [A, B], device, name="softmax") + f = tvm.build(s, [A, B], device, name=name) f(a, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) @@ -50,7 +59,7 @@ def verify_softmax(m, n, dtype="float32"): a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.softmax_python(a_np) - for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: + for device in get_all_backend(): check_device(A, B, a_np, b_np, device, "softmax") def verify_softmax_4d(shape, dtype="float32"): @@ -62,7 +71,7 @@ def verify_softmax_4d(shape, dtype="float32"): b_np = topi.testing.softmax_python(a_np.transpose(0, 2, 3, 1).reshape(h*w, c)) b_np = b_np.reshape(1, h, w, c).transpose(0, 3, 1, 2) - for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: + for device in get_all_backend(): check_device(A, B, a_np, b_np, device, "softmax") def test_softmax(): diff --git a/topi/tests/python/test_topi_sort.py b/topi/tests/python/test_topi_sort.py index 0ad4e987d17d..74e55ec248d9 100644 --- a/topi/tests/python/test_topi_sort.py +++ b/topi/tests/python/test_topi_sort.py @@ -21,6 +21,15 @@ import topi import topi.testing +_argsort_implement = { + "generic": (topi.argsort, topi.generic.schedule_argsort), + "gpu": (topi.cuda.argsort, topi.cuda.schedule_argsort), +} + +_topk_implement = { + "generic": (topi.topk, topi.generic.schedule_topk), + "gpu": (topi.cuda.topk, topi.cuda.schedule_topk), +} def verify_argsort(axis, is_ascend): dshape = (20, 100) @@ -48,8 +57,9 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.argsort(data, axis=axis, is_ascend=is_ascend) - s = topi.generic.schedule_argsort(out) + fcompute, fschedule = topi.testing.dispatch(device, _argsort_implement) + out = fcompute(data, axis=axis, is_ascend=is_ascend) + s = fschedule(out) tvm_data = 
tvm.nd.array(np_data, ctx) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), ctx) @@ -91,9 +101,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - outs = topi.topk(data, k, axis, ret_type, is_ascend, dtype) + fcompute, fschedule = topi.testing.dispatch(device, _topk_implement) + outs = fcompute(data, k, axis, ret_type, is_ascend, dtype) outs = outs if isinstance(outs, list) else [outs] - s = topi.generic.schedule_topk(outs) + s = fschedule(outs) tvm_data = tvm.nd.array(np_data, ctx) tvm_res = [] for t in outs: diff --git a/topi/tests/python/test_topi_space_to_depth.py b/topi/tests/python/test_topi_space_to_depth.py index b25cad194301..99a798e733ee 100644 --- a/topi/tests/python/test_topi_space_to_depth.py +++ b/topi/tests/python/test_topi_space_to_depth.py @@ -56,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_tensor.py b/topi/tests/python/test_topi_tensor.py index 8e7073f4060b..05098421c561 100644 --- a/topi/tests/python/test_topi_tensor.py +++ b/topi/tests/python/test_topi_tensor.py @@ -18,6 +18,7 @@ import numpy as np import tvm import topi +import topi.testing from tvm.contrib.pickle_memoize import memoize from tvm.contrib.nvcc import have_fp16 @@ -98,7 +99,7 @@ def check_device(device): A = tvm.placeholder((n, m), name='A', dtype=dtype) B = tvm.compute((n, m), lambda i, j: A[i, j] + tvm.const(1, A.dtype), name='B') - S = topi.generic.schedule_elemwise(B) + S = topi.testing.get_elemwise_schedule(device)(B) fun = tvm.build(S, [A, B], device) np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom( diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index fd04fc4b0965..880e86d205e7 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -33,7 +33,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="expand_dims") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = data_npy.reshape(out_shape) @@ -59,7 +59,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_elemwise(B) + s = topi.testing.get_elemwise_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="reinterpret") data_npy = generator(in_shape).astype(in_dtype) out_npy = data_npy.view(B.dtype) @@ -82,7 +82,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="transpose") data_npy = np.arange(np.prod(in_shape)).reshape(in_shape).astype(A.dtype) out_npy = data_npy.transpose(axes) @@ -105,7 +105,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="reshape") data_npy = 
np.random.normal(size=src_shape).astype(A.dtype) out_npy = np.reshape(data_npy, newshape=dst_shape) @@ -128,7 +128,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="squeeze") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -143,6 +143,19 @@ def check_device(device): check_device(device) def verify_concatenate(shapes, axis): + + def get_concat_schedule(target): + schedule_map = { + "cpu": topi.x86.schedule_concatenate, + "arm_cpu": topi.arm_cpu.schedule_concatenate, + } + if isinstance(target, str): + target = tvm.target.create(target) + for key in target.keys: + if key in schedule_map: + return schedule_map[key] + return topi.testing.get_injective_schedule(target) + tensor_l = [] for i, shape in enumerate(shapes): tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) @@ -154,7 +167,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_concatenate(out_tensor) + s = get_concat_schedule(device)(out_tensor) foo = tvm.build(s, tensor_l + [out_tensor], device, name="concatenate") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -179,7 +192,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(out_tensor) + s = topi.testing.get_broadcast_schedule(device)(out_tensor) foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack") data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] @@ -203,7 +216,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(tensor_l) + s = topi.testing.get_injective_schedule(device)(tensor_l) foo = tvm.build(s, [A] + list(tensor_l), device, name="split") data_npy = np.random.normal(size=src_shape).astype(A.dtype) @@ -262,7 +275,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="reverse") x_np = np.random.uniform(size=in_shape).astype(A.dtype) @@ -293,7 +306,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(out_tensor) + s = topi.testing.get_injective_schedule(device)(out_tensor) foo = tvm.build(s, [A] + [indices] + [out_tensor] , device, name="take") shape_size = 1 @@ -328,7 +341,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="stride_slice") x_np = np.random.uniform(size=in_shape).astype(A.dtype) @@ -360,7 +373,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) if strides is not None: foo = tvm.build(s, [A, V, b, e, st, B], device, name="stride_set") @@ -402,7 +415,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = 
topi.generic.schedule_injective(out_tensor) + s = topi.testing.get_injective_schedule(device)(out_tensor) func = tvm.build(s, [A, indices, out_tensor] , device, name="take") shape_size = 1 @@ -441,7 +454,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(A) + s = topi.testing.get_injective_schedule(device)(A) f = tvm.build(s, [A], device, name="arange") a_nd = tvm.nd.empty(a_np.shape, dtype='float32', ctx=ctx) f(a_nd) @@ -460,7 +473,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="repeat") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.repeat(data_npy, repeats, axis) @@ -482,7 +495,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(B) + s = topi.testing.get_broadcast_schedule(device)(B) foo = tvm.build(s, [A, B], device, name="tile") data_npy = np.random.uniform(size=in_shape).astype(A.dtype) out_npy = np.tile(data_npy, reps) @@ -507,7 +520,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_broadcast(C) + s = topi.testing.get_broadcast_schedule(device)(C) f = tvm.build(s, [Cond, A, B, C], device, name="where") cond_npy = np.random.uniform(low=-1, high=1, size=in_shape).astype(dtype) x_npy = np.random.uniform(size=in_shape).astype(dtype) @@ -535,7 +548,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(one_hot_result) + s = topi.testing.get_injective_schedule(device)(one_hot_result) fn = tvm.build(s, [indices, one_hot_result], device, name="one_hot") indices_npy = np.random.randint(0, depth, size=indices_shape).astype(indices.dtype) out_npy = topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype) @@ -618,7 +631,7 @@ def test_squeeze(): ctx = tvm.context(device, 0) if ctx.exist: with tvm.target.create(device): - s = topi.generic.schedule_injective(C) + s = topi.testing.get_injective_schedule(device)(C) func = tvm.build(s, [A, C]) a = tvm.nd.array(np.array((1, 2)).astype('float32'), ctx=ctx) c = tvm.nd.empty((1,), dtype='float32', ctx=ctx) @@ -741,7 +754,7 @@ def check_device(device): tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) f = tvm.build(s, [A, B], device, name="layout_transform") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -768,7 +781,7 @@ def check_device(device): tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) f = tvm.build(s, [A, B], device, name="shape") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -800,7 +813,7 @@ def check_device(device): tvm_C = tvm.nd.empty(in_shape, ctx=ctx, dtype="float32") print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(C) + s = 
topi.testing.get_injective_schedule(device)(C) f = tvm.build(s, [A, B, C], device, name="SequenceMask") f(tvm_A, tvm_B, tvm_C) tvm.testing.assert_allclose(tvm_C.asnumpy(), C_gt_data) @@ -825,7 +838,7 @@ def check_device(device): tvm_output = tvm.nd.empty((1,), ctx=ctx, dtype=B.dtype) print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) f = tvm.build(s, [A, B], device, name="ndarray_size") f(tvm_input, tvm_output) tvm.testing.assert_allclose(tvm_output.asnumpy(), output) @@ -843,9 +856,10 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + conv2d_compute, conv2d_schedule = topi.testing.get_conv2d_nchw_implement(device) data = tvm.placeholder((2, 1, 2, 4), 'int8', 'data') w = tvm.placeholder((3, 1, 2, 2), 'int8', 'w') - conv1 = topi.nn.conv2d(data, w, 1, 0, 1, out_dtype='int32') + conv1 = conv2d_compute(data, w, 1, 0, 1, 'int32') zeros = topi.full((2, 3, 1, 3), 'int32', tvm.const(0, dtype='int32')) gt = topi.greater_equal(conv1, zeros) one = topi.full((2, 3, 1, 3), 'int32', tvm.const(1, dtype='int32')) @@ -853,7 +867,7 @@ def check_device(device): where = topi.where(gt, one, two) add = topi.add(conv1, where) outs = [add] - s = topi.generic.schedule_conv2d_nchw(outs) + s = conv2d_schedule(outs) tvm.build(s, [data, w, add], target=backend) for backend in get_all_backend(): diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py index 875b2f780bef..003748719a0e 100644 --- a/topi/tests/python/test_topi_upsampling.py +++ b/topi/tests/python/test_topi_upsampling.py @@ -64,7 +64,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) @@ -147,7 +147,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_injective(B) + s = topi.testing.get_injective_schedule(device)(B) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), ctx) f = tvm.build(s, [A, B], device) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 85e4180a0892..7d27b8221a60 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -26,6 +26,41 @@ from topi.util import get_const_tuple from topi.vision import ssd, non_max_suppression, get_valid_counts +_get_valid_counts_implement = { + "generic": (topi.vision.get_valid_counts, topi.generic.schedule_get_valid_counts), + "gpu": (topi.cuda.get_valid_counts, topi.cuda.schedule_get_valid_counts), +} + +_nms_implement = { + "generic": (topi.vision.non_max_suppression, topi.generic.schedule_nms), + "gpu": (topi.cuda.non_max_suppression, topi.cuda.schedule_nms), +} + +_multibox_prior_implement = { + "generic": (topi.vision.ssd.multibox_prior, topi.generic.schedule_multibox_prior), + "gpu": (topi.cuda.multibox_prior, topi.cuda.schedule_multibox_prior), +} + +_multibox_detection_implement = { + "generic": (topi.vision.ssd.multibox_detection, topi.generic.schedule_multibox_detection), + "gpu": (topi.cuda.multibox_detection, topi.cuda.schedule_multibox_detection), +} + +_roi_align_implement = { + "generic": 
(topi.vision.roi_align_nchw, topi.generic.schedule_roi_align), + "cpu": (topi.x86.roi_align_nchw, topi.generic.schedule_roi_align), + "gpu": (topi.vision.roi_align_nchw, topi.cuda.schedule_roi_align), +} + +_roi_pool_schedule = { + "generic": topi.generic.schedule_roi_pool, + "gpu": topi.cuda.schedule_roi_pool, +} + +_proposal_implement = { + "generic": (topi.vision.rcnn.proposal, topi.generic.schedule_proposal), + "gpu": (topi.cuda.proposal, topi.cuda.schedule_proposal), +} def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): dtype = "float32" @@ -54,9 +89,10 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): + fcompute, fschedule = topi.testing.dispatch(device, _get_valid_counts_implement) data = tvm.placeholder(dshape, name="data", dtype=dtype) - outs = get_valid_counts(data, score_threshold, id_index, score_index) - s = topi.generic.schedule_get_valid_counts(outs) + outs = fcompute(data, score_threshold, id_index, score_index) + s = fschedule(outs) tvm_input_data = tvm.nd.array(np_data, ctx) tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) @@ -95,20 +131,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - if device == 'llvm': - out = non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index, - return_indices=False) - indices_out = non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index) - else: - out = topi.cuda.non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index, - return_indices=False) - indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, iou_threshold, force_suppress, top_k, - coord_start=coord_start, score_index=score_index, id_index=id_index) - s = topi.generic.schedule_nms(out) - indices_s = topi.generic.schedule_nms(indices_out) + fcompute, fschedule = topi.testing.dispatch(device, _nms_implement) + out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, + coord_start=coord_start, score_index=score_index, id_index=id_index, + return_indices=False) + indices_out = fcompute(data, valid_count, -1, iou_threshold, force_suppress, top_k, + coord_start=coord_start, score_index=score_index, id_index=id_index) + s = fschedule(out) + indices_s = fschedule(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) @@ -193,12 +223,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + + fcompute, fschedule = topi.testing.dispatch(device, _multibox_prior_implement) with tvm.target.create(device): - if device == 'llvm': - out = ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) - else: - out = topi.cuda.ssd.multibox_prior(data, sizes, ratios, steps, offsets, clip) - s = topi.generic.schedule_multibox_prior(out) + out = fcompute(data, sizes, ratios, steps, offsets, clip) + s = fschedule(out) tvm_input_data = tvm.nd.array(input_data, ctx) tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx) @@ -239,12 +268,11 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) + + fcompute, fschedule = topi.testing.dispatch(device, 
_multibox_detection_implement) with tvm.target.create(device): - if device == 'llvm': - out = ssd.multibox_detection(cls_prob, loc_preds, anchors) - else: - out = topi.cuda.ssd.multibox_detection(cls_prob, loc_preds, anchors) - s = topi.generic.schedule_multibox_detection(out) + out = fcompute(cls_prob, loc_preds, anchors) + s = fschedule(out) tvm_cls_prob = tvm.nd.array(np_cls_prob.astype(cls_prob.dtype), ctx) tvm_loc_preds = tvm.nd.array(np_loc_preds.astype(loc_preds.dtype), ctx) @@ -286,10 +314,11 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): - b = topi.vision.rcnn.roi_align_nchw(a, rois, pooled_size=pooled_size, - spatial_scale=spatial_scale, - sample_ratio=sample_ratio) - s = topi.generic.schedule_roi_align(b) + fcompute, fschedule = topi.testing.dispatch(device, _roi_align_implement) + b = fcompute(a, rois, pooled_size=pooled_size, + spatial_scale=spatial_scale, + sample_ratio=sample_ratio) + s = fschedule(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) @@ -338,7 +367,8 @@ def check_device(device): with tvm.target.create(device): b = topi.vision.rcnn.roi_pool_nchw(a, rois, pooled_size=pooled_size, spatial_scale=spatial_scale) - s = topi.generic.schedule_roi_pool(b) + s_func = topi.testing.dispatch(device, _roi_pool_schedule) + s = s_func(b) tvm_a = tvm.nd.array(a_np, ctx) tvm_rois = tvm.nd.array(rois_np, ctx) @@ -368,8 +398,9 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.vision.proposal(cls_prob, bbox_pred, im_info, **attrs) - s = topi.generic.schedule_proposal(out) + fcompute, fschedule = topi.testing.dispatch(device, _proposal_implement) + out = fcompute(cls_prob, bbox_pred, im_info, **attrs) + s = fschedule(out) f = tvm.build(s, [cls_prob, bbox_pred, im_info, out], device) tvm_cls_prob = tvm.nd.array(np_cls_prob, ctx=ctx) tvm_bbox_pred = tvm.nd.array(np_bbox_pred, ctx=ctx) @@ -428,4 +459,5 @@ def test_proposal(): test_multibox_prior() test_multibox_detection() test_roi_align() + test_roi_pool() test_proposal() diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 09b56045edaf..0e26dcb97412 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -78,7 +78,7 @@ # can be very large (at the level of 10^9 for some input shapes) # -@autotvm.template +@autotvm.register_customized_task("tutorial/conv2d_no_batching") def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): assert N == 1, "Only consider batch_size = 1 in this template" @@ -180,7 +180,7 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): # the last layer in resnet N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -task = autotvm.task.create(conv2d_no_batching, +task = autotvm.task.create("tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target='cuda') print(task.config_space) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 4cbdf52163d6..9aba93798617 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -248,30 +248,7 @@ def tune_tasks(tasks, n_trial=1000, early_stopping=None, log_filename='tuning.log', - use_transfer_learning=True, - try_winograd=True, - try_spatial_pack_depthwise=False): - if try_winograd: - for i in range(len(tasks)): - try: # try winograd template - tsk = autotvm.task.create(tasks[i].name, 
tasks[i].args, - tasks[i].target, tasks[i].target_host, 'winograd') - input_channel = tsk.workload[1][1] - if input_channel >= 64: - tasks[i] = tsk - except Exception: - pass - - # if we want to use spatial pack for depthwise convolution - if try_spatial_pack_depthwise: - tuner = 'xgb_knob' - for i in range(len(tasks)): - if tasks[i].name == 'topi_nn_depthwise_conv2d_nchw': - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, - 'contrib_spatial_pack') - tasks[i] = tsk - + use_transfer_learning=True): # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): @@ -322,7 +299,7 @@ def tune_and_evaluate(tuning_opt): mod, params, input_shape, _ = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params, - ops=(relay.op.nn.conv2d,)) + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks print("Tuning...") diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 72fc2bed3d0e..58c8751b73b9 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -163,19 +163,7 @@ def tune_tasks(tasks, n_trial=1000, early_stopping=None, log_filename='tuning.log', - use_transfer_learning=True, - try_winograd=True): - if try_winograd: - for i in range(len(tasks)): - try: # try winograd template - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, 'winograd') - input_channel = tsk.workload[1][1] - if input_channel >= 64: - tasks[i] = tsk - except Exception: - pass - + use_transfer_learning=True): # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): @@ -223,7 +211,8 @@ def tune_and_evaluate(tuning_opt): print("Extract tasks...") mod, params, input_shape, out_shape = get_network(network, batch_size=1) tasks = autotvm.task.extract_from_program(mod["main"], target=target, - params=params, ops=(relay.op.nn.conv2d,)) + params=params, + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks print("Tuning...") diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 3c56524078c2..5425f1b15715 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -247,17 +247,7 @@ def tune_tasks(tasks, n_trial=1000, early_stopping=None, log_filename='tuning.log', - use_transfer_learning=True, - try_winograd=True): - if try_winograd: - for i in range(len(tasks)): - try: # try winograd template - tsk = autotvm.task.create(tasks[i].name, tasks[i].args, - tasks[i].target, tasks[i].target_host, 'winograd') - tasks.append(tsk) - except Exception: - pass - + use_transfer_learning=True): # create tmp log file tmp_log_file = log_filename + ".tmp" if os.path.exists(tmp_log_file): @@ -307,7 +297,8 @@ def tune_and_evaluate(tuning_opt): tasks = autotvm.task.extract_from_program(mod["main"], target=target, target_host=target_host, - params=params, ops=(relay.op.nn.conv2d,)) + params=params, + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks print("Tuning...") diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 5e26f5858bbc..87d07f9870b2 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -132,22 +132,9 @@ def tune_kernels(tasks, early_stopping=None, log_filename='tuning.log'): - for i, tsk in enumerate(tasks): + for i, task in enumerate(tasks): prefix = "[Task %2d/%2d] " % 
(i+1, len(tasks)) - # converting conv2d tasks to conv2d_NCHWc tasks - op_name = tsk.workload[0] - if op_name == 'conv2d': - func_create = 'topi_x86_conv2d_NCHWc' - elif op_name == 'depthwise_conv2d_nchw': - func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw' - else: - raise ValueError("Tuning {} is not supported on x86".format(op_name)) - - task = autotvm.task.create(func_create, args=tsk.args, - target=target, template_key='direct') - task.workload = tsk.workload - # create tuner if tuner == 'xgb' or tuner == 'xgb-rank': tuner_obj = XGBTuner(task, loss_type='rank') @@ -173,7 +160,7 @@ def tune_kernels(tasks, # Use graph tuner to achieve graph level optimal schedules # Set use_DP=False if it takes too long to finish. def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True): - target_op = [relay.nn.conv2d] + target_op = [relay.op.get("nn.conv2d"),] Tuner = DPTuner if use_DP else PBQPTuner executor = Tuner(graph, {input_name: dshape}, records, target_op, target) executor.benchmark_layout_transform(min_exec_num=2000) @@ -189,10 +176,10 @@ def tune_and_evaluate(tuning_opt): print("Extract tasks...") mod, params, data_shape, out_shape = get_network(model_name, batch_size) tasks = autotvm.task.extract_from_program(mod["main"], target=target, - params=params, ops=(relay.op.nn.conv2d,)) + params=params, + ops=(relay.op.get("nn.conv2d"),)) # run tuning tasks - print("Tuning...") tune_kernels(tasks, **tuning_opt) tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index b6ad7e94f883..8efeed487b43 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -102,7 +102,7 @@ def matmul_v0(N, L, M, dtype): # In autotvm, we can define a tunable parameter, or a "knob" for such kind of value. # Matmul V1: List candidate values -@autotvm.template # 1. use a decorator +@autotvm.register_customized_task("tutorial/matmul_v1") # 1. use a decorator def matmul_v1(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -182,7 +182,7 @@ def matmul_v1(N, L, M, dtype): # When the high level API cannot meet your requirement, you can always fall # back to use low level API. -@autotvm.template +@autotvm.register_customized_task("tutorial/matmul") def matmul(N, L, M, dtype): A = tvm.placeholder((N, L), name='A', dtype=dtype) B = tvm.placeholder((L, M), name='B', dtype=dtype) @@ -272,7 +272,7 @@ def matmul(N, L, M, dtype): # In this case, for a 512x512 square matrix multiplication, the space size # is 10x10=100 N, L, M = 512, 512, 512 -task = autotvm.task.create(matmul, args=(N, L, M, 'float32'), target='llvm') +task = autotvm.task.create("tutorial/matmul", args=(N, L, M, 'float32'), target='llvm') print(task.config_space) ################################################################ diff --git a/tutorials/dev/relay_pass_infra.py b/tutorials/dev/relay_pass_infra.py index d27e236a2572..494593eeb5a1 100644 --- a/tutorials/dev/relay_pass_infra.py +++ b/tutorials/dev/relay_pass_infra.py @@ -78,7 +78,7 @@ def example(): # the scope of this tutorial. 
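 # alter_conv2d below receives the op attrs, the call inputs, the input
 # tensor types (tinfos), and the inferred output type of the call being
 # rewritten.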
@relay.op.register_alter_op_layout("nn.conv2d", level=101) -def alter_conv2d(attrs, inputs, tinfos): +def alter_conv2d(attrs, inputs, tinfos, out_type): data, weight = inputs new_attrs = dict(attrs) new_attrs['data_layout'] = 'NCHW16c' @@ -245,10 +245,10 @@ def visit_const(self, c): f = example() mod = tvm.IRModule.from_expr(f) seq = relay.transform.Sequential([relay.transform.FoldConstant(), - relay.transform.PrintIR(), + relay.transform.PrintIR(False), relay.transform.EliminateCommonSubexpr(), relay.transform.FuseOps(), - relay.transform.PrintIR()]) + relay.transform.PrintIR(False)]) with relay.build_config(opt_level=3): mod = seq(mod) diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py index f7cdae227b75..a4658eba2bee 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -94,7 +94,7 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'): # # We use AutoTVM to search for best configurations in this schedule. -@autotvm.template +@autotvm.register_customized_task("tutorial/test_gemm") def test_gemm(N, L, M, dtype, layout): if (layout == "NN"): shape_a = (N, L) @@ -264,7 +264,8 @@ def test_gemm(N, L, M, dtype, layout): assert(major == 7 and minor == 5 and layout == 'TN') def tune_and_evaluate(M, N, L, dtype, layout): - task = autotvm.task.create(test_gemm, args=(N, L, M, dtype, layout), target='cuda') + task = autotvm.task.create("tutorial/test_gemm", args=(N, L, M, dtype, layout), + target='cuda') print(task.config_space) logging.getLogger('autotvm').setLevel(logging.DEBUG) diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py index 390085ea70b5..2e049828e5cc 100644 --- a/tutorials/topi/intro_topi.py +++ b/tutorials/topi/intro_topi.py @@ -85,7 +85,7 @@ f = e / 2.0 g = topi.sum(f) with tvm.target.cuda(): - sg = topi.generic.schedule_reduce(g) + sg = topi.cuda.schedule_reduce(g) print(tvm.lower(sg, [a, b], simple_mode=True)) ###################################################################### @@ -113,7 +113,7 @@ tarray = tvm.placeholder((512, 512), name="tarray") softmax_topi = topi.nn.softmax(tarray) with tvm.target.create("cuda"): - sst = topi.generic.schedule_softmax(softmax_topi) + sst = topi.cuda.schedule_softmax(softmax_topi) print(tvm.lower(sst, [tarray], simple_mode=True)) ###################################################################### @@ -133,9 +133,9 @@ kernel = tvm.placeholder((10, 3, 5, 5)) with tvm.target.create("cuda"): - conv = topi.nn.conv2d(data, kernel, strides=1, padding=2, dilation=1) + conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1) out = topi.nn.relu(conv) - sconv = topi.generic.nn.schedule_conv2d_nchw([out]) + sconv = topi.cuda.schedule_conv2d_nchw([out]) print(tvm.lower(sconv, [data, kernel], simple_mode=True)) ###################################################################### diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 36d8e4198a40..0c9b2eac2df7 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -662,8 +662,12 @@ def _do_fold(op): 0, 0, 0, 0, 0)) inner = irb.get() - args = op.body.body.args - res_tensor = op.body.body.func.output(0) + # TODO(@tmoreau89): This is only a temporary fix, please take a look. 
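+            # the body may be wrapped in one or more IfThenElse nodes;
+            # walk down their then_case branches to reach the innermost
+            # statement whose args and output tensor are extracted below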
+ body = op.body.body + while isinstance(body, tvm.stmt.IfThenElse): + body = body.then_case + args = body.args + res_tensor = body.func.output(0) tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) inner = tvm.tir.AttrStmt( [dout, res_tensor], 'buffer_bind_scope', diff --git a/vta/python/vta/top/__init__.py b/vta/python/vta/top/__init__.py index 7fdf27f8e01a..6f62aff469d4 100644 --- a/vta/python/vta/top/__init__.py +++ b/vta/python/vta/top/__init__.py @@ -20,8 +20,8 @@ from . import bitpack from .graphpack import graph_pack from . import op -from . import vta_conv2d -from . import vta_conv2d_transpose -from . import vta_group_conv2d -from . import vta_dense +from .vta_conv2d import conv2d_packed, schedule_conv2d_packed +from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed +from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed +from .vta_dense import dense_packed, schedule_dense_packed from . import util diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index d4748faad6a7..6e9d57bc0001 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -22,9 +22,8 @@ import tvm from topi import util -from tvm.relay.op.op import register_compute, register_schedule +from tvm.relay.op.op import register_compute, register_injective_schedule from tvm.relay.op.op import register_pattern, OpPattern -from tvm.relay.op.op import schedule_injective def bitpack(data, bits, pack_type="int8", name="bitpack"): """Packs lowest dimension into format needed by VTA @@ -86,5 +85,5 @@ def compute_bitpack(attrs, inputs): bits = 8 // lanes return bitpack(inputs[0], bits, dtype) -register_schedule("bitpack", schedule_injective) +register_injective_schedule("bitpack") register_pattern("bitpack", OpPattern.INJECTIVE) diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index bf6409cc9405..04e14b1e2bdd 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -22,19 +22,22 @@ import topi from tvm.relay.op import op as reg -from tvm.relay.op.op import OpPattern -from tvm.relay.op.nn import _nn +from tvm.relay.op import strategy as _strategy +from tvm.relay.op.op import OpPattern, OpStrategy from .util import is_packed_layout +from .vta_conv2d import conv2d_packed, schedule_conv2d_packed +from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed +from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed +from .vta_dense import dense_packed, schedule_dense_packed from ..environment import get_env # override to force partition at copy reg.register_pattern("copy", OpPattern.INJECTIVE, level=15) - -@reg.register_compute("clip", level=15) -def compute_clip(attrs, inputs, output_type, target): +# add clip vta strategy +def compute_clip_vta(attrs, inputs, output_type): """ Clip operator. 
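    Clips each element of x to the range [a_min, a_max], implemented as an
    elementwise minimum with a_max followed by an elementwise maximum with
    a_min.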
""" x = inputs[0] a_min = attrs.a_min @@ -48,139 +51,79 @@ def compute_clip(attrs, inputs, output_type, target): x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") return [x] - -@reg.register_compute("nn.conv2d", level=15) -def compute_conv2d(attrs, inputs, output_type, target): - """ Compute definition of conv2d """ - padding = topi.util.get_const_tuple(attrs.padding) - strides = topi.util.get_const_tuple(attrs.strides) - dilation = tuple([int(d) for d in attrs.dilation]) +def clip_strategy_vta(attrs, inputs, out_type, target): + strategy = OpStrategy() + strategy.add_implementation( + compute_clip_vta, + _strategy.wrap_topi_schedule(topi.generic.schedule_injective), + name="clip.vta") + return strategy + +reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta") + +@_strategy.conv2d_strategy.register("vta") +def conv2d_strategy_vta(attrs, inputs, out_type, target): + """conv2d vta strategy""" + strategy = OpStrategy() + kernel = inputs[1] + dilation = topi.util.get_const_tuple(attrs.dilation) groups = attrs.groups layout = attrs.data_layout - out_dtype = attrs.out_dtype - - if target.device_name == "vta": - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - if groups == 1: - assert groups == 1 - env = get_env() - assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" - assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" - inputs = list(inputs) - assert inputs[1].dtype == "int8" - return [topi.nn.conv2d(inputs[0], - inputs[1], - strides, - padding, - dilation, - layout, - out_dtype)] - return [topi.nn.group_conv2d_nchw(inputs[0], - inputs[1], - strides, - padding, - dilation, - groups, - out_dtype)] - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.compute_conv2d(attrs, inputs, output_type, target) - - # If VTA is not the target, default to _nn def - return _nn.compute_conv2d(attrs, inputs, output_type, target) - - -@reg.register_schedule("nn.conv2d", level=15) -def schedule_conv2d(attrs, outs, target): - """ Schedule definition of conv2d """ - groups = attrs.groups - layout = attrs.data_layout - - if target.device_name == "vta": - if is_packed_layout(layout): - target = tvm.target.create(target) - assert target.device_name == "vta" - if groups == 1: - return topi.generic.schedule_conv2d_nchw(outs) - return topi.generic.schedule_group_conv2d_nchw(outs) - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.schedule_conv2d(attrs, outs, tvm.target.Target.current()) - - # If VTA is not the target, default to _nn def - return _nn.schedule_conv2d(attrs, outs, target) - - -@reg.register_compute("nn.conv2d_transpose", level=15) -def compute_conv2d_transpose(attrs, inputs, output_type, target): - """ 2D convolution algorithm. 
- """ - padding = topi.util.get_const_tuple(attrs.padding) - strides = topi.util.get_const_tuple(attrs.strides) - dilation = tuple([int(d) for d in attrs.dilation]) - layout = attrs.data_layout - out_dtype = attrs.out_dtype - - if target.device_name == "vta": - assert dilation == (1, 1), "support for dilation limited to (1, 1)" - if is_packed_layout(layout): - return [topi.nn.conv2d_transpose_nchw( - inputs[0], inputs[1], strides, padding, out_dtype)] - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.compute_conv2d_transpose(attrs, inputs, output_type, target) - - # If VTA is not the target, default to _nn def - return _nn.compute_conv2d_transpose(attrs, inputs, output_type, target) - -@reg.register_schedule("nn.conv2d_transpose", level=15) -def schedule_conv2d_transpose(attrs, outputs, target): - """ 2D convolution schedule. - """ + assert dilation == (1, 1), "support for dilation limited to (1, 1)" + if is_packed_layout(layout): + if groups == 1: + env = get_env() + assert env.LOG_INP_WIDTH == 3, "only support 8bit inp for now" + assert env.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now" + assert kernel.dtype == "int8" + + strategy.add_implementation( + _strategy.wrap_compute_conv2d(conv2d_packed, True), + _strategy.wrap_topi_schedule(schedule_conv2d_packed), + name="conv2d_packed.vta") + else: # group_conv2d + strategy.add_implementation( + _strategy.wrap_compute_conv2d(group_conv2d_packed, has_groups=True), + _strategy.wrap_topi_schedule(schedule_group_conv2d_packed), + name="group_conv2d_packed.vta") + return strategy + + # If it's not packed, run on ARM CPU + arm_tgt = tvm.target.arm_cpu(target.model) + return _strategy.arm_cpu.conv2d_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt) + + +@_strategy.conv2d_transpose_strategy.register("vta") +def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target): + """conv2d_transpose vta strategy""" + dilation = topi.util.get_const_tuple(attrs.dilation) layout = attrs.data_layout - - if target.device_name == "vta": - if is_packed_layout(layout): - return topi.nn.schedule_conv2d_transpose_nchw(outputs) - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.schedule_conv2d_transpose(attrs, outputs, tvm.target.Target.current()) - - # If VTA is not the target, default to _nn def - return _nn.schedule_conv2d_transpose(attrs, outputs, tvm.target.Target.current()) - - -@reg.register_compute("nn.dense", level=15) -def compute_dense(attrs, inputs, out_type, target): - """Compute definition of dense""" - out_dtype = attrs.out_dtype - out_dtype = inputs[0].dtype if out_dtype == "" else out_dtype - - if target.device_name == "vta": - if inputs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - return [topi.nn.dense(inputs[0], inputs[1], None, out_dtype)] - # If it's not packed, run on ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.compute_dense(attrs, inputs, out_type, target) - - # If VTA is not the target, default to _nn def - return _nn.compute_dense(attrs, inputs, out_type, target) - - -@reg.register_schedule("nn.dense", level=15) -def schedule_dense(attrs, outs, target): - """Schedule definition of dense""" - if target.device_name == "vta": - if outs[0].shape == 4: # this implies the layout is packed - target = tvm.target.create(target) - assert target.device_name == "vta" - return topi.generic.schedule_dense(outs) - # If it's not packed, run on 
ARM CPU - with tvm.target.arm_cpu(tvm.target.Target.current().model): - return _nn.schedule_dense(attrs, outs, tvm.target.Target.current()) - - # If VTA is not the target, default to _nn def - return _nn.schedule_dense(attrs, outs, target) + assert dilation == (1, 1), "support for dilation limited to (1, 1)" + + if is_packed_layout(layout): + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_compute_conv2d_transpose(conv2d_transpose_packed), + _strategy.wrap_topi_schedule(schedule_conv2d_transpose_packed), + name="conv2d_transpose_packed.vta") + return strategy + + # If it's not packed, run on ARM CPU + arm_tgt = tvm.target.arm_cpu(target.model) + return _strategy.arm_cpu.conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt) + + +@_strategy.dense_strategy.register("vta") +def dense_strategy_vta(attrs, inputs, out_type, target): + """dense vta strategy""" + if inputs[0].shape == 4: # this implies the layout is packed + strategy = OpStrategy() + strategy.add_implementation( + _strategy.wrap_compute_dense(dense_packed), + _strategy.wrap_topi_schedule(schedule_dense_packed), + name="dense_packed.vta") + return strategy + # If it's not packed, run on ARM CPU + arm_tgt = tvm.target.arm_cpu(target.model) + return _strategy.x86.dense_strategy_cpu(attrs, inputs, out_type, arm_tgt) diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index e15f6c1d9ecc..ba93b05ca232 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -25,15 +25,8 @@ from .util import is_packed_layout from ..environment import get_env -@autotvm.register_topi_compute(topi.nn.conv2d, 'vta', 'direct') -def _declaration_conv2d(cfg, - data, - kernel, - strides, - padding, - dilation, - layout, - out_dtype): +@autotvm.register_topi_compute("conv2d_packed.vta") +def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): """ Packed conv2d function.""" if not is_packed_layout(layout): raise topi.InvalidShapeError() @@ -69,8 +62,9 @@ def _declaration_conv2d(cfg, return res -@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_nchw, 'vta', 'direct') -def _schedule_conv2d(cfg, outs): +@autotvm.register_topi_schedule("conv2d_packed.vta") +def schedule_conv2d_packed(cfg, outs): + """Schedule packed conv2d""" assert len(outs) == 1 output = outs[0] const_ops = [] diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index a2750dc9081d..a3fd7ac92cd3 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -26,13 +26,9 @@ from ..environment import get_env -@autotvm.register_topi_compute(topi.nn.conv2d_transpose_nchw, 'vta', 'direct') -def _declatation_conv2d_transpose(cfg, - data, - kernel, - strides, - padding, - out_dtype): +@autotvm.register_topi_compute("conv2d_transpose_packed.vta") +def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype): + """Packed conv2d_transpose compute""" ishape = get_const_tuple(data.shape) kshape = get_const_tuple(kernel.shape) b, c_i, i_h, i_w, t_b, t_ci = ishape @@ -75,8 +71,9 @@ def _declatation_conv2d_transpose(cfg, return out -@autotvm.register_topi_schedule(topi.generic.schedule_conv2d_transpose_nchw, 'vta', 'direct') -def _schedule_conv2d_transpose(cfg, outs): +@autotvm.register_topi_schedule("conv2d_transpose_packed.vta") +def schedule_conv2d_transpose_packed(cfg, outs): + """Schedule packed conv2d_transpose""" assert len(outs) == 1 output = outs[0] ewise_inputs = 
[] diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index 9d6c19c5af20..e23910447ba8 100644 --- a/vta/python/vta/top/vta_dense.py +++ b/vta/python/vta/top/vta_dense.py @@ -32,12 +32,8 @@ def is_packed_layout(layout): return True return False -@autotvm.register_topi_compute(topi.nn.dense, 'vta', 'direct') -def _declaration_dense(cfg, - data, - weight, - bias=None, - out_dtype=None): +@autotvm.register_topi_compute("dense_packed.vta") +def dense_packed(cfg, data, weight, bias=None, out_dtype=None): """Dense function declaration.""" # Make sure that the dense operator is packed @@ -67,8 +63,8 @@ def _declaration_dense(cfg, return res -@autotvm.register_topi_schedule(topi.generic.schedule_dense, 'vta', 'direct') -def _schedule_dense(cfg, outs): +@autotvm.register_topi_schedule("dense_packed.vta") +def schedule_dense_packed(cfg, outs): """Packed dense schedule.""" assert len(outs) == 1 diff --git a/vta/python/vta/top/vta_group_conv2d.py b/vta/python/vta/top/vta_group_conv2d.py index e54637f2c204..aa06c61c3ec0 100644 --- a/vta/python/vta/top/vta_group_conv2d.py +++ b/vta/python/vta/top/vta_group_conv2d.py @@ -24,8 +24,8 @@ from ..environment import get_env -@autotvm.register_topi_compute(topi.nn.group_conv2d_nchw, 'vta', 'direct') -def packed_group_conv2d(cfg, +@autotvm.register_topi_compute("group_conv2d_packed.vta") +def group_conv2d_packed(cfg, data, kernel, strides, @@ -74,8 +74,8 @@ def packed_group_conv2d(cfg, return out -@autotvm.register_topi_schedule(topi.generic.schedule_group_conv2d_nchw, 'vta', 'direct') -def schedule_packed_group_conv2d(cfg, outs): +@autotvm.register_topi_schedule("group_conv2d_packed.vta") +def schedule_group_conv2d_packed(cfg, outs): """ Schedule the packed conv2d. """ assert len(outs) == 1 diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index b9edc30e5ba3..cf6f42654e6e 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -246,7 +246,7 @@ def tune_tasks(tasks, print("Extracting tasks...") tasks = extract_from_program(func=relay_prog, params=params, - ops=(tvm.relay.op.nn.conv2d,), + ops=(relay.op.get("nn.conv2d"),), target=target, target_host=env.target_host) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index af71561972a1..6935e4794c4e 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -20,10 +20,12 @@ import json import os +import pytest import numpy as np from collections import namedtuple import tvm +from tvm import relay from tvm import autotvm from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize @@ -79,9 +81,13 @@ def run_conv2d(env, remote, wl, target, if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" + conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack + conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + conv2d_fcompute = vta.top.conv2d_packed + conv2d_fschedule = vta.top.schedule_conv2d_packed # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) @@ -101,18 +107,24 @@ def run_conv2d(env, remote, wl, target, data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + 
padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule with target: - res = topi.nn.conv2d( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), - layout, env.acc_dtype) + if data_pack: + res = conv2d_fcompute( + data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), + layout, env.acc_dtype) + else: + res = conv2d_fcompute( + data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), + env.acc_dtype) res = topi.right_shift(res, 8) res = topi.add(res, bias) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_conv2d_nchw([res]) + s = conv2d_fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) @@ -222,7 +234,8 @@ def get_ref_data(): return correct, cost, stats -def test_conv2d(device="vta"): +@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) +def test_conv2d(device): def _run(env, remote): if device == "vta": target = env.target diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index d729fa517692..2d96a7313480 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -20,10 +20,12 @@ import json import os +import pytest import numpy as np from collections import namedtuple import tvm +from tvm import relay from tvm import autotvm from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize @@ -80,14 +82,18 @@ def run_conv2d_transpose(env, remote, wl, target, if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" + fcompute = topi.arm_cpu.conv2d_transpose_nchw + fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + fcompute = vta.top.conv2d_transpose_packed + fschedule = vta.top.schedule_conv2d_transpose_packed # Derive shapes depending upon packing a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) - w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) + w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel) if data_pack: data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, wl.height, wl.width, env.BATCH, env.BLOCK_IN) @@ -98,16 +104,17 @@ def run_conv2d_transpose(env, remote, wl, target, kernel_shape = w_shape data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule with target: - res = topi.nn.conv2d_transpose_nchw( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), env.acc_dtype) + res = fcompute( + data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype) res = topi.right_shift(res, env.WGT_WIDTH) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_conv2d_transpose_nchw([res]) + s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) @@ -210,7 +217,8 @@ def get_ref_data(): return correct, cost, stats -def test_conv2d_transpose(device="vta"): +@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) +def test_conv2d_transpose(device): def _run(env, remote): if device == "vta": target = env.target @@ -227,5 +235,5 @@ def _run(env, remote): 
vta.testing.run(_run) if __name__ == "__main__": - # test_conv2d_transpose(device="arm_cpu") + test_conv2d_transpose(device="arm_cpu") test_conv2d_transpose(device="vta") diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index b0ee2f5f7792..a0acdc34acef 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -63,21 +63,25 @@ def run_gemm(env, remote, target, env.BATCH, env.BLOCK_IN) kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) + fcompute = vta.top.dense_packed + fschedule = vta.top.schedule_dense_packed else: data_shape = a_shape kernel_shape = w_shape + fcompute = topi.x86.dense_nopack + fschedule = topi.x86.schedule_dense_nopack data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) # Define base computation schedule with target: - res = topi.nn.dense( - data, kernel, out_dtype=env.acc_dtype) + res = fcompute( + data, kernel, None, env.acc_dtype) res = topi.right_shift(res, 8) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_dense([res]) + s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, res], simple_mode=True)) diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index 7bba2449cea5..31fef4923328 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -20,10 +20,12 @@ import json import os +import pytest import numpy as np from collections import namedtuple import tvm +from tvm import relay from tvm import autotvm from tvm.contrib import util import topi @@ -75,9 +77,13 @@ def run_group_conv2d(env, remote, wl, target, if "arm_cpu" in target.keys: data_pack = False layout = "NCHW" + fcompute = topi.nn.group_conv2d_nchw + fschedule = topi.generic.schedule_group_conv2d_nchw elif "vta" in target.keys: data_pack = True layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) + fcompute = vta.top.group_conv2d_packed + fschedule = vta.top.schedule_group_conv2d_packed # Derive shapes depending upon packing CI_G = wl.in_filter // wl.groups @@ -98,17 +104,19 @@ def run_group_conv2d(env, remote, wl, target, data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) + # Define base computation schedule with target: - res = topi.nn.group_conv2d_nchw( - data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1), + res = fcompute( + data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), wl.groups, env.acc_dtype) res = topi.right_shift(res, 8) res = topi.add(res, bias) res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) res = topi.cast(res, env.out_dtype) # Derive base schedule - s = topi.generic.schedule_group_conv2d_nchw([res]) + s = fschedule([res]) if print_ir: print(vta.lower(s, [data, kernel, bias, res], simple_mode=True)) @@ -219,7 +227,8 @@ def get_ref_data(): return correct, cost, stats -def test_conv2d(device="vta"): +@pytest.mark.parametrize("device", ["vta", "arm_cpu"]) +def test_conv2d(device): 
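+    # parametrizing over the device runs this test for both the packed
+    # VTA path and the arm_cpu fallback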
    def _run(env, remote):
        if device == "vta":
            target = env.target
diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py
index 94fba3db2989..a20b8ec8d3d3 100644
--- a/vta/tutorials/autotvm/tune_relay_vta.py
+++ b/vta/tutorials/autotvm/tune_relay_vta.py
@@ -295,7 +295,7 @@ def tune_tasks(tasks,


 def register_vta_tuning_tasks():
-    from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args
+    from tvm.autotvm.task import TaskExtractEnv

     @tvm.tag_scope(tag=topi.tag.ELEMWISE)
     def my_clip(x, a_min, a_max):
@@ -309,20 +309,19 @@ def my_clip(x, a_min, a_max):
     # init autotvm env to register VTA operator
     TaskExtractEnv()

-    @autotvm.task.register("topi_nn_conv2d", override=True)
+    @autotvm.register_customized_task("conv2d_packed.vta")
     def _topi_nn_conv2d(*args, **kwargs):
         assert not kwargs, "Do not support kwargs in template function call"
-        args = deserialize_args(args)
         A, W = args[:2]

         with tvm.target.vta():
-            res = topi.nn.conv2d(*args, **kwargs)
+            res = vta.top.conv2d_packed(*args, **kwargs)
             res = topi.right_shift(res, 8)
             res = my_clip(res, 0, 127)
             res = topi.cast(res, "int8")

         if tvm.target.Target.current().device_name == 'vta':
-            s = topi.generic.schedule_conv2d_nchw([res])
+            s = vta.top.schedule_conv2d_packed([res])
         else:
             s = tvm.create_schedule([res.op])
         return s, [A, W, res]
@@ -356,10 +355,13 @@ def tune_and_evaluate(tuning_opt):
     mod = tvm.IRModule.from_expr(relay_prog)
     tasks = autotvm.task.extract_from_program(mod,
                                               params=params,
-                                              ops=(tvm.relay.op.nn.conv2d, ),
+                                              ops=(relay.op.get("nn.conv2d"),),
                                               target=target,
                                               target_host=env.target_host)

+    # filter out non-packed conv2d task
+    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))
+
     # We should have extracted 10 convolution tasks
     assert len(tasks) == 10
     print("Extracted {} conv2d tasks:".format(len(tasks)))

From 13cf1da3946c7be828b189276e3dd5a14b3e720d Mon Sep 17 00:00:00 2001
From: Samuel
Date: Tue, 25 Feb 2020 09:09:10 +0530
Subject: [PATCH 32/73] [FRONTEND][KERAS]GaussianDropout/Noise parsing support
 (#4928)

GaussianDropout & GaussianNoise are active only during training time. They
can be skipped during inference.
---
 python/tvm/relay/frontend/keras.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py
index caf41768ada4..adb28c466454 100644
--- a/python/tvm/relay/frontend/keras.py
+++ b/python/tvm/relay/frontend/keras.py
@@ -762,6 +762,8 @@ def _default_skip(inexpr, keras_layer, _): # pylint: disable=unused-argument
     'Dropout'                  : _default_skip,
     'SpatialDropout2D'         : _default_skip,
     'SpatialDropout1D'         : _default_skip,
+    'GaussianDropout'          : _default_skip,
+    'GaussianNoise'            : _default_skip,
 }


From 81d1124055ddbf46ee9c2daa89c03a294548c37e Mon Sep 17 00:00:00 2001
From: vizero1 <50483246+vizero1@users.noreply.github.com>
Date: Tue, 25 Feb 2020 04:55:27 +0100
Subject: [PATCH 33/73] Use opencv resize method for preprocessing of image in
 darknet (#4883)

* Use opencv resize method for preprocessing of image in darknet

* Use opencv resize method for preprocessing of image in darknet

* Fix pylint issues
---
 python/tvm/relay/testing/darknet.py | 62 ++++++++++-------------
 1 file changed, 21 insertions(+), 41 deletions(-)
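For reference, a minimal standalone sketch of the resize step this patch adopts
(the image path and the 416x416 network size are illustrative; note that
cv2.resize takes dsize in (width, height) order):

    import cv2

    img = cv2.imread("dog.jpg")          # HWC layout, BGR channel order
    h_in = w_in = 416                    # darknet network input resolution
    imh, imw = img.shape[:2]
    if (w_in / imw) < (h_in / imh):      # scale to fit, keeping aspect ratio
        new_w, new_h = w_in, imh * w_in // imw
    else:
        new_h, new_w = h_in, imw * h_in // imh
    resized = cv2.resize(img, dsize=(new_w, new_h), interpolation=cv2.INTER_CUBIC)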
diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py
index ab94ecd6d2a9..5ddbcb12b7bd 100644
--- a/python/tvm/relay/testing/darknet.py
+++ b/python/tvm/relay/testing/darknet.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name, unused-variable, unused-argument, no-init, unpacking-non-sequence
+# pylint: disable=invalid-name, unused-variable, unused-argument, no-init
 """
 Compile DarkNet Models
 ====================
 These are utility functions used for testing and tutorial file.
 """
 from __future__ import division
-import math
 import numpy as np
 from cffi import FFI
 import cv2

-def _resize_image(img, w_in, h_in):
-    """Resize the image to the given height and width."""
-    imc, imh, imw = img.shape
-    h_in = int(h_in)
-    w_in = int(w_in)
-    part = np.zeros((imc, imh, w_in))
-    resized = np.zeros((imc, h_in, w_in))
-    w_scale = (imw - 1) / (w_in - 1)
-    h_scale = (imh - 1) / (h_in - 1)
-    for k in range(imc):
-        for j in range(imh):
-            for c in range(w_in):
-                if c == w_in - 1 or imw == 1:
-                    part[k][j][c] = img[k][j][imw - 1]
-                else:
-                    fdx, idx = math.modf(c * w_scale)
-                    part[k][j][c] = (1 - fdx) * img[k][j][int(idx)] + \
-                                    fdx * img[k][j][int(idx) + 1]
-    for k in range(imc):
-        for j in range(h_in):
-            fdy, idy = math.modf(j * h_scale)
-            for c in range(w_in):
-                resized[k][j][c] = (1 - fdy)*part[k][int(idy)][c]
-            if (j == h_in - 1) or (imh == 1):
-                continue
-            for c in range(w_in):
-                resized[k][j][c] += fdy * part[k][int(idy) + 1][c]
-    return resized

-def load_image_color(test_image):
-    """To load the image using opencv api and do preprocessing."""
-    imagex = cv2.imread(test_image)
-    imagex = cv2.cvtColor(imagex, cv2.COLOR_BGR2RGB)
-    imagex = np.array(imagex)
+def convert_image(image):
+    """Convert the image with numpy."""
+    imagex = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    imagex = np.array(imagex)
     imagex = imagex.transpose((2, 0, 1))
     imagex = np.divide(imagex, 255.0)
     imagex = np.flip(imagex, 0)
     return imagex

+def load_image_color(test_image):
+    """To load the image using opencv api and do preprocessing."""
+    imagex = cv2.imread(test_image)
+    return convert_image(imagex)
+
 def _letterbox_image(img, w_in, h_in):
     """To get the image in boxed format."""
-    imc, imh, imw = img.shape
+    imh, imw, imc = img.shape
     if (w_in / imw) < (h_in / imh):
         new_w = w_in
         new_h = imh * w_in // imw
     else:
         new_h = h_in
         new_w = imw * h_in // imh
-    resized = _resize_image(img, new_w, new_h)
+    dim = (new_w, new_h)
+    # Default interpolation method is INTER_LINEAR
+    # Other methods are INTER_AREA, INTER_NEAREST, INTER_CUBIC and INTER_LANCZOS4
+    # For more information see:
+    # https://docs.opencv.org/2.4/modules/imgproc/doc/geometric_transformations.html#resize
+    resized = cv2.resize(src=img, dsize=dim, interpolation=cv2.INTER_CUBIC)
+    resized = convert_image(resized)
     boxed = np.full((imc, h_in, w_in), 0.5, dtype=float)
     _, resizedh, resizedw = resized.shape
     boxed[:, int((h_in - new_h) / 2)
@@ -84,7 +65,7 @@ def _letterbox_image(img, w_in, h_in):
           :int((w_in - new_w) / 2) + resizedw] = resized
     return boxed

-def load_image(image, resize_width, resize_height):
+def load_image(img, resize_width, resize_height):
     """Load the image and convert to the darknet model format.
     The image processing of darknet is different from normal.
Parameters @@ -103,9 +84,8 @@ def load_image(image, resize_width, resize_height): img : Float array Array of processed image """ - - img = load_image_color(image) - return _letterbox_image(img, resize_width, resize_height) + imagex = cv2.imread(img) + return _letterbox_image(imagex, resize_width, resize_height) class LAYERTYPE(object): """Darknet LAYERTYPE Class constant.""" From 87c20bb2b266a3df1c0228d4bf996c8cd5cf9c66 Mon Sep 17 00:00:00 2001 From: Alex Wong <11878166+alexwong@users.noreply.github.com> Date: Mon, 24 Feb 2020 20:14:45 -0800 Subject: [PATCH 34/73] [Relay] Add a PyTorch to Relay Parser (#4497) * Add a PyTorch to Relay parser * Add alexnet, googlenet, mnasnet, shufflenet wip * Fix lint * Remove fix for shufflenet * Lower check * Pull changes from neo-ai/tvm changes * Remove commented out section * Use infer_shape everywhere * Change back to using trace instead of path in from_pytorch * Parse state_dict to add param names * Umbrella single_op under test_forwards * Remove print and cleanup call * Check if update to test broke CI * Retrigger CI * Add back in updated tests * Try splitting up tests * First pass at flexible typing, implemented for ones * Add int32 for all ops * Remove print statements * Fix lint * Broad except * Add other tensor types * Temporarily use old tests * Retrigger CI * Lower type names * Use numpy to convert in dense op * Fix lint * Remove print * Need to cleanup but verify int32 works for add * Rough tests for different types, a lot of types are not supported on CPU * Probably doesn't build, need to save work as I have to switch branches (constantly) * Parse param type * Remove print stmt in parser * Clean up some code * Working on flaot32 for bn * Add resnet18 double type * Fix lint * Temporarily move PT tests first * Temporarily add back refactored tests to fix mem issue * Add more type test and temp remove some tests * Comment out tests, hopefully CI prints a trace * Get stack trace * Remove operator dict key, rename op_name to node_id, remove dead code * Make relay map a list * Remove some hacky string stuff * Move to PyTorch 1.4 * Remove input_type as param * Remove _get_fill_value, fix full ops * Remove unused code and combine ops for identity and none * Remove fn_param * Clean up main loop * Remove useless if/else for outputs * Remove ir_names, only used once * Remove some string hacking * Remove string parsing to get output name * Fix bug with output sizes of nodes * Use attributeNames in parse ops * Remove continue and add_op in parse_op * Do this everywhere, use assert instead of explciitly type casting * Remove unnecessary swap * Slight refactor for elemwise input parse * Use a copy of graph everywhere * Rename nid_to_node_name * Refactor parse import prereqs * Clean up input node kind check * Clean up conditionals * Clean up add_op * Cleanup type for ones and zeros op * Fix lint * Add torch install to CI * Actually use torch * Try moving import torch to only where it's needed * Import torch for CI * Use take op for select * Temporarily add ignore for jit inline pass for CI * Use CompleteTensorType, might be a PT 1.2 only thing * Use different types in elemwise op * Use float16 ones * Fix float16 test * Remove the temp docker changes * Remove temp test * Temporarily comment out original tests * Remove file * Empty cache after each test * Add some prints and lower input sizes * Try using no grad * Trying to globally set grad off * Use no grad for torchvision * Remove xfail tests * Remove VGG and AlexNet due to some issues * Combine pooling 
tests * Remove extra test file * Remove single op, remove larger pooling tests * Remove maxpool3 * Remove debug prints * Remove inference call and add no_grad in measure latency * Use standard string start char * Remove redundant infer_shape in slice * Convert most to checks to just expr * Remove extra paren * More refactor of isinstance * Add helper for creating typed constants * Assert instead of return when no matching type * Remove network variants * Add no_grad when forward, remove deatch, fix lint * Change isinstance to expr in transpose * Use opnotimplemented, refactor * Fix full ops, remove duplicate tests * Never use shape field unless we know the type * Remove comma, retrigger CI * Add paren, retrigger CI * Use inline if-else for flags * Throw exception instead of assert * Remove version check for CI * Check version when doing inline pass * Fix lint * Lower more input sizes * Add new line, conv2d only accepts weight as expr * Use tvm.runtime.ndarray * Remove change to torch version install * Try no grad for mobilenet * Fix lint * Fix lint again * Revert to last passing * Delete test files * Ignore lint * Revert back * Comment out mobilenet * Clean up compare compiled and baseline outputs * Use IRModule * Add todos * Refactor use_bias * Add todo for fix conv op channels * Change input to data type * Remove todo * Handle channel multiplier > 1 --- docs/api/python/relay/frontend.rst | 2 + python/tvm/relay/frontend/__init__.py | 1 + python/tvm/relay/frontend/pytorch.py | 1032 +++++++++++++++++ tests/python/frontend/pytorch/test_forward.py | 768 ++++++++++++ tests/scripts/task_python_frontend.sh | 3 + 5 files changed, 1806 insertions(+) create mode 100644 python/tvm/relay/frontend/pytorch.py create mode 100644 tests/python/frontend/pytorch/test_forward.py diff --git a/docs/api/python/relay/frontend.rst b/docs/api/python/relay/frontend.rst index 90da0a4d2808..4b4bcf0397a5 100644 --- a/docs/api/python/relay/frontend.rst +++ b/docs/api/python/relay/frontend.rst @@ -34,3 +34,5 @@ tvm.relay.frontend .. autofunction:: tvm.relay.frontend.from_caffe2 .. autofunction:: tvm.relay.frontend.from_tensorflow + +.. autofunction:: tvm.relay.frontend.from_pytorch diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py index 21115d07241c..fa258f48ac76 100644 --- a/python/tvm/relay/frontend/__init__.py +++ b/python/tvm/relay/frontend/__init__.py @@ -36,3 +36,4 @@ from .caffe2 import from_caffe2 from .tensorflow import from_tensorflow from .darknet import from_darknet +from .pytorch import from_pytorch diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py new file mode 100644 index 000000000000..af8715abaed3 --- /dev/null +++ b/python/tvm/relay/frontend/pytorch.py @@ -0,0 +1,1032 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-self, too-many-lines, len-as-condition, no-else-return, unused-variable, too-many-nested-blocks +# pylint: disable=consider-iterating-dictionary, invalid-name, unused-argument, unused-variable, broad-except +# pylint: disable=import-outside-toplevel, simplifiable-if-expression, unnecessary-comprehension +"""PT: PyTorch frontend.""" +import numpy as np + +import tvm +from tvm.ir import module as _module + +from .. import analysis as _analysis +from .. import expr as _expr +from .. import op as _op +from .common import get_relay_op +from .common import infer_shape as _infer_shape + +__all__ = ["from_pytorch"] + +# operator implementation +def _elemwise(name): + def _impl(inputs, input_types): + # TODO: Figure out a better way to get typing to work for tensor + scalar + type0 = input_types[0] + if isinstance(inputs[1], _expr.Expr): + type0 = input_types[1] + + type1 = input_types[1] + if isinstance(inputs[0], _expr.Expr): + type1 = input_types[0] + + data0 = _convert_elemwise_input(inputs[0], type0) + data1 = _convert_elemwise_input(inputs[1], type1) + + return get_relay_op(name)(data0, data1) + return _impl + +def _unsqueeze(): + def _impl(inputs, input_types): + data = inputs[0] + axis = inputs[1] + + return _op.transform.expand_dims(data, int(axis), 1) + return _impl + +def _concatenate(): + def _impl(inputs, input_types): + data = inputs[0] + axis = inputs[1] + + if isinstance(data, _expr.Expr): + data = [data] + + return _op.tensor.concatenate(data, int(axis)) + return _impl + +def _slice(): + def _impl(inputs, input_types): + data = inputs[0] + strides = [] + + if isinstance(data, _expr.Expr): + inferred_shape = _infer_shape(data) + end = [] + for infer in inferred_shape: + end.append(int(infer)) + if isinstance(data, _expr.Var): + end = inferred_shape + end = list(end) + else: + end = data.shape + + begin = [0]*len(end) + dim = int(inputs[1]) + begin[dim] = int(inputs[2]) + + if isinstance(inputs[3], str) and inputs[3].isdigit(): + end[dim] = min(end[dim], int(inputs[3])) + else: + end[dim] = inputs[3] + + strides.append(int(inputs[4])) + return _op.transform.strided_slice(data, begin, end, strides) + return _impl + +def _select(): + def _impl(inputs, input_types): + data = inputs[0] + dim = int(inputs[1]) + index = int(inputs[2]) + + return _op.transform.take(data, _expr.const(index, dtype="int32"), axis=dim) + return _impl + +def _ones(): + def _impl(inputs, input_types): + data = inputs[0] + + import torch + if isinstance(data, _expr.Expr): + shape = _infer_shape(data) + elif isinstance(data, list): + shape = data + elif isinstance(data, (torch.Tensor, np.ndarray)): + shape = data.shape + else: + assert "data type {} could not be parsed in ones op" % (type(data)) + + return _op.full(_expr.const(1), shape, dtype=_convert_data_type(input_types[0])) + return _impl + +def _zeros(): + def _impl(inputs, input_types): + data = inputs[0] + + import torch + if isinstance(data, _expr.Expr): + shape = _infer_shape(data) + elif isinstance(data, list): + shape = data + elif isinstance(data, (torch.Tensor, np.ndarray)): + shape = data.shape + else: + assert "data type {} could not be parsed in zeros op" % (type(data)) + + return _op.full(_expr.const(0), shape, dtype=_convert_data_type(input_types[0])) + return _impl + +def _relu(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.nn.relu(data) + return _impl + +def _adaptive_avg_2d(): + def 
_impl(inputs, input_types): + data = inputs[0] + output_size = _infer_shape(inputs[1]) + + return _op.contrib.contrib.adaptive_avg_pool2d( + data, + output_size=output_size) + return _impl + +def _adaptive_max_2d(): + def _impl(inputs, input_types): + data = inputs[0] + output_size = _infer_shape(inputs[1]) + + return _op.contrib.contrib.adaptive_max_pool2d( + data, + output_size=output_size) + return _impl + +def _maxpool_2d(): + def _impl(inputs, input_types): + data = inputs[0] + + pool_size = _infer_shape(inputs[1]) + strides = _infer_shape(inputs[2]) + padding = _infer_shape(inputs[3]) + + ceil_mode = int(inputs[5]) + + return _op.nn.max_pool2d(data, pool_size, strides, padding, "NCHW", ceil_mode) + return _impl + +def _hardtanh(): + def _impl(inputs, input_types): + a = inputs[0] + tanh_min = float(inputs[1]) + tanh_max = float(inputs[2]) + return _op.tensor.clip(a, tanh_min, tanh_max) + return _impl + +def _convolution(): + def _impl(inputs, input_types): + # Use transpose or normal + use_transpose = True if inputs[6] == "1" else False + + data = inputs[0] + weight = inputs[1] + bias = inputs[2] + strides = inputs[3] + padding = inputs[4] + dilation = inputs[5] + + if isinstance(weight, _expr.Expr): + inferred_shape = _infer_shape(weight) + weight_shape = [] + for infer in inferred_shape: + weight_shape.append(infer) + else: + assert "data type {} could not be parsed in conv op" % (type(weight)) + + # TODO: Add reshape when channel multiplier > 1. Pending PR #4644 + channels = weight_shape[0] + groups = int(inputs[8]) + + if groups > 1: + # in torch, groups == in_channels for depth wise conv + channel_multiplier = channels // groups + new_weight_shape = (groups, channel_multiplier, weight_shape[2], weight_shape[3]) + weight = _op.transform.reshape(weight, new_weight_shape) + + kernel_size = weight_shape[2:] + use_bias = isinstance(bias, _expr.Expr) + + if isinstance(strides, _expr.Expr): + strides = _infer_shape(strides) + + if isinstance(padding, _expr.Expr): + padding = _infer_shape(padding) + + if isinstance(dilation, _expr.Expr): + dilation = _infer_shape(dilation) + + if use_transpose: + conv_out = _op.nn.conv2d_transpose(data, + weight, + strides=strides, + padding=padding, + dilation=dilation, + groups=groups, + channels=channels, + kernel_size=kernel_size, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="", + out_dtype="") + else: + conv_out = _op.nn.conv2d(data, + weight, + strides=strides, + padding=padding, + dilation=dilation, + groups=groups, + channels=channels, + kernel_size=kernel_size, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="", + out_dtype="") + + if use_bias: + return _op.nn.bias_add(conv_out, bias) + else: + return conv_out + return _impl + +def _softmax(): + def _impl(inputs, input_types): + data = inputs[0] + axis = inputs[1] + if isinstance(axis, str): + axis = int(axis) + + return _op.nn.softmax(data, axis=axis) + return _impl + +def _threshold(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.nn.relu(data) + return _impl + +def _contiguous(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.tensor.copy(data) + return _impl + +def _batch_norm(): + def _impl(inputs, input_types): + data = inputs[0] + data_type = input_types[0] + + channels = _infer_shape(data) + + if isinstance(inputs[1], _expr.Expr) and isinstance(inputs[2], _expr.Expr): + scale = center = True + weight = inputs[1] + beta = inputs[2] + gamma = weight + else: + scale = center = False + + if not scale: + gamma = 
_create_typed_const(np.ones([int(channels[1])]), data_type) + + if not center: + beta = _create_typed_const(np.zeros([int(channels[1])]), data_type) + + moving_mean = inputs[3] + moving_var = inputs[4] + epsilon = float(inputs[7]) + + return _op.nn.batch_norm(data, + gamma, + beta, + moving_mean, + moving_var, + axis=1, + epsilon=epsilon, + center=center, + scale=scale)[0] + return _impl + +def _transpose(): + def _impl(inputs, input_types): + data = inputs[0] + + import torch + if isinstance(data, _expr.Expr): + ndims = len(_infer_shape(data)) + elif isinstance(data, list): + ndims = data + elif isinstance(data, (torch.Tensor, np.ndarray)): + ndims = data.shape + else: + assert "data type {} could not be parsed in transpose op" % (type(data)) + + if isinstance(data, tvm.runtime.NDArray): + ndims = len(data.shape) + axes = list(range(ndims)) + + num_inputs = len(inputs) + + if num_inputs == 1: + if ndims >= 2: + axes[-1] = ndims - 2 + axes[-2] = ndims - 1 + if not isinstance(data, _expr.Expr): + data = _expr.const(data) + + elif num_inputs == 3: + parse = lambda i: ndims * (i < 0) + i + src, dst = [parse(int(inputs[i])) for i in [1, 2]] + axes[src] = dst + axes[dst] = src + else: + axes = inputs[1] + return _op.transform.transpose(data, axes) + return _impl + +def _flatten(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.nn.batch_flatten(data) + return _impl + +def _dense(): + def _impl(inputs, input_types): + use_bias = isinstance(inputs[0], _expr.Expr) + + data = inputs[1] + data_type = input_types[1] + weight = inputs[2] + + beta = inputs[3] + alpha = inputs[4] + + if not isinstance(alpha, _expr.Expr): + alpha = _create_typed_const(alpha, data_type) + data *= alpha + + if not isinstance(beta, _expr.Expr): + beta = _create_typed_const(beta, data_type) + weight *= beta + + weight_out = _op.transform.transpose(weight, axes=[1, 0]) + + units = _infer_shape(weight_out)[0] + dense_out = _op.nn.dense(data, weight_out, units=units) + + if use_bias: + bias = inputs[0] + return _op.nn.bias_add(dense_out, bias) + else: + return dense_out + return _impl + +def _size(): + def _impl(inputs, input_types): + axis = int(inputs[1]) + shape = _infer_shape(inputs[0]) + return shape[axis] + return _impl + +def _numtotensor(): + def _impl(inputs, input_types): + val = inputs[0] + dtype = type(val) + + if isinstance(val, tvm.expr.IntImm): + val = val.__int__() + dtype = int + + arr = val * np.ones([]).astype(dtype) + return arr + return _impl + +def _view(): + def _impl(inputs, input_types): + data = inputs[0] + + if len(inputs) == 3: + new_shape = [inputs[1], _infer_shape(inputs[2])[0]] + else: + if isinstance(inputs[1], list): + new_shape = inputs[1] + else: + new_shape = _infer_shape(inputs[1]) + + return _op.transform.reshape(data, new_shape) + return _impl + +def _clone(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.tensor.copy(data) + return _impl + +def _log_softmax(): + def _impl(inputs, input_types): + data = inputs[0] + axis = int(inputs[1]) + return _op.nn.log_softmax(data, axis) + return _impl + +def _sigmoid(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.tensor.sigmoid(data) + return _impl + +def _avg_pool2d(): + def _impl(inputs, input_types): + data = inputs[0] + + pool_size = _infer_shape(inputs[1]) + strides = _infer_shape(inputs[2]) + padding = _infer_shape(inputs[3]) + + ceil_mode = int(inputs[4]) + count_include_pad = int(inputs[5]) + + return _op.nn.avg_pool2d(data, + pool_size=pool_size, + strides=strides, + padding=padding, 
+ ceil_mode=ceil_mode, + count_include_pad=count_include_pad) + return _impl + +def _dropout(): + def _impl(inputs, input_types): + data = inputs[0] + rate = float(inputs[1]) + + return _op.nn.dropout(data, rate) + return _impl + +def _reduce(name): + def _impl(inputs, attrs, params): + data = inputs[0] + return get_relay_op(name)(data) + return _impl + +def _mean(): + def _impl(inputs, input_types): + data = inputs[0] + axis = _infer_shape(inputs[1]) + + keepdims = int(inputs[2]) + exclude = int(inputs[3]) + + return _op.mean(data, axis, keepdims, exclude) + return _impl + +def _chunk(): + def _impl(inputs, input_types): + data = inputs[0] + + num_chunks = int(inputs[1]) + axis = int(inputs[2]) + + if isinstance(data, _expr.Expr): + inferred_shape = _infer_shape(data) + + shape = [] + for infer in inferred_shape: + shape.append(infer) + + dim = int(shape[axis]) + + if dim % num_chunks: + unif_size = int(dim / (num_chunks - 1)) + else: + unif_size = int(dim / num_chunks) + + chunks = [] + for i in range(0, dim, unif_size): + begin = [0] * len(shape) + end = shape[:] + begin[axis] = i + end[axis] = i + unif_size + stride = [1] * len(shape) + + chunk_out = _op.transform.strided_slice(data, begin, end, stride) + chunks.append(chunk_out) + + + if dim % num_chunks: + begin = [0] * len(shape) + end = shape[:] + begin[axis] = unif_size * (num_chunks - 1) + end[axis] = dim + stride = [1] * len(shape) + + chunk_out = _op.transform.strided_slice(data, begin, end, stride) + chunks.append(chunk_out) + + return chunks + return _impl + +def _matmul(): + def _impl(inputs, input_types): + data0 = inputs[0] + data1 = inputs[1] + data1_t = _op.transpose(data1, axes=(1, 0)) + + return _op.nn.dense(data0, data1_t) + return _impl + +def _expand(): + def _impl(inputs, input_types): + data_in = inputs[0] + if isinstance(data_in, _expr.Expr): + shape = _infer_shape(data_in) + + ndims = len(shape) + sizes = _infer_shape(inputs[1]) + out = inputs[0] + + for i in range(ndims): + if sizes[i] in {-1, shape[i]}: + continue + data = list() + for temp in range(sizes[i]): + data.append(out) + call = _op.tensor.concatenate(data, i) + + return call + return _impl + +def _int(): + def _impl(inputs, input_types): + if isinstance(inputs[0], _expr.Expr): + return inputs[0] + return int(inputs[0]) + return _impl + +def _identity(): + def _impl(inputs, input_types): + return inputs[0] + return _impl + +def _none(): + def _impl(inputs, input_types): + return None + return _impl + +def _pad(): + def _impl(inputs, input_types): + data = inputs[0] + padding = inputs[1] + pad_width = list(zip(padding, padding)) + pad_value = inputs[2] + return _op.nn.pad(data, pad_width, pad_value) + return _impl + +def _sqrt(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.tensor.sqrt(data) + return _impl + +# Helper functions for operator implementation + +def _convert_data_type(input_type): + if input_type in ["double", "torch.float64"]: + return "float64" + elif input_type in ["float", "torch.float32"]: + return "float32" + elif input_type in ["half", "torch.float16"]: + return "float16" + elif input_type in ["long", "torch.int64"]: + return "int64" + elif input_type in ["int", "torch.int32"]: + return "int32" + elif input_type in ["short", "torch.int16"]: + return "int16" + elif input_type in ["char", "torch.int8"]: + return "int8" + elif input_type in ["byte", "torch.uint8"]: + return "uint8" + else: + raise NotImplementedError("input_type {} is not handled yet" % (input_type)) + return "float32" + +def 
_create_typed_const(data, data_type): + dtype = _convert_data_type(data_type) + + if dtype == "float64": + typed_data = _expr.const(np.float64(data), dtype=dtype) + elif dtype == "float32": + typed_data = _expr.const(np.float32(data), dtype=dtype) + elif dtype == "float16": + typed_data = _expr.const(np.float16(data), dtype=dtype) + elif dtype == "int64": + typed_data = _expr.const(np.int64(data), dtype=dtype) + elif dtype == "int32": + typed_data = _expr.const(np.int32(data), dtype=dtype) + elif dtype == "int16": + typed_data = _expr.const(np.int16(data), dtype=dtype) + elif dtype == "int8": + typed_data = _expr.const(np.int8(data), dtype=dtype) + elif dtype == "uint8": + typed_data = _expr.const(np.uint8(data), dtype=dtype) + else: + raise NotImplementedError("input_type {} is not handled yet".format(data_type)) + return typed_data + +def _convert_elemwise_input(data, input_type): + import torch + if isinstance(data, torch.Tensor): + return _expr.const(data.item(), dtype=_convert_data_type(input_type)) + elif not isinstance(data, _expr.Expr): + return _expr.const(int(data), dtype=_convert_data_type(input_type)) + else: + return data + +# Operator mappings + +_convert_map = { + "aten::device" : _none(), + "aten::add" : _elemwise("add"), + "aten::add_" : _elemwise("add"), + "aten::sub" : _elemwise("subtract"), + "aten::sub_" : _elemwise("subtract"), + "aten::max" : _elemwise("maximum"), + "aten::min" : _elemwise("minimum"), + "aten::mul" : _elemwise("multiply"), + "aten::mul_" : _elemwise("multiply"), + "aten::pow" : _elemwise("power"), + "aten::div" : _elemwise("divide"), + "aten::div_" : _elemwise("divide"), + "aten::ones" : _ones(), + "aten::zeros" : _zeros(), + "aten::to" : _identity(), + "aten::unsqueeze" : _unsqueeze(), + "aten::cat" : _concatenate(), + "aten::slice" : _slice(), + "aten::select" : _select(), + "aten::relu" : _relu(), + "aten::relu_" : _relu(), + "aten::adaptive_avg_pool2d" : _adaptive_avg_2d(), + "aten::adaptive_max_pool2d" : _adaptive_max_2d(), + "aten::max_pool2d" : _maxpool_2d(), + "aten::max_pool2d_with_indices" : _maxpool_2d(), + "aten::hardtanh" : _hardtanh(), + "aten::hardtanh_" : _hardtanh(), + "aten::_convolution" : _convolution(), + "aten::softmax" : _softmax(), + "aten::threshold" : _threshold(), + "aten::threshold_" : _threshold(), + "aten::contiguous" : _contiguous(), + "aten::batch_norm" : _batch_norm(), + "aten::transpose" : _transpose(), + "aten::transpose_" : _transpose(), + "aten::t" : _transpose(), + "aten::flatten" : _flatten(), + "aten::addmm" : _dense(), + "aten::size" : _size(), + "aten::view" : _view(), + "aten::clone" : _clone(), + "aten::log_softmax" : _log_softmax(), + "aten::sigmoid" : _sigmoid(), + "aten::avg_pool2d" : _avg_pool2d(), + "aten::dropout" : _dropout(), + "aten::dropout_" : _dropout(), + "aten::mean" : _mean(), + "aten::chunk" : _chunk(), + "aten::matmul" : _matmul(), + "aten::expand" : _expand(), + "aten::Int" : _int(), + "prim::NumToTensor" : _numtotensor(), + "prim::ListUnpack" : _identity(), + "aten::constant_pad_nd" : _pad(), + "aten::permute" : _transpose(), + "aten::sum" : _reduce("sum"), + "aten::prod" : _reduce("prod"), + "aten::sqrt" : _sqrt() +} + +# Internal graph for parsing + +class Graph(object): + """ A helper class for parsing PyTorch model to Relay graph.""" + + def __init__(self, script_module, input_shapes): + + self._script_module = script_module + self._graph = script_module.graph.copy() + + # TODO: Temporary fix to remove prim::CallMethod node introduced in PT 1.4 + import torch + from packaging import 
version + if version.parse(torch.__version__) >= version.parse("1.4.0"): + torch._C._jit_pass_inline(self._graph) + + self._inputs_r = {} + self._params = {} + self._param_tensors = {} + self._consts = {} + self._ops = {} + self._op_inputs_r = {} + self._op_inputs_types = {} + self._input_shapes = input_shapes if input_shapes else {} + self._parsed_node_names = {} + + def from_pytorch(self): + """ Construct relay nodes from PyTorch graph + + Currently only supports the traced PyTorch format, which means no control flow. + The user must run torch.jit.trace on a model and pass the result in. + Future support should include scripted models (torch.jit.script), which + preserve control flow. + + Returns + ------- + mod : tvm.relay.Module + The module that optimizations will be performed on. + + params : dict of str to tvm.runtime + Dict of converted parameters stored in tvm.runtime format + """ + # Check for missing ops + missing_operators = self._parse_import_prerequisites() + + if missing_operators: + raise tvm.error.OpNotImplemented( \ + "The following operators are not implemented: {}".format(missing_operators)) + + # Translate the PyTorch graph by decorating Graph with the state dict and wiring inputs into each op + self._parse_inputs() + self._parse_params() + self._parse_ops() + + outputs = [] + nid = 0 + + for op_name, op_node in self._ops.items(): + if op_node.kind() == "prim::ListConstruct": + if any(inp.debugName() in self._parsed_node_names.keys() \ + for inp in op_node.inputs()): + list_constr = [] + for i in op_node.inputs(): + if i.debugName() in self._parsed_node_names.keys(): + list_constr.append( \ + outputs[self._parsed_node_names[i.debugName()]]) + elif i.node().kind() == "prim::Constant": + list_constr.append(int(self._consts[i.debugName()])) + elif i.debugName() in self._inputs_r.keys(): + list_constr.append(int(self._inputs_r[i.debugName()])) + + # Unwrap for tensors + if len(list_constr) == 1: + list_constr = list_constr[0] + + outputs.append(list_constr) + self._parsed_node_names[op_name] = nid + nid = nid+1 + elif op_node.kind() != "prim::Constant": + for i in op_node.inputs(): + if i.debugName() in self._parsed_node_names.keys(): + for cnt in range(0, len(self._op_inputs_r[op_name])): + if isinstance(self._op_inputs_r[op_name][cnt], str): + if "call/var" in self._op_inputs_r[op_name][cnt]: + self._op_inputs_r[op_name][cnt] = \ + outputs[self._parsed_node_names[i.debugName()]] + break + + call = _convert_map[op_node.kind()](self._op_inputs_r[op_name], + self._op_inputs_types[op_name]) + + outputs.append(call) + self._parsed_node_names[op_name] = nid + nid = nid+1 + + func = tvm.relay.Function(_analysis.free_vars(outputs[-1]), outputs[-1]) + + param = {k: tvm.nd.array(v) for k, v in self._param_tensors.items()} + + return _module.IRModule.from_expr(func), param + + def _parse_inputs(self): + """ Map graph inputs to Relay variables and register them with the parser. 
""" + # Get names and objects of inputs for IR + ir_inputs = [i for i in self._graph.inputs()] + + # Create corresponding shape and add to input + for input_name, ir_input in zip(self._input_shapes, ir_inputs[1:]): + input_shape = self._input_shapes[input_name] + ir_input.setDebugName(input_name) + + ir_dtype = _convert_data_type(ir_input.type().scalarType().lower()) + self._inputs_r[input_name] = _expr.var(input_name, + shape=self._input_shapes[input_name], + dtype=ir_dtype) + + # Add self (first input of a PyTorch graph) to inputs, the value doesn't matter here + input_name = ir_inputs[0].debugName() + self._inputs_r[input_name] = "self" + + def _parse_params(self): + """ Map state dictionary values to corresponding prim::GetAttr op node. """ + # Grab weights, biases, etc. from graph + state_dict = self._script_module.state_dict() + param_names = [] + for key, value in state_dict.items(): + param_str = str(key) + param_name = param_str.split(".")[-1] + param_names.append(param_name) + + # Get names of all inputs + input_names = [i for i in self._inputs_r.keys()] + + # Iterate through graph for getAttr nodes and match full state_dict name to nodes + node_weight_map = {} + for node in self._graph.nodes(): + if node.kind() == "prim::GetAttr": + + attribute_names = node.attributeNames() + assert len(attribute_names) == 1 + node_getattr_name = node.s(attribute_names[0]) + node_arg = node.input().debugName() + + if node.outputsSize() == 1: + node_name = node.output().debugName() + else: + node_name = [output.debugName() for output in node.outputs()][0] + + if node_arg in input_names: + node_weight_map[node_name] = node_getattr_name + else: + previous_map = node_weight_map[node_arg[:]] + node_weight_map[node_name] = previous_map+"."+node_getattr_name + + if node_getattr_name in param_names: + + value = state_dict[node_weight_map[node_name]] + tensor = tvm.nd.array(value.cpu().numpy()) + shape = tensor.shape + self._param_tensors[node_name] = tensor + + self._params[node_name] = _expr.var(node_name, + shape=shape, + dtype=_convert_data_type(str(value.dtype))) + + def _parse_ops(self): + """ Iterate through nodes and decorate graph with constants, operators, + and the inputs to each operator. 
""" + # Traverse nodes and add to graph + for node in self._graph.nodes(): + + if node.outputsSize() == 1: + node_name = node.output().debugName() + else: + node_name = [output.debugName() for output in node.outputs()][0] + + if node.kind() == "prim::Constant": + if node.hasAttributes(): + attribute_names = node.attributeNames() + attr_name = attribute_names[0] + ty = node.output().type().kind() + + if ty in ["IntType", "BoolType"]: + self._consts[node_name] = node.i(attr_name) + elif ty in ["FloatType", "LongType"]: + self._consts[node_name] = node.f(attr_name) + elif ty in ["TensorType", "CompleteTensorType"]: + self._consts[node_name] = node.output().toIValue() + else: + self._consts[node_name] = "0" + else: + self._consts[node_name] = "0" + elif node.kind() == "prim::ListConstruct": + list_shape = [] + for input_node in node.inputs(): + if input_node.debugName() in self._inputs_r.keys(): + c = self._inputs_r[input_node.debugName()] + assert isinstance(c, int) + list_shape.append(c) + elif input_node.debugName() in self._consts.keys(): + c = self._consts[input_node.debugName()] + assert isinstance(c, int) + list_shape.append(c) + self._inputs_r[node_name] = _expr.var(node_name, shape=list_shape) + + if node.kind() != "prim::GetAttr": + self._add_op(node_name, node) + + # Graph Helper Functions + + def _add_op(self, node_id, op_node): + """ Add an operator and its operators inputs to the graph and insert placeholders + where an input is a call node. + + Parameters + ---------- + node_id : string + The ID of the op node + + op_node : PyTorch Node object + The full Node object for the op node + + """ + self._ops[(node_id)] = op_node + input_list_r = [] + input_list_types = [] + for input_value in op_node.inputs(): + + inode_id = input_value.debugName() + inode = input_value.node() + + if inode_id in self._inputs_r.keys(): + input_list_r.append(self._inputs_r[inode_id]) + elif inode_id in self._params.keys(): + input_list_r.append(self._params[inode_id]) + elif inode.kind() == "prim::Constant": + input_list_r.append(self._consts[inode_id]) + else: + input_list_r.append("call/var."+inode_id) + + # If the inputs of a ListConstruct op is a call or var, remove it from inputs + if op_node.kind() == "prim::ListConstruct": + if node_id in self._inputs_r.keys(): + self._inputs_r.pop(node_id) + + try: + input_value_kind = input_value.type().kind() + if input_value_kind in ["TensorType", "CompleteTensorType"]: + if input_value.type().scalarType() is None: + input_list_types.append("float") + else: + input_list_types.append(input_value.type().scalarType().lower()) + elif input_value_kind == "ListType": + input_list_types.append(str(input_value.type().getElementType()).lower()) + elif input_value_kind in ["IntType", "FloatType", "BoolType", "StringType", + "OptionalType"]: + input_list_types.append(str(input_value.type()).lower()) + else: + input_list_types.append("UnsupportedType") + print("UnsupportedType "+str(input_value.type())+" and "+str(input_value_kind)) + except Exception as e: + print("Internal PyTorch error. Failed to grab type.") + + if op_node.kind() in ["aten::ones", "aten::zeros"]: + node_type = op_node.output().type().scalarType() + input_list_types[0] = node_type.lower() + + self._op_inputs_r[node_id] = input_list_r + self._op_inputs_types[node_id] = input_list_types + + def _parse_import_prerequisites(self): + """ Calculate the named preconditions from PyTorch graph. 
+ + Returns + ------- + missing_operators : set object + Set of operator names which don't have their mapping in TVM, + i.e., which are not supported + + """ + missing_operators = set() + for node in self._graph.nodes(): + if node.kind() not in ["prim::Constant", "prim::ListConstruct", "prim::GetAttr"] \ + and node.kind() not in _convert_map: + missing_operators.add(node.kind()) + + return missing_operators + +def from_pytorch(script_module, input_shapes): + """ Load a PyTorch model in the form of a TorchScript module and convert it into Relay. + The companion parameters will be handled automatically. + + Parameters + ---------- + script_module : TopLevelTracedModule object + TorchScripted PyTorch graph + Note: We currently only support traces (i.e., torch.jit.trace(model, input)) + + input_shapes : Dictionary of input dimensions + Graph level input shape dictionary + + Returns + ------- + mod : tvm.relay.Module + The module that optimizations will be performed on. + + params : dict of str to tvm.runtime + Dict of converted parameters stored in tvm.runtime format + """ + g = Graph(script_module, input_shapes) + mod, params = g.from_pytorch() + return mod, params diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py new file mode 100644 index 000000000000..715ae7805cc3 --- /dev/null +++ b/tests/python/frontend/pytorch/test_forward.py @@ -0,0 +1,768 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
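For orientation before the tests, the entry point documented above is driven end to end like this — a minimal sketch, assuming torchvision is available, and using the `input0` naming convention that `verify_model` below relies on:

    import torch
    import torchvision
    import tvm
    from tvm import relay

    # Trace the model; only traced (not scripted) modules are supported here.
    model = torchvision.models.resnet18(pretrained=True).float().eval()
    inp = torch.rand(1, 3, 224, 224)
    trace = torch.jit.trace(model, inp).float().eval()

    # Input shapes are keyed by the debug names assigned to the graph inputs.
    mod, params = relay.frontend.from_pytorch(trace, {"input0": [1, 3, 224, 224]})
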
+# pylint: disable=import-self, invalid-name, unused-argument +"""Unit tests for various models and operators""" +from time import time +import os +import sys +from tempfile import TemporaryDirectory +from scipy.stats import t as tdistr +import numpy as np +import torch +from torch.nn import Module +import tvm +import torchvision + +from tvm import relay +from tvm.contrib import graph_runtime +from tvm.relay.testing.config import ctx_list + +sys.setrecursionlimit(10000) + +def _vectorize(ten): + return ten.reshape(-1) + +def atol(tru, est): + def _atol_elt(tru, est): + return abs(tru - est) + tru = _vectorize(tru) + est = _vectorize(est) + return max([_atol_elt(x, y) for x, y in zip(tru, est)]) + +def rtol(tru, est): + def _rtol_elt(tru, est): + return abs(tru - est) / min(abs(tru), abs(est)) + tru = _vectorize(tru) + est = _vectorize(est) + return max([_rtol_elt(x, y) for x, y in zip(tru, est)]) + +def assert_shapes_match(tru, est): + if tru.shape != est.shape: + msg = "Output shapes {} and {} don't match" + raise AssertionError(msg.format(tru.shape, est.shape)) + +def load_torchvision(model_name): + """Given a model name, returns a Torchvision model in eval mode as well + as an example input.""" + with torch.no_grad(): + if model_name.startswith("inception"): + height = width = 299 + mean = [0.5, 0.5, 0.5] + std = [0.5, 0.5, 0.5] + else: + height = width = 224 + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + input_shape = [1, 3, height, width] + input_data = torch.randn(input_shape).float() + for channel in range(3): + input_data[:, channel] -= mean[channel] + input_data[:, channel] /= std[channel] + model = getattr(torchvision.models, model_name)(pretrained=True) + model = model.float().eval() + return model, input_data + +def load_pretrainedmodels(model_name): + """Given a model name, returns a pretrainedmodels.pytorch model in eval + mode as well as an example input.""" + import pretrainedmodels # https://github.com/Cadene/pretrained-models.pytorch + model = getattr(pretrainedmodels, model_name)().float().eval() + input_shape = [1, *model.input_size] + input_data = torch.rand(input_shape).float() * 256 + for channel in range(3): + input_data[:, channel] -= model.mean[channel] + input_data[:, channel] /= model.std[channel] + return model, input_data + +def load_model(model_name): + """Given a model name, returns a model as well as an example input.""" + if hasattr(torchvision.models, model_name): + return load_torchvision(model_name) + try: + import pretrainedmodels + if hasattr(pretrainedmodels, model_name): + return load_pretrainedmodels(model_name) + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install pretrainedmodels.pytorch") + raise RuntimeError("Model not supported") + + +def confidence_interval(mean, stdev, count, alpha=.01): + """Returns the lower and upper bounds of the confidence interval of a random + variable. 
Confidence is 1 - alpha (default confidence is 99%).""" + stdval = tdistr.ppf(1 - alpha / 2, count - 1) + lower, upper = mean + np.array([-1, 1]) * stdval * stdev / np.sqrt(count) + return lower, upper + +def measure_latency(model, input_shapes, output_shapes, thresh, dryruns=40): + """Compute the latency of the given model""" + latencies = [] + count = 0 + while True: + if isinstance(model, torch.nn.Module): + input_data = [torch.rand(shape).float() for shape in input_shapes] + if torch.cuda.is_available(): + input_data = list(map(lambda x: x.cuda(), input_data)) + model = model.cuda() + t_start = time() + with torch.no_grad(): + model(*input_data) + t_end = time() + latencies.append(t_end - t_start) + else: + input_data = {} + for i, shape in enumerate(input_shapes): + name = "input" + str(i) + arr = np.random.random(shape).astype("float32") + input_data[name] = tvm.nd.array(arr) + t_start = time() + model.set_input(**input_data) + model.run() + for i, shape in enumerate(output_shapes): + arr = np.zeros(shape).astype("float32") + model.get_output(i, tvm.nd.array(arr)) + t_end = time() + count += 1 + if count < dryruns: + continue + latencies.append(t_end - t_start) + mean = np.mean(latencies) + stdev = np.std(latencies) + sample_size = len(latencies) + if sample_size > dryruns: + lower, upper = confidence_interval(mean, stdev, sample_size) + est = (upper + lower) / 2 + err = (upper - lower) / 2 + if err < thresh: + return est + +def verify_model(model_name, input_data=[]): + """Assert that the output of a compiled model matches with that of its + baseline.""" + if len(input_data) == 0: + baseline_model, baseline_input = load_model(model_name) + else: + baseline_model = model_name + baseline_input = input_data + if torch.cuda.is_available(): + baseline_model = baseline_model.cuda() + baseline_input = baseline_input.cuda() + with torch.no_grad(): + baseline_outputs = baseline_model(baseline_input) + if isinstance(baseline_outputs, tuple): + baseline_outputs = tuple(out.cpu().numpy() for out in baseline_outputs) + else: + baseline_outputs = (baseline_outputs.float().cpu().numpy(),) + output_shapes = [out.shape for out in baseline_outputs] + dtype = "float32" + input_name = "input0" + input_shapes = {input_name: list(baseline_input.shape)} + trace = torch.jit.trace(baseline_model, baseline_input).float().eval() + if torch.cuda.is_available(): + trace = trace.cuda() + else: + trace = trace.cpu() + + mod, params = relay.frontend.from_pytorch(trace, input_shapes) + compiled_input = {input_name: tvm.nd.array(baseline_input.cpu().numpy())} + + with relay.build_config(opt_level=3): + for target, ctx in ctx_list(): + relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) + relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) + relay_model.set_input(**relay_params) + relay_model.set_input(**compiled_input) + relay_model.run() + + for i, baseline_output in enumerate(baseline_outputs): + compiled_output = relay_model.get_output(i).asnumpy() + + assert_shapes_match(baseline_output, compiled_output) + tvm.testing.assert_allclose(baseline_output, compiled_output, + rtol=1e-3, atol=1e-3) + + del model_name + del baseline_model + torch.cuda.empty_cache() + +# Single operator tests +def test_forward_add(): + torch.set_grad_enabled(False) + input_shape = [10] + + class Add1(Module): + def forward(self, *args): + return args[0] + args[0] + + class Add2(Module): + def forward(self, *args): + return args[0] + 1 + + class Add3(Module): + def forward(self, *args): + ones 
= torch.ones(input_shape, dtype=torch.float) + if torch.cuda.is_available(): + ones = ones.cuda() + return args[0] + ones + + class Add4(Module): + def forward(self, *args): + ones = torch.ones([], dtype=torch.float) + if torch.cuda.is_available(): + ones = ones.cuda() + return args[0] + ones + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Add1().float().eval(), input_data=input_data) + verify_model(Add2().float().eval(), input_data=input_data) + verify_model(Add3().float().eval(), input_data=input_data) + verify_model(Add4().float().eval(), input_data=input_data) + +def test_forward_subtract(): + torch.set_grad_enabled(False) + input_shape = [10] + + class Subtract1(Module): + def forward(self, *args): + return args[0] - args[0] + + class Subtract2(Module): + def forward(self, *args): + return args[0] - 1 + + class Subtract3(Module): + def forward(self, *args): + ones = torch.ones(input_shape) + if torch.cuda.is_available(): + ones = ones.cuda() + return args[0] - ones + + class Subtract4(Module): + def forward(self, *args): + ones = torch.ones([]) + if torch.cuda.is_available(): + ones = ones.cuda() + return args[0] - ones + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Subtract1().float().eval(), input_data=input_data) + verify_model(Subtract2().float().eval(), input_data=input_data) + verify_model(Subtract3().float().eval(), input_data=input_data) + verify_model(Subtract4().float().eval(), input_data=input_data) + +def test_forward_multiply(): + torch.set_grad_enabled(False) + input_shape = [10] + + class Multiply1(Module): + def forward(self, *args): + return args[0] * args[0] + + class Multiply2(Module): + def forward(self, *args): + return args[0] * 1 + + class Multiply3(Module): + def forward(self, *args): + ones = torch.ones(input_shape) + if torch.cuda.is_available(): + ones = ones.cuda() + return args[0] * ones + + class Multiply4(Module): + def forward(self, *args): + ones = torch.ones([]) + if torch.cuda.is_available(): + ones = ones.cuda() + return args[0] * ones + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Multiply1().float().eval(), input_data=input_data) + verify_model(Multiply2().float().eval(), input_data=input_data) + verify_model(Multiply3().float().eval(), input_data=input_data) + verify_model(Multiply4().float().eval(), input_data=input_data) + +def test_forward_unsqueeze(): + torch.set_grad_enabled(False) + input_shape = [10, 10] + + class Unsqueeze1(Module): + def forward(self, *args): + return args[0].unsqueeze(2) + + input_data = torch.rand(input_shape).float() + verify_model(Unsqueeze1().float().eval(), input_data=input_data) + +def test_forward_concatenate(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Concatenate1(Module): + def forward(self, *args): + return torch.cat([args[0][:, 0].unsqueeze(1), args[0][:, 1].unsqueeze(1)], 1) + + class Concatenate2(Module): + def forward(self, *args): + a = (args[0][:, :, 0] + 2) * 7 + b = (args[0][:, :, 1] + 3) * 11 + c = (args[0][:, :, 2] + 5) * 13 + return torch.cat([t.unsqueeze(2) for t in [a, b, c]], 2) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Concatenate1().float().eval(), input_data=input_data) + verify_model(Concatenate2().float().eval(), input_data=input_data) + +def test_forward_relu(): + torch.set_grad_enabled(False) + input_shape = [10, 10] + + class ReLU1(Module): + def forward(self, *args): + return 
torch.nn.ReLU()(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(ReLU1().float().eval(), input_data=input_data) + +def test_forward_adaptiveavgpool(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class AdaptiveAvgPool2D1(Module): + def forward(self, *args): + return torch.nn.AdaptiveAvgPool2d([1, 1])(args[0]) + + class AdaptiveAvgPool2D2(Module): + def forward(self, *args): + return torch.nn.AdaptiveAvgPool2d([10, 10])(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(AdaptiveAvgPool2D1().float().eval(), input_data=input_data) + verify_model(AdaptiveAvgPool2D2().float().eval(), input_data=input_data) + +def test_forward_maxpool(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class MaxPool2D1(Module): + def forward(self, *args): + return torch.nn.MaxPool2d(kernel_size=[1, 1])(args[0]) + + class MaxPool2D2(Module): + def forward(self, *args): + return torch.nn.MaxPool2d(kernel_size=[10, 10])(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(MaxPool2D1().float().eval(), input_data=input_data) + verify_model(MaxPool2D2().float().eval(), input_data=input_data) + +def test_forward_avgpool(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class AvgPool2D1(Module): + def forward(self, *args): + return torch.nn.AvgPool2d(kernel_size=[10, 10])(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(AvgPool2D1().float().eval(), input_data=input_data) + +def test_forward_hardtanh(): + torch.set_grad_enabled(False) + input_shape = [10] + + class HardTanh1(Module): + def forward(self, *args): + return torch.nn.Hardtanh()(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(HardTanh1().float().eval(), input_data=input_data) + +def test_forward_conv(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Conv2D1(Module): + def __init__(self): + super(Conv2D1, self).__init__() + self.conv = torch.nn.Conv2d(3, 6, 7, bias=True) + self.softmax = torch.nn.Softmax() + + def forward(self, *args): + return self.softmax(self.conv(args[0])) + + class Conv2D2(Module): + def __init__(self): + super(Conv2D2, self).__init__() + self.conv = torch.nn.Conv2d(3, 6, 7, bias=False) + self.softmax = torch.nn.Softmax() + + def forward(self, *args): + return self.softmax(self.conv(args[0])) + + class Conv2D3(Module): + def __init__(self): + super(Conv2D3, self).__init__() + self.conv = torch.nn.Conv2d(3, 6, 7, groups=3, bias=False) + self.softmax = torch.nn.Softmax() + + def forward(self, *args): + return self.softmax(self.conv(args[0])) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Conv2D1().float().eval(), input_data=input_data) + verify_model(Conv2D2().float().eval(), input_data=input_data) + verify_model(Conv2D3().float().eval(), input_data=input_data) + +def test_forward_threshold(): + torch.set_grad_enabled(False) + input_shape = [1, 3] + + class Threshold1(Module): + def forward(self, *args): + return torch.nn.Threshold(0, 0)(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Threshold1().float().eval(), input_data=input_data) + +def test_forward_contiguous(): + torch.set_grad_enabled(False) + input_shape = [10] + + class Contiguous1(Module): + def forward(self, *args): + return args[0].contiguous() + + with torch.no_grad(): + 
input_data = torch.rand(input_shape).float() + verify_model(Contiguous1().float().eval(), input_data=input_data) + +def test_forward_batchnorm(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class BatchNorm1(Module): + def __init__(self): + super(BatchNorm1, self).__init__() + self.batch_norm = torch.nn.BatchNorm2d(3, affine=True) + def forward(self, *args): + return self.batch_norm(args[0]) + + class BatchNorm2(Module): + def __init__(self): + super(BatchNorm2, self).__init__() + self.batch_norm = torch.nn.BatchNorm2d(3, affine=False) + def forward(self, *args): + return self.batch_norm(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(BatchNorm1().float().eval(), input_data=input_data) + verify_model(BatchNorm2().float().eval(), input_data=input_data) + +def test_forward_transpose(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Transpose1(Module): + def forward(self, *args): + return args[0].transpose(2, 3) + + class Transpose2(Module): + def forward(self, *args): + return args[0].transpose(-2, -1) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Transpose1().float().eval(), input_data=input_data) + verify_model(Transpose2().float().eval(), input_data=input_data) + +def test_forward_size(): + torch.set_grad_enabled(False) + input_shape = [1, 3] + + class Size1(Module): + def forward(self, *args): + return args[0].size(0) * args[0] + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Size1().float().eval(), input_data=input_data) + +def test_forward_view(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class View1(Module): + def forward(self, *args): + return args[0].view((1, 3 * 10 * 10)) + + class View2(Module): + def forward(self, *args): + return args[0].view(args[0].shape[0], -1) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(View1().float().eval(), input_data=input_data) + verify_model(View2().float().eval(), input_data=input_data) + +def test_forward_select(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Select1(Module): + def forward(self, *args): + return args[0].select(1, 1) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Select1().float().eval(), input_data=input_data) + +def test_forward_clone(): + torch.set_grad_enabled(False) + input_shape = [10] + + class Clone1(Module): + def forward(self, *args): + return args[0].clone() + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Clone1().float().eval(), input_data=input_data) + +def test_forward_logsoftmax(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class LogSoftmax1(Module): + def forward(self, *args): + return torch.nn.LogSoftmax(dim=1)(args[0][0, 0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(LogSoftmax1().float().eval(), input_data=input_data) + +def test_forward_sigmoid(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Sigmoid1(Module): + def forward(self, *args): + return torch.nn.Sigmoid()(args[0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Sigmoid1().float().eval(), input_data=input_data) + +def test_forward_dense(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Dense1(Module): + def __init__(self): + super(Dense1, 
self).__init__() + self.linear = torch.nn.Linear(10, 7, bias=True) + def forward(self, *args): + return self.linear(args[0][0, 0]) + + class Dense2(Module): + def __init__(self): + super(Dense2, self).__init__() + self.linear = torch.nn.Linear(10, 7, bias=False) + def forward(self, *args): + return self.linear(args[0][0, 0]) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Dense1().float().eval(), input_data=input_data) + verify_model(Dense2().float().eval(), input_data=input_data) + +def test_forward_dropout(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Dropout1(Module): + def forward(self, *args): + return torch.nn.functional.dropout(args[0][0, 0], 0.5, False) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Dropout1().float().eval(), input_data=input_data) + +def test_forward_slice(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Slice1(Module): + def forward(self, *args): + return args[0][:, :, :, :3] + + class Slice2(Module): + def forward(self, *args): + return args[0][0, :, :, :] + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Slice1().float().eval(), input_data=input_data) + verify_model(Slice2().float().eval(), input_data=input_data) + +def test_forward_mean(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Mean1(Module): + def forward(self, *args): + return args[0].mean(2) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Mean1().float().eval(), input_data=input_data) + +def test_forward_expand(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Expand1(Module): + def forward(self, *args): + return args[0].expand((3, -1, -1, -1)) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Expand1().float().eval(), input_data=input_data) + +def test_forward_pow(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 10, 10] + + class Pow1(Module): + def forward(self, *args): + return args[0] ** 2 + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Pow1().float().eval(), input_data=input_data) + +def test_forward_chunk(): + torch.set_grad_enabled(False) + input_shape = [1, 3, 14, 14] + + class Chunk1(Module): + def forward(self, *args): + chunks = args[0].chunk(7, 2) + return torch.cat(chunks, 2) + + with torch.no_grad(): + input_data = torch.rand(input_shape).float() + verify_model(Chunk1().float().eval(), input_data=input_data) + +# Model tests +def test_resnet18(): + torch.set_grad_enabled(False) + verify_model("resnet18") + +def test_squeezenet1_0(): + torch.set_grad_enabled(False) + verify_model("squeezenet1_0") + +def test_squeezenet1_1(): + torch.set_grad_enabled(False) + verify_model("squeezenet1_1") + +def test_densenet121(): + torch.set_grad_enabled(False) + verify_model("densenet121") + +def test_inception_v3(): + torch.set_grad_enabled(False) + verify_model("inception_v3") + +def test_googlenet(): + torch.set_grad_enabled(False) + verify_model("googlenet") + +def test_mnasnet0_5(): + torch.set_grad_enabled(False) + verify_model("mnasnet0_5") + +""" +#TODO: Fix VGG and AlexNet issues (probably due to pooling) +def test_alexnet(): + torch.set_grad_enabled(False) + verify_model("alexnet") + +def test_vgg11(): + torch.set_grad_enabled(False) + verify_model("vgg11") + +def test_vgg11_bn(): + torch.set_grad_enabled(False) + verify_model("vgg11_bn") + 
+#TODO: Need to update schedule in tophub file after PR #4787 updated workloads +def test_mobilenet_v2(): + torch.set_grad_enabled(False) + verify_model("mobilenet_v2") +""" + +if __name__ == "__main__": + # Single operator tests + test_forward_add() + test_forward_subtract() + test_forward_multiply() + test_forward_unsqueeze() + test_forward_concatenate() + test_forward_relu() + test_forward_adaptiveavgpool() + test_forward_maxpool() + test_forward_hardtanh() + test_forward_conv() + test_forward_threshold() + test_forward_contiguous() + test_forward_batchnorm() + test_forward_transpose() + test_forward_size() + test_forward_view() + test_forward_select() + test_forward_clone() + test_forward_logsoftmax() + test_forward_sigmoid() + test_forward_dense() + test_forward_avgpool() + test_forward_dropout() + test_forward_slice() + test_forward_mean() + test_forward_expand() + test_forward_pow() + test_forward_chunk() + + # Model tests + test_resnet18() + test_squeezenet1_0() + test_squeezenet1_1() + test_densenet121() + test_inception_v3() + test_googlenet() + test_mnasnet0_5() diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh index d93036b3d65d..862de5a81c73 100755 --- a/tests/scripts/task_python_frontend.sh +++ b/tests/scripts/task_python_frontend.sh @@ -52,3 +52,6 @@ python3 -m pytest -v tests/python/frontend/caffe2 echo "Running relay DarkNet frontend test..." python3 -m pytest -v tests/python/frontend/darknet + +echo "Running relay PyTorch frontend test..." +python3 -m pytest -v tests/python/frontend/pytorch From a2429c1fa61cf54d1890e887572c8fa93c467d7a Mon Sep 17 00:00:00 2001 From: Jon Soifer Date: Mon, 24 Feb 2020 20:53:24 -0800 Subject: [PATCH 35/73] [Relay][External Codegen] Support data types for CSourceModuleCodegen args and output (#4934) * Support int args and no extra buffers * Fixes * remove testing code * fix style * more style * use const args * style Co-authored-by: Jon Soifer --- .../backend/contrib/codegen_c/codegen.cc | 57 ++++++----- .../backend/contrib/codegen_c/codegen_c.h | 94 ++++++++++++++----- src/relay/backend/contrib/dnnl/codegen.cc | 23 +++-- tests/python/relay/test_external_codegen.py | 18 ++++ 4 files changed, 139 insertions(+), 53 deletions(-) diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index 55844479d605..126d1d5839de 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -41,9 +41,11 @@ class CodegenC : public ExprVisitor, public CodegenCBase { explicit CodegenC(const std::string& id) { this->ext_func_id_ = id; } void VisitExpr_(const VarNode* node) { - ext_func_args_.push_back(node->name_hint()); + ext_func_args_.push_back(GetRef(node)); out_.clear(); - out_.push_back({node->name_hint(), 0}); + Output output; + output.name = node->name_hint(); + out_.push_back(output); } void VisitExpr_(const CallNode* call) final { @@ -70,6 +72,12 @@ class CodegenC : public ExprVisitor, public CodegenCBase { for (size_t i = 0; i < in_shape.size(); ++i) { macro_stream << ", " << in_shape[i]; } + + const auto* type_node = call->checked_type().as(); + CHECK(type_node); + const auto& dtype = GetDtypeString(type_node); + macro_stream << ", " << dtype; + macro_stream << ");"; func_decl_.push_back(macro_stream.str()); @@ -83,20 +91,18 @@ class CodegenC : public ExprVisitor, public CodegenCBase { decl_stream << ", "; } first = false; - decl_stream << out.first; + decl_stream << out.name; } } - auto type_node = 
call->checked_type().as(); - CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) - << "Only support single output tensor with float type"; std::string out = "buf_" + std::to_string(buf_idx_++); auto out_shape = GetShape(call->checked_type()); int out_size = 1; for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } - buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");"; + buf_stream << dtype << "* " << out << + " = (" << dtype << "*)std::malloc(4 * " << out_size << ");"; buf_decl_.push_back(buf_stream.str()); decl_stream << ", " << out << ");"; @@ -104,7 +110,12 @@ class CodegenC : public ExprVisitor, public CodegenCBase { // Update output buffer out_.clear(); - out_.push_back({out, out_size}); + Output output; + output.name = out; + output.dtype = dtype; + output.need_copy = true; + output.size = out_size; + out_.push_back(output); } /*! @@ -128,7 +139,7 @@ class CodegenC : public ExprVisitor, public CodegenCBase { /*! \brief The index of allocated buffers. */ int buf_idx_ = 0; /*! \brief The arguments of a C compiler compatible function. */ - std::vector ext_func_args_; + Array ext_func_args_; /*! \brief The statements of a C compiler compatible function. */ std::vector ext_func_body; /*! \brief The declaration statements of a C compiler compatible function. */ @@ -136,7 +147,7 @@ class CodegenC : public ExprVisitor, public CodegenCBase { /*! \brief The declaration statements of buffers. */ std::vector buf_decl_; /*! \brief The name and index pairs for output. */ - std::vector> out_; + std::vector out_; }; class CSourceCodegen : public CSourceModuleCodegenBase { @@ -161,21 +172,21 @@ class CSourceCodegen : public CSourceModuleCodegenBase { // Append some common macro for operator definition. const char* operator_macro = R"op_macro( - #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - out[i] = a[i] p_OP_ b[i]; \ - } \ + #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE) \ + extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + out[i] = a[i] p_OP_ b[i]; \ + } \ } - #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ + #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_, p_DTYPE) \ + extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ + for (int64_t i = 0; i < p_DIM1_; ++i) { \ + for (int64_t j = 0; j < p_DIM2_; ++j) { \ + int64_t k = i * p_DIM2_ + j; \ + out[k] = a[k] p_OP_ b[k]; \ + } \ + } \ } )op_macro"; diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index f473c93a2896..2a88d4b7996a 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -35,6 +35,13 @@ namespace tvm { namespace relay { namespace contrib { +struct Output { + std::string name; + std::string dtype; + int size; + bool need_copy; +}; + class CSourceModuleCodegenBase { public: CSourceModuleCodegenBase() = default; @@ -98,7 +105,7 @@ class CodegenCBase { * \brief Gerenate C code for the external function. * * \param func_name The name of the external function. 
- * \param arg_cnt The expected number of arguments. + * \param args arguments to the external function. * * \code * @@ -116,16 +123,18 @@ class CodegenCBase { * * \endcode */ - void GenerateBackendCFunc(const std::string& func_name, int arg_cnt) { + void GenerateBackendCFunc(const std::string& func_name, + const Array& args, + const Output& out) { // Print signature code_stream_ << "\n"; code_stream_ << "extern \"C\" int " << func_name << "_wrapper_("; - for (int i = 0; i < arg_cnt - 1; i++) { + for (size_t i = 0; i < args.size(); i++) { code_stream_ << "DLTensor* arg" << i << ",\n"; code_stream_ << "\t"; } - if (arg_cnt > 0) { - code_stream_ << "DLTensor* arg" << arg_cnt - 1 << ") {\n"; + if (args.size() > 0) { + code_stream_ << "DLTensor* arg" << args.size() << ") {\n"; } EnterScope(); @@ -133,12 +142,13 @@ class CodegenCBase { // Generate the internal call. PrintIndents(); code_stream_ << func_name << "_("; - for (int i = 0; i < arg_cnt - 1; i++) { - code_stream_ << "static_cast(arg" << i << "->data),\n"; + for (size_t i = 0; i < args.size(); i++) { + const auto& dtype_str = GetDtypeString(args[i]); + code_stream_ << "static_cast<" << dtype_str << "*>(arg" << i << "->data),\n"; PrintIndents(); } - if (arg_cnt > 0) { - code_stream_ << "static_cast(arg" << arg_cnt - 1 << "->data)"; + if (args.size() > 0) { + code_stream_ << "static_cast<" << out.dtype << "*>(arg" << args.size() << "->data)"; } code_stream_ << ");\n"; PrintIndents(); @@ -207,17 +217,21 @@ class CodegenCBase { * * \return The emitted code string. */ - std::string JitImpl(std::string ext_func_id, std::vector args, - std::vector buf_decl, std::vector body, - std::vector> out) { + std::string JitImpl(std::string ext_func_id, const Array& args, + const std::vector& buf_decl, + const std::vector& body, + const std::vector& out) { // Create the signature. For example, it could be: // extern "C" void dnnl_0_(float* input0, float* input1, float* out, int M, int N) {} code_stream_ << "extern \"C\" void " << ext_func_id << "_("; + CHECK_EQ(out.size(), 1U) << "Internal error: only single output is support."; + for (const auto& arg : args) { - code_stream_ << "float* " << arg << ", "; + const auto& dtype_str = GetDtypeString(arg); + code_stream_ << dtype_str << "* " << arg->name_hint() << ", "; } - code_stream_ << "float* out) {\n"; + code_stream_ << out[0].dtype << "* out) {\n"; this->EnterScope(); // Function body @@ -232,24 +246,60 @@ class CodegenCBase { } // Copy output - CHECK_EQ(out.size(), 1U) << "Internal error: only single output is support."; - this->PrintIndents(); - code_stream_ << "std::memcpy(out, " << out[0].first << ", 4 * " << out[0].second << ");\n"; - - // Free buffers - for (size_t i = 0; i < buf_decl.size(); i++) { + if (out[0].need_copy) { this->PrintIndents(); - code_stream_ << "std::free(buf_" << i << ");\n"; + code_stream_ << "std::memcpy(out, " << out[0].name << ", 4 * " << out[0].size << ");\n"; + + // Free buffers + for (size_t i = 0; i < buf_decl.size(); i++) { + this->PrintIndents(); + code_stream_ << "std::free(buf_" << i << ");\n"; + } } this->ExitScope(); code_stream_ << "}\n"; // Create the wrapper to call the ext_func - this->GenerateBackendCFunc(ext_func_id, args.size() + 1 /* output */); + this->GenerateBackendCFunc(ext_func_id, args, out[0]); return code_stream_.str(); } + /*! + * \brief Returns dtype string + * + * \param var Var to get the dtype of + * + * \return The dtype string. 
+ */ + std::string GetDtypeString(const Var& var) { + auto ttype = var->checked_type().as(); + CHECK(ttype) << "Expect TensorTypeNode"; + return GetDtypeString(ttype); + } + + /*! + * \brief Returns dtype string + * + * \param ttype TensorTypeNode* to get the dtype of + * + * \return The dtype string. + */ + std::string GetDtypeString(const TensorTypeNode* ttype) { + std::string dtype; + if (runtime::TypeMatch(ttype->dtype, kDLFloat, 32)) { + dtype = "float"; + } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 32)) { + dtype = "int"; + } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 64)) { + dtype = "int64_t"; + } else { + LOG(FATAL) << "Unsupported dtype " << ttype->dtype; + } + + return dtype; + } + /*! \brief The external function source code stream. */ std::ostringstream code_stream_; diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 6206173dd998..73711749d9c4 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -45,9 +45,11 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { explicit CodegenDNNL(const std::string& id) { this->ext_func_id_ = id; } void VisitExpr_(const VarNode* node) final { - ext_func_args_.push_back(node->name_hint()); + ext_func_args_.push_back(GetRef(node)); out_.clear(); - out_.push_back({node->name_hint(), 0}); + Output output; + output.name = node->name_hint(); + out_.push_back(output); } void VisitExpr_(const TupleGetItemNode* op) final { @@ -90,14 +92,14 @@ class CodegenDNNL : public ExprVisitor, public CodegenCBase { decl_stream << ", "; } first = false; - decl_stream << out.first; + decl_stream << out.name; } } // Analyze the output buffer auto type_node = call->checked_type().as(); - CHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) - << "Only support single output tensor with float type"; + CHECK(type_node); + const auto& dtype = GetDtypeString(type_node); std::string out = "buf_" + std::to_string(buf_idx_++); auto out_shape = GetShape(call->checked_type()); int out_size = 1; @@ -118,7 +120,12 @@ // Update output buffer out_.clear(); - out_.push_back({out, out_size}); + Output output; + output.name = out; + output.dtype = dtype; + output.need_copy = true; + output.size = out_size; + out_.push_back(output); } std::string JIT(void) { @@ -213,13 +220,13 @@ */ int buf_idx_{0}; /*! \brief The arguments used by a wrapped function that calls DNNL kernels. */ - std::vector ext_func_args_; + Array ext_func_args_; /*! \brief Statements of the function that will be compiled using DNNL kernels. */ std::vector ext_func_body; /*! \brief The declaration of intermediate buffers. */ std::vector buf_decl_; /*! \brief The names of the outputs. */ - std::vector> out_; + std::vector out_; }; /*! 
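For quick reference, the dtype legalization that `GetDtypeString` introduces above amounts to the following table — a Python sketch of the same mapping, for orientation only; any dtype outside it is a fatal error in the codegen:

    # Relay tensor dtype -> C type emitted by the external codegen
    DTYPE_TO_C = {
        "float32": "float",
        "int32": "int",
        "int64": "int64_t",
    }
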
diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index 608bc2a77bb0..b086df07a835 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -161,6 +161,23 @@ def test_extern_gcc_single_op(): check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data) +def test_extern_gcc_single_op_int(): + x = relay.var('x', shape=(8, 8), dtype="int32") + y = relay.var('y', shape=(8, 8), dtype="int32") + + x0 = relay.var('x0', shape=(8, 8), dtype="int32") + y0 = relay.var('y0', shape=(8, 8), dtype="int32") + z = x0 + y0 + f = relay.Function([x0, y0], z) + f = set_external_func_attr(f, "ccompiler", "ccompiler_0") + call = relay.Call(f, [x, y]) + mod = tvm.IRModule.from_expr(call) + x_data = np.random.rand(8, 8).astype('int32') + y_data = np.random.rand(8, 8).astype('int32') + + check_result(mod, {"x": x_data, "y": y_data}, (8, 8), x_data + y_data) + + def test_extern_gcc(): x = relay.var('x', shape=(2, 2)) y = relay.var('y', shape=(2, 2)) @@ -242,5 +259,6 @@ def test_extern_dnnl(): if __name__ == "__main__": test_multi_node_subgraph() test_extern_gcc_single_op() + test_extern_gcc_single_op_int() test_extern_gcc() test_extern_dnnl() From 588523ddb6e938637d96745bcce145375307247f Mon Sep 17 00:00:00 2001 From: wpan11nv <60017475+wpan11nv@users.noreply.github.com> Date: Tue, 25 Feb 2020 03:32:21 -0800 Subject: [PATCH 36/73] [LLVM] Fix build breaks from StringRef changes (#4923) - llvm::StringRef to std::string conversion is explicit now. Signed-off-by: Wei Pan --- src/target/llvm/codegen_amdgpu.cc | 2 +- src/target/llvm/codegen_nvptx.cc | 2 +- src/target/llvm/llvm_module.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 9cf4d539c1ad..961ff418e2a5 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -236,7 +236,7 @@ runtime::Module BuildAMDGPU(Array funcs, std::string target) { llvm::SMDiagnostic err; std::unique_ptr mlib = llvm::parseIRFile(path, err, *ctx); if (mlib.get() == nullptr) { - std::string msg = err.getMessage(); + std::string msg(err.getMessage()); LOG(FATAL) << "Fail to load bitcode file " << path << "\n" << "line " << err.getLineNo() << ":" << msg; } diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index 555adc9d26ed..821232ded170 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -215,7 +215,7 @@ runtime::Module BuildNVPTX(Array funcs, std::string target) { llvm::SMDiagnostic err; std::unique_ptr mlib = llvm::parseIRFile(path, err, *ctx); if (mlib.get() == nullptr) { - std::string msg = err.getMessage(); + std::string msg(err.getMessage()); LOG(FATAL) << "Fail to load bitcode file " << path << "\n" << "line " << err.getLineNo() << ":" << msg; } diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 30755fcfc125..2e04920d866b 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -239,7 +239,7 @@ class LLVMModuleNode final : public runtime::ModuleNode { if (mtarget != nullptr) { llvm::MDString* pstr = llvm::dyn_cast(mtarget); CHECK(pstr != nullptr); - target_ = pstr->getString(); + target_ = pstr->getString().str(); } else { std::ostringstream os; os << "llvm -target " << module_->getTargetTriple(); From 545f6ea3fede7a99f0a1b2c6933875550214a46d Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Tue, 25 Feb 2020 13:14:58 -0800 
Subject: [PATCH 37/73] [Fix] remove unnecessary splitting in the cached chunk (#4935) * remove unnecessary splitting in the cached chunk * remove unnecessary splitting in the cached chunk --- topi/python/topi/x86/depthwise_conv2d.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 2aa5e748e5c7..70b30fea8c51 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -223,12 +223,12 @@ def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out s[C].parallel(parallel_axis) s[CC].compute_at(s[C], ow_chunk) + # the ow axis in the cached block CC is the ow_block in C _, ic_chunk, oh, ow, ic_block = s[CC].op.axis kh, kw = s[CC].op.reduce_axis - ow_chunk, ow_block = s[CC].split(ow, factor=tile_ow) - s[CC].reorder(ic_chunk, oh, kh, kw, ow_block, ic_block) + s[CC].reorder(ic_chunk, oh, kh, kw, ow, ic_block) s[CC].vectorize(ic_block) - s[CC].unroll(ow_block) + s[CC].unroll(ow) if C != O: out_ndim = len(s[O].op.axis) From b422f6a96c74095e43c42cee119e4b267922f266 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Tue, 25 Feb 2020 17:35:49 -0800 Subject: [PATCH 38/73] [WIP] Fixing an Infinite Loop case in UnmatchedChecker. (#4881) * save * save * remove * remove cerr --- src/relay/pass/match_exhaustion.cc | 34 +++++-------------- .../python/relay/test_pass_unmatched_cases.py | 25 ++++++++++++++ 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/src/relay/pass/match_exhaustion.cc b/src/relay/pass/match_exhaustion.cc index 885c47ef5845..14be6b751354 100644 --- a/src/relay/pass/match_exhaustion.cc +++ b/src/relay/pass/match_exhaustion.cc @@ -168,8 +168,10 @@ Array ExpandWildcards(const Pattern& clause_pat, const IRModule& mod) { if (auto clause_ctor = clause_pat.as()) { return ExpandWildcardsConstructor(GetRef(clause_ctor), cand, mod); + } else if (auto clause_tup = clause_pat.as()) { + return ExpandWildcardsTuple(GetRef(clause_tup), cand, mod); } else { - return ExpandWildcardsTuple(Downcast(clause_pat), cand, mod); + return {cand}; } } @@ -201,18 +203,9 @@ Array ExpandWildcardsConstructor(const PatternConstructor& clause_ctor, // for constructors, we will expand the wildcards in any field that is an ADT. Array> values_by_field; for (size_t i = 0; i < ctor_cand->constructor->inputs.size(); i++) { - bool subpattern = - clause_ctor->patterns[i].as() || - clause_ctor->patterns[i].as(); - // for non-ADT fields, we can only have a wildcard for the value. - if (!subpattern) { - values_by_field.push_back({PatternWildcardNode::make()}); - } else { - // otherwise, recursively expand. - values_by_field.push_back(ExpandWildcards(clause_ctor->patterns[i], - ctor_cand->patterns[i], - mod)); - } + values_by_field.push_back(ExpandWildcards(clause_ctor->patterns[i], + ctor_cand->patterns[i], + mod)); } // generate new candidates using a cartesian product. @@ -243,18 +236,9 @@ Array ExpandWildcardsTuple(const PatternTuple& clause_tuple, // for tuples, we will expand the wildcards in any field that is an ADT. 
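// Leaf patterns (wildcards and vars) now take ExpandWildcards' new fallback
// branch and come back unchanged as {cand}, so every field can be expanded
// uniformly; the removed non-ADT special casing below is no longer needed.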
Array> values_by_field; for (size_t i = 0; i < tuple_cand->patterns.size(); i++) { - bool subpattern = - clause_tuple->patterns[i].as() || - clause_tuple->patterns[i].as(); - // for non-ADT fields, we can only have a wildcard for the value - if (!subpattern) { - values_by_field.push_back({PatternWildcardNode::make()}); - } else { - // otherwise, recursively expand - values_by_field.push_back(ExpandWildcards(clause_tuple->patterns[i], - tuple_cand->patterns[i], - mod)); - } + values_by_field.push_back(ExpandWildcards(clause_tuple->patterns[i], + tuple_cand->patterns[i], + mod)); } // generate new candidates using a cartesian product diff --git a/tests/python/relay/test_pass_unmatched_cases.py b/tests/python/relay/test_pass_unmatched_cases.py index 615d4e092291..1ac99a69a249 100644 --- a/tests/python/relay/test_pass_unmatched_cases.py +++ b/tests/python/relay/test_pass_unmatched_cases.py @@ -19,6 +19,7 @@ from tvm import relay from tvm.relay.prelude import Prelude from tvm.relay.analysis import unmatched_cases +import pytest def test_empty_match_block(): # empty match block will not match anything, so it should return a wildcard pattern @@ -273,3 +274,27 @@ def test_tuple_match(): clause = relay.Clause(relay.PatternTuple([relay.PatternVar(a), relay.PatternVar(b)]), a + b) x = relay.Match(relay.Tuple([relay.const(1), relay.const(1)]), [clause]) assert len(unmatched_cases(x)) == 0 + + +def test_inf_loop_case(): + code = """ +v0.0.4 +type Arith[A] { + Zero, + Const(A), + Plus(Arith[A], Arith[A]) +} + +def @shallow_opt[A](%a: Arith[A]) -> Arith[A] { + match (%a) { + Plus(Zero, %r) => %r, + Plus(%l, Zero) => %l, + _ => %a + } +} +""" + relay.fromtext(code) + # fromtext parses the module and then checks it (which includes strictness checking). + +if __name__ == "__main__": + pytest.main([__file__]) From b0b1e7daf4f9a44505915e57682ce59d43873ddf Mon Sep 17 00:00:00 2001 From: yongfeng-nv <49211903+yongfeng-nv@users.noreply.github.com> Date: Tue, 25 Feb 2020 23:21:08 -0500 Subject: [PATCH 39/73] Tensor Expression Debug Display (TEDD) (#4651) * Initial TEDD for publishing. * 1. Fix lint issues. 2. Print intrin.body instead of intrin.name in Schedule Tree. 3. Add examples to top level APIs' comments. 4. Top level APIs don't print Dot string by default, unless outputdotstring is True. * Fix more lint issues. * Update top level API argument names and use raw strings to avoid Python lint warnings in the tests. * Disable TEDD verification, but keep TE construction. * Stop importing tedd to avoid failure. * Separate data extraction and visualization. 1. Add API tedd.dump_json(schedule) to dump a json string for the schedule data for visualization. 2. Update tests. 3. Add a tutorial. 4. Add range information to IterVars. * Update TEDD about InferBound failure. 1. TEDD doesn't call inferbound for DFG. 2. Update tutorial about the InferBound failure. * 1. Import IPython only if SVG is requested. This is required to fix a tutorial publishing failure. 2. Fix test about IPython availability check. 
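In code, the flow described in this commit message looks roughly like the following — a sketch only: `dump_json(schedule)` is the API named above, while the `viz_*` entry points and their `dot_file_path` argument are assumed from the added `tedd.py` and tutorial, which are only partially shown here:

    import tvm
    from tvm.contrib import tedd

    # s is an existing schedule, e.g. s = tvm.create_schedule(C.op)
    json_str = tedd.dump_json(s)  # schedule data as JSON, for visualization front ends
    tedd.viz_dataflow_graph(s, dot_file_path="/tmp/dfg.dot")          # assumed entry point
    tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree.dot")  # assumed entry point
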
--- python/tvm/contrib/tedd.py | 738 ++++++++++++++++++++++++++++++ tests/python/contrib/test_tedd.py | 137 ++++++ tutorials/language/tedd.py | 164 +++++++ 3 files changed, 1039 insertions(+) create mode 100644 python/tvm/contrib/tedd.py create mode 100644 tests/python/contrib/test_tedd.py create mode 100644 tutorials/language/tedd.py diff --git a/python/tvm/contrib/tedd.py b/python/tvm/contrib/tedd.py new file mode 100644 index 000000000000..f15b7d489eee --- /dev/null +++ b/python/tvm/contrib/tedd.py @@ -0,0 +1,738 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=import-outside-toplevel +"""Tensor Expression Debug Display (TEDD), visualizing Tensor Expression""" +import html +import json +import warnings +from graphviz import Digraph +from graphviz import Source +import tvm + +TVMDD_TABLE_BODY_WIDTH = 30 +# Must match enum IterVarType defined in include/tvm/expr.h +ITERVAR_TYPE_STRING_MAP = { + 0: ('kDataPar', '#FFFFFF'), + 1: ('kThreadIndex', '#2980B9'), + 2: ('kCommReduce', '#FAD7A0'), + 3: ('kOrdered', '#D35400'), + 4: ('kOpaque', '#ABB2B9'), + 5: ('kUnrolled', '#D2B4DE'), + 6: ('kVectorized', '#AED6F1'), + 7: ('kParallelized', '#F5B7B1'), + 8: ('kTensorized', '#A9DFBF'), +} + + +def dom_path_to_string(dom_path, prefix=""): + path_string = prefix + for index in dom_path: + path_string = path_string + '_' + str(index) + return path_string + + +def insert_dot_id(sch): + """Insert unique ID for each node in the DOM tree. + They are used as Dot node ID. + """ + for stage_idx, stage in enumerate(sch["stages"]): + dom_path = [stage_idx] + stage["id"] = dom_path_to_string(dom_path, stage["type"]) + for itervar_idx, itervar in enumerate(stage["all_itervars"]): + dom_path = [stage_idx, itervar_idx] + itervar["id"] = dom_path_to_string(dom_path, itervar["type"]) + for rel_idx, rel in enumerate(stage["relations"]): + dom_path = [stage_idx, rel_idx] + rel["id"] = dom_path_to_string(dom_path, rel["type"]) + for tensor_idx, tensor in enumerate(stage["output_tensors"]): + dom_path = [stage_idx, tensor_idx] + tensor["id"] = dom_path_to_string(dom_path, tensor["type"]) + return sch + + +class ObjectManager: + """A helper class tracking schedule objects, e.g. 
stage, IterVar, + relationship, and tensor, to their DOM path.""" + def __init__(self, sch): + self.dict = {} + for stage_idx, stage in enumerate(sch.stages): + self.dict[stage] = [stage_idx] + for itervar_idx, itervar in enumerate(stage.all_iter_vars): + self.dict[itervar] = [stage_idx, itervar_idx] + for rel_idx, rel in enumerate(stage.relations): + self.dict[rel] = [stage_idx, rel_idx] + for tensor_idx in range(stage.op.num_outputs): + self.dict[frozenset({stage.op.name, + tensor_idx})] = [stage_idx, tensor_idx] + + def get_dom_path(self, obj): + if obj is None: + return None + assert obj in self.dict, 'Node is not found.' + return self.dict[obj] + + +def get_or_create_dot_id(obj, prefix="", assert_on_missing=False): + """If obj's ID has been registered, return it. + If not, either assert or create a unique and legal ID, register and + return it, according to assert_on_missing. + ID must be a unique and legal Dot ID. + + Parameters + ---------- + obj : object + Serves as the key to the ID. + + prefix : string + Prefix to attach to the ID. Usually use obj's non-unique + name as prefix. + + assert_on_missing : bool + Assert or not if object doesn't have a registered ID. + """ + prefix = prefix.replace('.', '_') + if not hasattr(get_or_create_dot_id, "obj_id_dict"): + get_or_create_dot_id.obj_id_dict = {} + if obj not in get_or_create_dot_id.obj_id_dict: + if assert_on_missing: + assert False, 'dot_id ' + str(obj) + ' has not been registered.' + else: + get_or_create_dot_id.obj_id_dict[obj] = prefix + hex(id(obj)) + return get_or_create_dot_id.obj_id_dict[obj] + + +def get_port_id(is_input, index): + return 'I_' + str(index) if is_input else 'O_' + str(index) + + +def get_itervar_type_info(iter_type): + assert iter_type < len( + ITERVAR_TYPE_STRING_MAP), 'Unknown IterVar type: ' + str(iter_type) + return ITERVAR_TYPE_STRING_MAP[iter_type] + + +def get_itervar_label_color(itervar, iv_type): + type_info = get_itervar_type_info(iv_type) + return linebrk( + str(itervar["name"]) + '(' + type_info[0] + ')', + TVMDD_TABLE_BODY_WIDTH), type_info[1] + + +def linebrk(s, n): + """ Break input string s with
for every n characters.""" + result = '' + j = 0 + for i, c in enumerate(s): + if j == n and i != len(s) - 1: + result = result + '\n' + j = 0 + j = j + 1 + result = result + c + result = html.escape(str(result), quote=True) + result = result.replace('\n', '
') + return result + + +def create_graph(name="", rankdir='BT'): + graph = Digraph(name=name) + graph.graph_attr['rankdir'] = rankdir + return graph + + +def itervar_label(itervar, index, index_color, label): + return '' + str( + index + ) + '' + label + '
' + str( + itervar["properties"]["range"]) + '' + + +def stage_label(stage): + return stage['name'] + '
Scope: ' + stage['properties']['scope'] + + +def legend_label(): + label = '<' + for iter_type in ITERVAR_TYPE_STRING_MAP: + name, color = ITERVAR_TYPE_STRING_MAP[iter_type] + label += '' \ + + '' + label += '
' + name + '
>' + return label + + +def leaf_itervars(stage): + filtered = filter(lambda x: (x["index"] >= 0), stage["all_itervars"]) + return sorted(filtered, key=lambda x: x["index"]) + + +def legend_dot(g): + with g.subgraph(name='cluster_legend') as subgraph: + subgraph.attr(label='Legend') + label = legend_label() + subgraph.node('legend', label, shape='none', margin='0') + + +def extract_dom_for_viz(sch, need_range=True): + json_str = dump_json(sch, need_range) + s = json.loads(json_str) + s = insert_dot_id(s) + return s + + +def dump_graph(dot_string, + show_svg=True, + dot_file_path='', + output_dot_string=False): + """Output dot_string in various formats.""" + if dot_file_path: + try: + dot_file = open(dot_file_path, "w+") + dot_file.write(dot_string) + dot_file.close() + except IOError: + print('Cannot open file: ' + dot_file_path) + if show_svg: + from IPython.display import display + from IPython.display import SVG + src = Source(dot_string) + display(SVG(src.pipe(format='svg'))) + if output_dot_string: + return dot_string + return None + + +def dump_json(sch, need_range): + """Serialize data for visualization from a schedule in JSON format. + + Parameters + ---------- + sch : schedule + The schedule object to serialize + + Returns + ------- + json : string + Serialized JSON string + """ + def encode_itervar(itervar, stage, index, range_map): + """Extract and encode IterVar visualization data to a dictionary""" + ivrange = range_map[ + itervar] if range_map is not None and itervar in range_map else None + bind_thread = None + tensor_intrin = None + if itervar in stage.iter_var_attrs: + attr = stage.iter_var_attrs[itervar] + iv_type = attr.iter_type + # binding + bind_thread = str( + attr.bind_thread.var) if attr.bind_thread is not None else None + # tensorization + if attr.tensor_intrin is not None: + tensor_intrin = str(attr.tensor_intrin.body) + # remove the final \n + tensor_intrin = tensor_intrin[0:-1] if tensor_intrin[ + -1] == "\n" else tensor_intrin + else: + tensor_intrin = None + else: + iv_type = itervar.iter_type + itervar_dict = { + "type": "IterVar", + "index": index, + "name": str(itervar.var), + "itervar_type": iv_type, + "properties": { + "thread": bind_thread, + "intrin": tensor_intrin, + "range": str(ivrange) if ivrange is not None else 'range(N/A)', + } + } + return itervar_dict + + def encode_itervars(stage, range_map): + """Extract and encode IterVars visualization data from a stage to a dictionary""" + def get_leaf_itervar_index(itervar, leaf_iv): + for leaf_index, ivar in enumerate(leaf_iv): + if ivar == itervar: + return leaf_index + return -1 + + itervars = [] + for itervar in stage.all_iter_vars: + leaf_index = get_leaf_itervar_index(itervar, stage.leaf_iter_vars) + itervars.append( + encode_itervar(itervar, stage, leaf_index, range_map)) + return itervars + + def encode_itervar_relation(obj_manager, rel): + """Extract and encode IterVar Relationship visualization data to a dictionary""" + rel_type = type(rel) + if rel_type is tvm.schedule.Split: + node_type = 'Split_Relation' + rel_dict = { + "type": node_type, + "parent": obj_manager.get_dom_path(rel.parent), + "outer": obj_manager.get_dom_path(rel.outer), + "inner": obj_manager.get_dom_path(rel.inner), + } + elif rel_type is tvm.schedule.Fuse: + node_type = 'Fuse_Relation' + rel_dict = { + "type": node_type, + "fused": obj_manager.get_dom_path(rel.fused), + "outer": obj_manager.get_dom_path(rel.outer), + "inner": obj_manager.get_dom_path(rel.inner), + } + elif rel_type is tvm.schedule.Singleton: + node_type = 
'Singleton_Relation' + rel_dict = { + "type": node_type, + "iter": obj_manager.get_dom_path(rel.iter), + } + else: + return None + return rel_dict + + def encode_itervar_relations(obj_manager, stage): + relations = [] + for i in range(len(stage.relations)): + rel = encode_itervar_relation(obj_manager, stage.relations[i]) + if rel is not None: + relations.append(rel) + return relations + + def encode_tensor(obj_manager, tensor, stage): + """Extract and encode tensor visualization data to a dictionary""" + tensor_dict = { + "type": "Tensor", + "source": obj_manager.get_dom_path(stage), + "value_index": tensor.value_index, + "shape": str(tensor.op.output(tensor.value_index).shape), + "data_type": tensor.op.output(tensor.value_index).dtype, + } + return tensor_dict + + def encode_tensors(obj_manager, stage): + tensors = [] + for i in range(stage.op.num_outputs): + tensor = stage.op.output(i) + tensors.append(encode_tensor(obj_manager, tensor, stage)) + tensors.sort(key=lambda tensor: tensor["value_index"]) + return tensors + + def encode_stage(obj_manager, stage, range_map): + """Extract and encode stage visualization data to a dictionary""" + stage_dict = { + "type": + "Stage", + "name": + stage.op.name, + "attaching_to": + obj_manager.get_dom_path(stage.attach_ivar), + "compute": + str(stage.op.body) if hasattr(stage.op, 'body') else None, + "properties": { + "scope": stage.scope, + }, + "all_itervars": + encode_itervars(stage, range_map), + "relations": + encode_itervar_relations(obj_manager, stage), + "input_tensors": [ + obj_manager.get_dom_path( + frozenset({tensor.op.name, tensor.value_index})) + for tensor in stage.op.input_tensors + ], + "output_tensors": + encode_tensors(obj_manager, stage), + } + return stage_dict + + def encode_schedule(sch, need_range): + """Extract and encode data from a schedule for visualization to a nested dictionary. + It is useful for JSON to serialize schedule. + + Parameters + ---------- + sch : schedule + The schedule object to extract + + Returns + ------- + dict : dictionary + A nested dictionary + """ + assert isinstance(sch, tvm.schedule.Schedule + ), 'Input is not a tvm.schedule.Schedule object.' + range_map = None + if need_range: + try: + range_map = tvm.schedule.InferBound(sch) + except tvm._ffi.base.TVMError as expt: + warnings.warn( + 'Ranges are not available, because InferBound fails with the following error:\n' + + str(expt)) + + obj_manager = ObjectManager(sch) + stages = [] + for stage in sch.stages: + stages.append(encode_stage(obj_manager, stage, range_map)) + return { + "type": "Schedule", + "stages": stages, + } + + return json.dumps(sch, default=lambda s: encode_schedule(s, need_range)) + + +def viz_schedule_tree(sch, + show_svg=False, + dot_file_path='', + output_dot_string=False): + """Top level API to render schedule tree + + Parameters + ---------- + sch : schedule + The schedule object to visualize + + show_svg : bool + Display graph as SVG, useful for Jupyter notebooks. + + dot_file_path : string + Dot file to save the graph. + + output_dot_string : bool + Return dot file content or an empty string. + + Returns + ------- + dot_string : string + Dot file content or an empty string according to output_dot_string + + Examples + -------- + The following code writes a schedule tree to a dot file. + + .. code-block:: python + tedd.viz_schedule_tree(s, dot_file_path = '/tmp/example.dot') + + Use the following code to render a SVG graph in a Jupyter notebook. + + .. 
code-block:: python + tedd.viz_schedule_tree(s, show_svg = True) + """ + def create_schedule_tree_graph(name=""): + return create_graph(name=name, rankdir='BT') + + def root_dot(g): + g.node('ROOT', 'ROOT', shape='oval', margin='0') + + def stage_node_dot(g, stage): + node_label = stage_node_label(stage) + g.node(stage['id'], node_label, shape='none', margin='0') + + def stage_node_label(stage): + """Return a html format label for the given stage.""" + label = '<' + + for leafiv in leaf_itervars(stage): + iv_type = leafiv["itervar_type"] + var_attr_label = '' + if "thread" in leafiv["properties"] and \ + leafiv["properties"]["thread"] is not None: + var_attr_label = var_attr_label + "
(" + str( + leafiv["properties"]["thread"]) + ")" + if "intrin" in leafiv["properties"] and \ + leafiv["properties"]["intrin"] is not None: + var_attr_label = var_attr_label + "
" + \ + linebrk("(tensor_intrin:" + str( + leafiv["properties"]["intrin"]) + ")", TVMDD_TABLE_BODY_WIDTH) + var_label, color = get_itervar_label_color(leafiv, iv_type) + label += itervar_label(leafiv, leafiv["index"], color, + var_label + var_attr_label) + if stage["compute"] is not None: + label += '' + label += '
' + stage_label(stage) + '
' + linebrk(str( + stage["compute"]), TVMDD_TABLE_BODY_WIDTH) + '
>' + return label + + def compute_at_dot(g, stage): + """If the given stage attaches to another stage, create an edge from the + stage to its attach point; otherwise, create an edge to the ROOT. + """ + src = stage["id"] + dst = dom_path_to_string( + [stage["attaching_to"][0]], "Stage") + ":" + dom_path_to_string( + stage["attaching_to"], + "IterVar") if stage["attaching_to"] is not None else "ROOT" + g.edge(src, dst) + + graph = create_schedule_tree_graph("Schedule Tree") + s = extract_dom_for_viz(sch) + legend_dot(graph) + for stage in s['stages']: + stage_node_dot(graph, stage) + for stage in s['stages']: + compute_at_dot(graph, stage) + root_dot(graph) + return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string) + + +def viz_itervar_relationship_graph(sch, + show_svg=False, + dot_file_path='', + output_dot_string=False): + """Top level API to render IterVar relationship graph + + Parameters + ---------- + sch : schedule + The schedule object to visualize + + show_svg : bool + Display graph as SVG, useful for Jupyter notebooks. + + dot_file_path : string + Dot file to save the graph. + + output_dot_string : bool + Return dot file content or an empty string. + + Examples + -------- + The following code writes an itervar relationship graph to a dot file. + + .. code-block:: python + tedd.viz_itervar_relationship_graph(s, dot_file_path = '/tmp/example.dot') + + Use the following code to render a SVG graph in a Jupyter notebook. + + .. code-block:: python + tedd.viz_itervar_relationship_graph(s, show_svg = True) + """ + def create_itervar_relation_graph(name=""): + return create_graph(name=name, rankdir='TB') + + def itervar_node_dot(g, itervar, iv_type, index): + label = itervar_node_label(itervar, iv_type, index) + g.node(itervar["id"], label, shape='none', margin='0') + + def itervar_node_label(itervar, iv_type, index): + label = '<' + itervar_label( + itervar, index, + get_itervar_label_color(itervar, iv_type)[1], + get_itervar_label_color(itervar, iv_type)[0]) + '
>' + return label + + def itervar_relation_node_dot(g, node_id, node_label, input_ports, + output_ports): + label = itervar_relation_node_label(node_label, input_ports, + output_ports) + g.node(node_id, label, shape='none', margin='0') + + def itervar_relation_node_label(node_label, input_ports, output_ports): + """Return a html format label for an itervar relationship node + including node_label and input/output ports. + """ + label = '<' + '' + max_port_num = max(len(input_ports), len(output_ports)) + for i in range(max_port_num): + if i < len(input_ports): + input_port = input_ports[i] + label += '' + else: + label += '' + label += '' + label += '' + label += '' + for i in range(max_port_num): + if i < len(output_ports): + output_port = output_ports[i] + label += '' + else: + label += '' + label += '' + label += '
' \ + + input_port + '
' + node_label + '
' \ + + output_port + '
>' + return label + + def itervar_relation_dot(g, node, node_id): + """Create an itervar relationship node.""" + node_type = node["type"] + if node_type == "Split_Relation": + node_type = 'Split' + itervar_relation_node_dot(g, node_id, node_type, ['Input'], + ['Outer', 'Inner']) + parent = dom_path_to_string(node["parent"], "IterVar") + outer = dom_path_to_string(node["outer"], "IterVar") + inner = dom_path_to_string(node["inner"], "IterVar") + g.edge(parent + ':itervar', node_id + ':Input') + g.edge(node_id + ':Outer', outer + ':itervar') + g.edge(node_id + ':Inner', inner + ':itervar') + elif node_type == "Fuse_Relation": + node_type = 'Fuse' + itervar_relation_node_dot(g, node_id, node_type, + ['Outer', 'Inner'], ['Fused']) + fused = dom_path_to_string(node["fused"], "IterVar") + outer = dom_path_to_string(node["outer"], "IterVar") + inner = dom_path_to_string(node["inner"], "IterVar") + g.edge(outer + ':itervar', node_id + ':Outer') + g.edge(inner + ':itervar', node_id + ':Inner') + g.edge(node_id + ':Fused', fused + ':itervar') + elif node_type == "Singleton_Relation": + node_type = 'Singleton' + itervar_relation_node_dot(g, node_id, node_type, [], ['Iter']) + itervar = dom_path_to_string(node["inner"], "IterVar") + g.edge(node_id + ':Iter', itervar + ':itervar') + else: + assert False, 'Unknown IterVarRelationNode: ' + node_type + + def stage_node_dot(g, stage): + """Create a stage node.""" + with g.subgraph(name='cluster_' + stage["id"]) as subgraph: + subgraph.attr(label=stage["name"]) + if stage["all_itervars"]: + for itervar in stage["all_itervars"]: + iv_type = itervar["itervar_type"] + itervar_node_dot(subgraph, itervar, iv_type, + itervar["index"]) + for rel in stage["relations"]: + node_id = rel["id"] + itervar_relation_dot(subgraph, rel, node_id) + else: + subgraph.node(stage["name"] + '_placeholder', style='invis') + + graph = create_itervar_relation_graph("IterVar Relationship Graph") + s = extract_dom_for_viz(sch) + legend_dot(graph) + for stage in s['stages']: + stage_node_dot(graph, stage) + + return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string) + + +def viz_dataflow_graph(sch, + show_svg=False, + dot_file_path='', + output_dot_string=False): + """Top level API to render dataflow graph + + Parameters + ---------- + sch : schedule + The schedule object to visualize + + show_svg : bool + Display graph as SVG, useful for Jupyter notebooks. + + dot_file_path : string + Dot file to save the graph. + + output_dot_string : bool + Return dot file content or an empty string. + + Examples + -------- + The following code writes a dataflow graph to a dot file. + + .. code-block:: python + tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/example.dot') + + Use the following code to render a SVG graph in a Jupyter notebook. + + .. 
code-block:: python + tedd.viz_dataflow_graph(s, show_svg = True) """ + def create_dataflow_graph(name=""): + return create_graph(name=name, rankdir='LR') + + def tensor_node_dot(g, tensor): + """Create a tensor node.""" + label = tensor_node_label(tensor) + g.node(tensor["id"], label, shape='oval', margin='0') + + def tensor_node_label(tensor): + """Return a html format label for the given tensor.""" + label = str(tensor["shape"]) + '\n' + str(tensor["data_type"]) + return label + + def stage_node_dot(g, stage): + """Create a stage node.""" + label = stage_node_label(stage) + g.node(stage["id"], label, shape='none', margin='0') + + def stage_node_label(stage): + """Return a html format label for the given stage.""" + rows = max( + 1, max(len(stage["output_tensors"]), len(stage["input_tensors"]))) + label = '<' + for i in range(rows): + label += '' + if i < len(stage["input_tensors"]): + port_id = get_port_id(True, i) + label += '' + else: + label += '' + if i == 0: + label += '' + if i < len(stage["output_tensors"]): + port_id = get_port_id(False, i) + label += '' + else: + label += '' + label += '' + label += '
' + str( + i) + '' + stage_label(stage) + '' + str( + i) + '
>' + return label + + def dfg_dot(g, sch): + """Create edges among stages.""" + stages = sch['stages'] + for stage in stages: + for i in range(len(stage["input_tensors"])): + src = dom_path_to_string(stage["input_tensors"][i], "Tensor") + dst = stage["id"] + ':' + get_port_id(True, i) + g.edge(src, dst) + for i in range(len(stage["output_tensors"])): + src = stage["id"] + ':' + get_port_id(False, i) + dst = stage["output_tensors"][i]["id"] + g.edge(src, dst) + + graph = create_dataflow_graph("Dataflow Graph") + s = extract_dom_for_viz(sch, need_range=False) + for stage in s['stages']: + stage_node_dot(graph, stage) + for tensor in stage["output_tensors"]: + tensor_node_dot(graph, tensor) + + dfg_dot(graph, s) + + return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string) diff --git a/tests/python/contrib/test_tedd.py b/tests/python/contrib/test_tedd.py new file mode 100644 index 000000000000..d4d3ce464d44 --- /dev/null +++ b/tests/python/contrib/test_tedd.py @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
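As a side note on the `output_dot_string=True` mode exercised by the tests below, the returned dot source can also be rendered outside of Jupyter. A minimal sketch, assuming graphviz is installed and `s` is an existing schedule:

from graphviz import Source
from tvm.contrib import tedd

dot_string = tedd.viz_schedule_tree(s, output_dot_string=True)
# render() writes /tmp/schedule_tree.svg alongside the saved dot source
Source(dot_string).render('/tmp/schedule_tree', format='svg')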
+import tvm +import numpy as np +import re +import topi + + +def findany(pattern, str): + matches = re.findall(pattern, str) + assert (len(matches) > + 0), 'Pattern not found.\nPattern: ' + pattern + '\nString: ' + str + + +def checkdepdency(): + import pkg_resources + return not {'graphviz', 'ipython'} - {pkg.key for pkg in pkg_resources.working_set} + +def test_dfg(): + A = tvm.placeholder((1024, 4096), dtype='float32', name='A') + B = topi.nn.softmax(A) + # confirm lower works + s = tvm.create_schedule([B.op]) + + def verify(): + from tvm.contrib import tedd + str = tedd.viz_dataflow_graph(s, False, '', True) + # Check all edges are available + findany(r"digraph \"Dataflow Graph\"", str) + findany(r"Stage_0:O_0 -> Tensor_0_0", str) + findany(r"Tensor_0_0 -> Stage_1:I_0", str) + findany(r"Stage_1:O_0 -> Tensor_1_0", str) + findany(r"Tensor_0_0 -> Stage_2:I_0", str) + findany(r"Tensor_1_0 -> Stage_2:I_1", str) + findany(r"Stage_2:O_0 -> Tensor_2_0", str) + findany(r"Tensor_2_0 -> Stage_3:I_0", str) + findany(r"Stage_3:O_0 -> Tensor_3_0", str) + findany(r"Tensor_2_0 -> Stage_4:I_0", str) + findany(r"Tensor_3_0 -> Stage_4:I_1", str) + findany(r"Stage_4:O_0 -> Tensor_4_0", str) + if checkdepdency(): + verify() + + +def test_itervar_relationship_graph(): + n = tvm.var("n") + m = tvm.var("m") + A = tvm.placeholder((n, m), name='A') + k = tvm.reduce_axis((0, m), "k") + B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k], axis=k), name="B") + + s = tvm.create_schedule(B.op) + s[B].split(B.op.reduce_axis[0], factor=16) + + def verify(): + from tvm.contrib import tedd + str = tedd.viz_itervar_relationship_graph(s, False, '', True) + findany(r"digraph \"IterVar Relationship Graph\"", str) + findany(r"subgraph cluster_legend", str) + # Check subgraphs for stages + findany(r"subgraph cluster_Stage_0", str) + findany(r"subgraph cluster_Stage_1", str) + # Check itervars and their types + findany(r"i\(kDataPar\)\
range\(min=0, ext=n\)", str) + findany(r"k\(kCommReduce\)\
range\(min=0, ext=m\)", str) + # Check the split node + findany(r"Split_Relation_1_0 +.+\>Split", str) + # Check all edges to/from the split node + findany(r"IterVar_1_1:itervar -> Split_Relation_1_0:Input", str) + findany(r"Split_Relation_1_0:Outer -> IterVar_1_2:itervar", str) + findany(r"Split_Relation_1_0:Inner -> IterVar_1_3:itervar", str) + + if checkdepdency(): + verify() + + +def test_schedule_tree(): + block_x = tvm.thread_axis('blockIdx.x') + thread_x = tvm.thread_axis('threadIdx.x') + n = tvm.var("n") + m = tvm.var("m") + l = tvm.var("l") + A = tvm.placeholder((n, m, l), name='A') + B = tvm.compute((n, m, l), lambda bi, bj, bk: A[bi, bj, bk] + 1, name='B') + r = tvm.reduce_axis((0, m), "r") + C = tvm.compute((n, m,), + lambda ci, cj: tvm.sum(B[ci, cj, r], axis=r), + name="C") + s = tvm.create_schedule(C.op) + s.cache_read(A, 'shared', [B]) + s[B].vectorize(B.op.axis[-1]) + s[C].reorder(C.op.reduce_axis[0], C.op.axis[0]) + _, ki = s[C].split(C.op.reduce_axis[0], factor=16) + Cr = s.rfactor(C, ki) + s[Cr].compute_at(s[C], s[C].op.axis[-1]) + s[C].bind(s[C].op.axis[0], block_x) + s[C].bind(s[C].op.axis[1], thread_x) + + def verify(): + from tvm.contrib import tedd + str = tedd.viz_schedule_tree(s, False, '', True) + findany(r"digraph \"Schedule Tree\"", str) + findany(r"subgraph cluster_legend", str) + # Check the A_shared stage, including memory scope, itervars, + # and compute + findany(r"Stage_1.*A\.shared
Scope: shared.+>0.+>" \ + r"ax0\(kDataPar\).+>1.+ax1\(kDataPar\).+>2.+>ax2\(kDataPar\).+>" \ + r"\[A\(ax0, ax1, ax2\)\]", str) + # Check itervars of types different from KDataPar + findany(r"bk\(kVectorized\)", str) + findany(r"r.outer\(kCommReduce\)", str) + findany(r"label=ROOT", str) + # Check the compute_at edge + findany(r"Stage_1", str) + + if checkdepdency(): + verify() + + +if __name__ == "__main__": + test_dfg() + test_itervar_relationship_graph() + test_schedule_tree() \ No newline at end of file diff --git a/tutorials/language/tedd.py b/tutorials/language/tedd.py new file mode 100644 index 000000000000..aa3fa152a519 --- /dev/null +++ b/tutorials/language/tedd.py @@ -0,0 +1,164 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Use Tensor Expression Debug Display (TEDD) for Visualization +============================================================ +**Author**: `Yongfeng Gu `_ + +This is an introduction about using TEDD to visualize tensor expressions. + +Tensor Expressions are scheduled with primitives. Although individual +primitives are usually easy to understand, they become complicated quickly +when you put them together. We have introduced an operational model of +schedule primitives in Tensor Expression in this document +(https://docs.google.com/document/d/1nmz00_n4Ju-SpYN0QFl3abTHTlR_P0dRyo5zsWC0Q1k/edit?usp=sharing) +to make it easier to understand + +* the interactions between different schedule primitives, +* the impact of the schedule primitives on the final code generation. + +The operational model is based on a Dataflow Graph, a Schedule Tree and an +IterVar Relationship Graph. Schedule primitives perform operations on these +graphs. + +TEDD renders these three graphs from a given schedule. This tutorial demonstrates +how to use TEDD and how to interpret the rendered graphs. + +""" +from __future__ import absolute_import, print_function + +import tvm +import topi +from tvm.contrib import tedd + +###################################################################### +# Define and Schedule Convolution with Bias and ReLU +# -------------------------------------------------- +# Let's build an example Tensor Expression for a convolution followed by Bias and ReLU. +# We first connect conv2d, add, and relu TOPIs. Then, we create a TOPI generic schedule. 
+# + +batch = 1 +in_channel = 256 +in_size = 32 +num_filter = 256 +kernel = 3 +stride = 1 +padding = "SAME" +dilation = 1 +A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A') +W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') +B = tvm.placeholder((1, num_filter, 1), name='bias') +with tvm.target.create("cuda"): + t_conv = topi.nn.conv2d(A, W, stride, padding, dilation, layout='HWCN') + t_bias = topi.add(t_conv, B) + t_relu = topi.nn.relu(t_bias) + s = topi.generic.schedule_conv2d_hwcn([t_relu]) + +###################################################################### +# Render Graphs with TEDD +# ----------------------- +# We render graphs to see the computation +# and how it is scheduled. +# If you run the tutorial in a Jupyter notebook, you can use the following commented lines +# to render SVG figures directly in the notebook. +# + +tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/dfg.dot') +#tedd.viz_dataflow_graph(s, show_svg = True) + +###################################################################### +# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_dfg.png +# :align: center +# :scale: 100% +# +# The first one is a dataflow graph. Every node represents a stage with name and memory +# scope shown in the middle and inputs/outputs information on the sides. +# Edges show nodes' dependencies. +# + +tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree.dot') +#tedd.viz_schedule_tree(s, show_svg = True) + +###################################################################### +# We just rendered the schedule tree graph. You may notice a warning about ranges not +# being available. +# The message also suggests calling normalize() to infer range information. We will +# skip inspecting the first schedule tree and encourage you to compare the graphs before +# and after normalize() for its impact. +# + +s = s.normalize() +tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree2.dot') +#tedd.viz_schedule_tree(s, show_svg = True) + +###################################################################### +# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_st.png +# :align: center +# :scale: 100% +# +# Now, let us take a close look at the second schedule tree. Every block under ROOT +# represents a +# stage. The stage name shows in the top row and the compute in the bottom row. +# The middle rows are for IterVars, the higher the outer, the lower the inner. +# An IterVar row contains its index, name, type, and other optional information. +# Let's use the W.shared stage as an example. The top row tells +# its name, "W.shared", and memory scope, "Shared". Its compute is +# :code:`W(ax0, ax1, ax2, ax3)`. +# Its outermost loop IterVar is ax0.ax1.fused.ax2.fused.ax3.fused.outer, +# indexed with 0, of kDataPar, bound to threadIdx.y, and with range(min=0, ext=8). +# You can also tell the +# IterVar type from the index box color, shown in the legend. +# +# If a stage doesn't compute_at any other stage, it has an edge directly to the +# ROOT node. Otherwise, it has an edge pointing to the IterVar it attaches to, +# such as W.shared attaching to rx.outer in the middle compute stage. +# + +###################################################################### +# .. note:: +# +# By definition, IterVars are internal nodes and computes are leaf nodes in +# a schedule tree. The edges among IterVars and compute within one stage are +# omitted, making every stage a block, for better readability.
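To see the attachment edge described above in isolation, here is a minimal sketch (not part of the tutorial) with two stages, where compute_at moves B under C's outer loop so that B's block points at that IterVar instead of ROOT:

import tvm
from tvm.contrib import tedd

A = tvm.placeholder((64,), name='A')
B = tvm.compute((64,), lambda i: A[i] * 2, name='B')
C = tvm.compute((64,), lambda i: B[i] + 1, name='C')
s = tvm.create_schedule(C.op)
xo, xi = s[C].split(C.op.axis[0], factor=8)
s[B].compute_at(s[C], xo)  # B now attaches to C's xo IterVar, not ROOT
tedd.viz_schedule_tree(s.normalize(), dot_file_path='/tmp/attach.dot')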
+# + +tedd.viz_itervar_relationship_graph(s, dot_file_path = '/tmp/itervar.dot') +#tedd.viz_itervar_relationship_graph(s, show_svg = True) + +###################################################################### +# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_itervar_rel.png +# :align: center +# :scale: 100% +# +# The last one is an IterVar Relationship Graph. Every subgraph represents a +# stage and contains IterVar nodes and transformation nodes. For example, +# W.shared has three split nodes and three fuse nodes. The rest are IterVar +# nodes of the same format as the IterVar rows in Schedule Trees. Root +# IterVars are those not driven by any transformation node, such as ax0; leaf +# IterVars don't drive any transformation node and have non-negative indices, +# such as ax0.ax1.fused.ax2.fused.ax3.fused.outer with index of 0. +# + + +###################################################################### +# Summary +# ------- +# This tutorial demonstrates the usage of TEDD. We use an example built +# with TOPI to show the schedules under the hood. You can also use +# it before and after any schedule primitive to inspect its effect. +# \ No newline at end of file From b598c545b416d0417bfd94d3f8077ab9265a9164 Mon Sep 17 00:00:00 2001 From: Neo Chien Date: Wed, 26 Feb 2020 12:23:27 +0800 Subject: [PATCH 40/73] [DOCS] Fix Sphinx Warning: the target found for cross-reference (#4925) * [DOCS] Fix Sphinx Warnings: the target found for cross-reference warnings * Fix the warning: undefined label --- docs/api/python/autotvm.rst | 1 + docs/api/python/relay/build_module.rst | 2 ++ docs/api/python/relay/index.rst | 1 + docs/api/python/relay/testing.rst | 21 ++++++++++++ docs/api/python/tvm.rst | 7 +++- docs/conf.py | 9 +++--- docs/dev/relay_add_pass.rst | 2 +- docs/dev/relay_bring_your_own_codegen.rst | 4 +-- docs/dev/relay_pass_infra.rst | 2 ++ docs/dev/runtime.rst | 32 +++++++++---------- tutorials/autotvm/tune_relay_arm.py | 2 ++ tutorials/autotvm/tune_relay_x86.py | 2 ++ tutorials/language/intrin_math.py | 11 ++++++- tutorials/relay_quick_start.py | 2 +- vta/tutorials/optimize/matrix_multiply_opt.py | 2 +- 15 files changed, 73 insertions(+), 27 deletions(-) create mode 100644 docs/api/python/relay/testing.rst diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst index 5e8778502457..9357d1b6be08 100644 --- a/docs/api/python/autotvm.rst +++ b/docs/api/python/autotvm.rst @@ -18,6 +18,7 @@ tvm.autotvm ----------- .. automodule:: tvm.autotvm +.. automodule:: tvm.autotvm.apply_history_best tvm.autotvm.measure ~~~~~~~~~~~~~~~~~~~ diff --git a/docs/api/python/relay/build_module.rst b/docs/api/python/relay/build_module.rst index 26164bf1ade9..f470b9aff80f 100644 --- a/docs/api/python/relay/build_module.rst +++ b/docs/api/python/relay/build_module.rst @@ -18,6 +18,8 @@ tvm.relay.build_module ---------------------- +.. automodule:: tvm.relay.build + .. automodule:: tvm.relay.build_module .. autofunction:: tvm.relay.build_module.build diff --git a/docs/api/python/relay/index.rst b/docs/api/python/relay/index.rst index b286386b1230..03c8a37c9c5a 100644 --- a/docs/api/python/relay/index.rst +++ b/docs/api/python/relay/index.rst @@ -39,3 +39,4 @@ compiler stack. op scope_builder vision + testing diff --git a/docs/api/python/relay/testing.rst b/docs/api/python/relay/testing.rst new file mode 100644 index 000000000000..5af5ba72cb7b --- /dev/null +++ b/docs/api/python/relay/testing.rst @@ -0,0 +1,21 @@ +.. 
Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +tvm.relay.testing +----------------- + +.. autoclass:: tvm.relay.testing.resnet diff --git a/docs/api/python/tvm.rst b/docs/api/python/tvm.rst index 07c2dbc44765..56f36130b4b4 100644 --- a/docs/api/python/tvm.rst +++ b/docs/api/python/tvm.rst @@ -44,7 +44,9 @@ The user facing API for computation declaration. tvm.min tvm.max tvm.tag_scope - + tvm.exp + tvm.intrin + tvm.call_pure_extern .. autofunction:: tvm.var .. autofunction:: tvm.size_var @@ -69,3 +71,6 @@ The user facing API for computation declaration. .. autofunction:: tvm.min .. autofunction:: tvm.max .. autofunction:: tvm.tag_scope +.. autofunction:: tvm.exp +.. autofunction:: tvm.intrin +.. autofunction:: tvm.call_pure_extern diff --git a/docs/conf.py b/docs/conf.py index 3ca622d6ff18..05f4cfc970d1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -238,10 +238,11 @@ def setup(app): sphinx_gallery_conf = { 'backreferences_dir': 'gen_modules/backreferences', 'doc_module': ('tvm', 'numpy'), -'reference_url': { - 'tvm': None, - 'matplotlib': 'https://matplotlib.org/', - 'numpy': 'https://docs.scipy.org/doc/numpy/'}, + 'reference_url': { + 'tvm': None, + 'matplotlib': 'https://matplotlib.org/', + 'numpy': 'https://docs.scipy.org/doc/numpy/' + }, 'examples_dirs': examples_dirs, 'gallery_dirs': gallery_dirs, 'subsection_order': subsection_order, diff --git a/docs/dev/relay_add_pass.rst b/docs/dev/relay_add_pass.rst index e842664eaad0..8a6f8be0aea8 100644 --- a/docs/dev/relay_add_pass.rst +++ b/docs/dev/relay_add_pass.rst @@ -401,6 +401,6 @@ in `src/relay/pass/`_. .. _include/tvm/relay/transform.h: https://github.com/apache/incubator-tvm/blob/master/include/tvm/relay/transform.h -.. _src/relay/pass: https://github.com/apache/incubator-tvm/tree/master/src/relay/pass +.. _src/relay/pass/: https://github.com/apache/incubator-tvm/tree/master/src/relay/pass .. _src/relay/pass/fold_constant.cc: https://github.com/apache/incubator-tvm/blob/master/src/relay/pass/fold_constant.cc diff --git a/docs/dev/relay_bring_your_own_codegen.rst b/docs/dev/relay_bring_your_own_codegen.rst index b7d5fa9f5fd6..0cced36c95c1 100644 --- a/docs/dev/relay_bring_your_own_codegen.rst +++ b/docs/dev/relay_bring_your_own_codegen.rst @@ -137,7 +137,7 @@ Here we highlight the notes marked in the above code: * **Note 3** is a TVM runtime compatible wrapper function. It accepts a list of input tensors and one output tensor (the last argument), casts them to the right data type, and invokes the subgraph function described in Note 2. In addition, ``TVM_DLL_EXPORT_TYPED_FUNC`` is a TVM macro that generates another function ``gcc_0`` with unified the function arguments by packing all tensors to ``TVMArgs``. 
As a result, the TVM runtime can directly invoke ``gcc_0`` to execute the subgraph without additional efforts. With the above code generated, TVM is able to compile it along with the rest parts of the graph and export a single library for deployment. -In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `here`_. Feel free to check this file for a complete implementation. +In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `/src/relay/backend/contrib/codegen_c/ `_. Feel free to check this file for a complete implementation. Specifically, we are going to implement two classes in this file and here is their relationship: @@ -625,7 +625,7 @@ The next step is to implement a customized runtime to make use of the output of Implement a Customized Runtime ============================== -In this section, we will implement a customized TVM runtime step-by-step and register it to TVM runtime modules. The customized runtime should be located at ``src/runtime/contrib//``. In our example, we name our runtime "example_ext_runtime" and put it under `here`_. Feel free to check this file for a complete implementation. +In this section, we will implement a customized TVM runtime step-by-step and register it to TVM runtime modules. The customized runtime should be located at ``src/runtime/contrib//``. In our example, we name our runtime "example_ext_runtime" and put it under `/src/runtime/contrib/example_ext_runtime/ `_. Feel free to check this file for a complete implementation. Again, we first define a customized runtime class as follows. The class has to be derived from TVM ``ModuleNode`` in order to be compatible with other TVM runtime modules. diff --git a/docs/dev/relay_pass_infra.rst b/docs/dev/relay_pass_infra.rst index 8bd5a05534a1..4630c9839f41 100644 --- a/docs/dev/relay_pass_infra.rst +++ b/docs/dev/relay_pass_infra.rst @@ -664,3 +664,5 @@ For more pass infra related examples in Python and C++, please refer to .. _tests/python/relay/test_pass_manager.py: https://github.com/apache/incubator-tvm/blob/master/tests/python/relay/test_pass_manager.py .. _tests/cpp/relay_transform_sequential.cc: https://github.com/apache/incubator-tvm/blob/master/tests/cpp/relay_transform_sequential.cc + +.. _include/tvm/relay/transform.h: https://github.com/apache/incubator-tvm/blob/master/include/tvm/relay/transform.h \ No newline at end of file diff --git a/docs/dev/runtime.rst b/docs/dev/runtime.rst index 5ed5f86ed44b..9e542bf01b44 100644 --- a/docs/dev/runtime.rst +++ b/docs/dev/runtime.rst @@ -1,19 +1,19 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at -.. -.. http://www.apache.org/licenses/LICENSE-2.0 -.. -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. .. _tvm-runtime-system: diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index 9aba93798617..ea24b1685788 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """ +.. _tune_relay_arm: + Auto-tuning a convolutional network for ARM CPU =============================================== **Author**: `Lianmin Zheng `_, `Zhao Wu `_, `Eddie Yan `_ diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index 87d07f9870b2..f44773e544a7 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """ +.. _tune_relay_x86: + Auto-tuning a convolutional network for x86 CPU =============================================== **Author**: `Yao Wang `_, `Eddie Yan `_ diff --git a/tutorials/language/intrin_math.py b/tutorials/language/intrin_math.py index c1af984a09a1..59bf79d13092 100644 --- a/tutorials/language/intrin_math.py +++ b/tutorials/language/intrin_math.py @@ -21,7 +21,7 @@ While TVM supports basic arithmetic operations. In many cases usually we will need more complicated builtin functions. -For example :code:`exp` to take the exponetial of the function. +For example :code:`exp` to take the exponential of the function. These functions are target system dependent and may have different names of different target platforms. In this tutorial, we will learn @@ -94,6 +94,8 @@ # TVM also allows user to customize the rules during runtime. # The following example customizes CUDA lowering rule for :code:`exp`. # + + def my_cuda_math_rule(op): """Customized CUDA intrinsic lowering rule""" assert isinstance(op, tvm.tir.Call) @@ -106,6 +108,8 @@ def my_cuda_math_rule(op): else: # cannot do translation, return self. return op + + tvm.target.register_intrin_rule("cuda", "exp", my_cuda_math_rule, override=True) ###################################################################### # Register the rule to TVM with override option to override existing rule. @@ -123,10 +127,13 @@ def my_cuda_math_rule(op): # User can easily add new intrinsic by using the intrinsic rule system. # The following example add an intrinsic :code:`mylog` to the system. 
# + + def mylog(x): """customized log intrinsic function""" return tvm.call_pure_intrin(x.dtype, "mylog", x) + def my_cuda_mylog_rule(op): """CUDA lowering rule for log""" if op.dtype == "float32": @@ -135,6 +142,8 @@ def my_cuda_mylog_rule(op): return tvm.call_pure_extern("float64", "log", op.args[0]) else: return op + + tvm.target.register_intrin_rule("cuda", "mylog", my_cuda_mylog_rule, override=True) n = tvm.var("n") diff --git a/tutorials/relay_quick_start.py b/tutorials/relay_quick_start.py index 6cded3325ad6..d272a0e315b8 100644 --- a/tutorials/relay_quick_start.py +++ b/tutorials/relay_quick_start.py @@ -55,7 +55,7 @@ # # In this tutorial, we assume we will do inference on our device # and the batch size is set to be 1. Input images are RGB color -# images of size 224 * 224. We can call the :any:`tvm.relay.expr.astext()` +# images of size 224 * 224. We can call the :any:`tvm.relay.expr.TupleWrapper.astext()` # to show the network structure. batch_size = 1 diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py b/vta/tutorials/optimize/matrix_multiply_opt.py index 2d54b97957b2..2722af594c03 100644 --- a/vta/tutorials/optimize/matrix_multiply_opt.py +++ b/vta/tutorials/optimize/matrix_multiply_opt.py @@ -23,7 +23,7 @@ This tutorial provides an overview on how to use TVM to map matrix multiplication efficiently on the VTA design. -We recommend covering the :ref:`vta-basic-mat-mult` tutorial first. +We recommend covering the :ref:`basic-mat-mult` tutorial first. In this tutorial, we will demonstrate TVM schedule optimizations to break large neural network operators down onto smaller blocks to achieve computation within From b1428bc49c525d88b6b4469cbbf452e17acbdbeb Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 25 Feb 2020 21:39:16 -0800 Subject: [PATCH 41/73] Bump up dev version (#4941) * bump up dev version * update --- conda/tvm-libs/meta.yaml | 2 +- conda/tvm/meta.yaml | 2 +- include/tvm/runtime/c_runtime_api.h | 2 +- python/tvm/_ffi/libinfo.py | 2 +- version.py | 2 +- web/tvm_runtime.js | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index cf79f39d0721..e9e805666d39 100644 --- a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -{% set version = "0.7.dev0" %} +{% set version = "0.7.dev1" %} package: name: tvm-libs diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index c64ebb1ef819..829dc4b58b64 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -{% set version = "0.7.dev0" %} +{% set version = "0.7.dev1" %} package: name: tvm diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index e0673bf8b830..bcb75eab731d 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -61,7 +61,7 @@ #endif // TVM version -#define TVM_VERSION "0.7.dev0" +#define TVM_VERSION "0.7.dev1" // TVM Runtime is DLPack compatible. diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index c026a7afffe9..0d1a4e214791 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -209,4 +209,4 @@ def find_include_path(name=None, search_path=None, optional=False): # We use the version of the incoming release for code # that is under development. 
# The following line is set by tvm/python/update_version.py -__version__ = "0.7.dev0" +__version__ = "0.7.dev1" diff --git a/version.py b/version.py index f27527006564..4078d8a23ca7 100644 --- a/version.py +++ b/version.py @@ -31,7 +31,7 @@ # current version # We use the version of the incoming release for code # that is under development -__version__ = "0.7.dev0" +__version__ = "0.7.dev1" # Implementations def update(file_name, pattern, repl): diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index cf5d55ef2261..0740efc5ff6a 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -21,7 +21,7 @@ * TVM Javascript web runtime library. * * @projectname tvm - * @version 0.7.dev0 + * @version 0.7.dev1 */ /* eslint no-unused-vars: "off" */ /* eslint no-unexpected-multiline: "off" */ From 61bea507bf09df935a475976c8e9b79d3f269195 Mon Sep 17 00:00:00 2001 From: Alex Wong <11878166+alexwong@users.noreply.github.com> Date: Tue, 25 Feb 2020 21:41:53 -0800 Subject: [PATCH 42/73] [Tutorial] Add a tutorial for PyTorch (#4936) * Add a tutorial for PyTorch * Fix sphinx formatting, add version support * Remove space * Remove version check * Some refactoring * Use no grad * Rename input * Update cat img source --- tutorials/frontend/from_pytorch.py | 166 +++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 tutorials/frontend/from_pytorch.py diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py new file mode 100644 index 000000000000..c280c259c1fe --- /dev/null +++ b/tutorials/frontend/from_pytorch.py @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Compile PyTorch Models +====================== +**Author**: `Alex Wong `_ + +This article is an introductory tutorial to deploy PyTorch models with Relay. + +For us to begin with, PyTorch should be installed. +TorchVision is also required since we will be using it as our model zoo. + +A quick solution is to install via pip + +.. code-block:: bash + + pip install torch==1.4.0 + pip install torchvision==0.5.0 + +or please refer to official site +https://pytorch.org/get-started/locally/ + +PyTorch versions should be backwards compatible but should be used +with the proper TorchVision version. + +Currently, TVM supports PyTorch 1.4, 1.3, and 1.2. Other versions may +be unstable. 
+""" + +# tvm, relay +import tvm +from tvm import relay + +# numpy, packaging +import numpy as np +from packaging import version +from tvm.contrib.download import download_testdata + +# PyTorch imports +import torch +import torchvision + +###################################################################### +# Load a pretrained PyTorch model +# ------------------------------- +model_name = 'resnet18' +model = getattr(torchvision.models, model_name)(pretrained=True) +model = model.eval() + +# We grab the TorchScripted model via tracing +input_shape = [1, 3, 224, 224] +input_data = torch.randn(input_shape) +scripted_model = torch.jit.trace(model, input_data).eval() + +###################################################################### +# Load a test image +# ----------------- +# Classic cat example! +from PIL import Image +img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' +img_path = download_testdata(img_url, 'cat.png', module='data') +img = Image.open(img_path).resize((224, 224)) + +# Preprocess the image and convert to tensor +from torchvision import transforms +my_preprocess = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) +]) +img = my_preprocess(img) +img = np.expand_dims(img, 0) + +###################################################################### +# Import the graph to Relay +# ------------------------- +# Convert PyTorch graph to Relay graph. +shape_dict = {'img': img.shape} +mod, params = relay.frontend.from_pytorch(scripted_model, + shape_dict) + +###################################################################### +# Relay Build +# ----------- +# Compile the graph to llvm target with given input specification. +target = 'llvm' +target_host = 'llvm' +ctx = tvm.cpu(0) +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(mod, + target=target, + target_host=target_host, + params=params) + +###################################################################### +# Execute the portable graph on TVM +# --------------------------------- +# Now we can try deploying the compiled model on target. +from tvm.contrib import graph_runtime +dtype = 'float32' +m = graph_runtime.create(graph, lib, ctx) +# Set inputs +m.set_input('img', tvm.nd.array(img.astype(dtype))) +m.set_input(**params) +# Execute +m.run() +# Get outputs +tvm_output = m.get_output(0, tvm.nd.empty(((1, 1000)), 'float32')) + +##################################################################### +# Look up synset name +# ------------------- +# Look up prediction top 1 index in 1000 class synset. 
+synset_url = ''.join(['https://raw.githubusercontent.com/Cadene/', + 'pretrained-models.pytorch/master/data/', + 'imagenet_synsets.txt']) +synset_name = 'imagenet_synsets.txt' +synset_path = download_testdata(synset_url, synset_name, module='data') +with open(synset_path) as f: + synsets = f.readlines() + +synsets = [x.strip() for x in synsets] +splits = [line.split(' ') for line in synsets] +key_to_classname = {spl[0]:' '.join(spl[1:]) for spl in splits} + +class_url = ''.join(['https://raw.githubusercontent.com/Cadene/', + 'pretrained-models.pytorch/master/data/', + 'imagenet_classes.txt']) +class_name = 'imagenet_classes.txt' +class_path = download_testdata(class_url, class_name, module='data') +with open(class_path) as f: + class_id_to_key = f.readlines() + +class_id_to_key = [x.strip() for x in class_id_to_key] + +# Get top-1 result for TVM +top1_tvm = np.argmax(tvm_output.asnumpy()[0]) +tvm_class_key = class_id_to_key[top1_tvm] + +# Convert input to PyTorch variable and get PyTorch result for comparison +with torch.no_grad(): + torch_img = torch.from_numpy(img) + output = model(torch_img) + + # Get top-1 result for PyTorch + top1_torch = np.argmax(output.numpy()) + torch_class_key = class_id_to_key[top1_torch] + +print('Relay top-1 id: {}, class name: {}'.format(top1_tvm, key_to_classname[tvm_class_key])) +print('Torch top-1 id: {}, class name: {}'.format(top1_torch, key_to_classname[torch_class_key])) \ No newline at end of file From eba50ad8b27cffc05a0a59aa796265f5cbf9446e Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Wed, 26 Feb 2020 10:01:27 -0800 Subject: [PATCH 43/73] [Relay][pass] call graph for relay (#4922) * call graph for relay * CallGraphEntryNode->CallGraphEntry, __getitem__->print_var * fix typos --- python/tvm/relay/__init__.py | 4 + python/tvm/relay/call_graph.py | 144 ++++++++ src/relay/pass/call_graph.cc | 339 +++++++++++++++++ src/relay/pass/call_graph.h | 509 ++++++++++++++++++++++++++ tests/python/relay/test_call_graph.py | 150 ++++++++ 5 files changed, 1146 insertions(+) create mode 100644 python/tvm/relay/call_graph.py create mode 100644 src/relay/pass/call_graph.cc create mode 100644 src/relay/pass/call_graph.h create mode 100644 tests/python/relay/test_call_graph.py diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 0df3747a93b1..2ad210e7d109 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -19,6 +19,7 @@ import os from sys import setrecursionlimit from ..api import register_func +from . import call_graph from . import base from . import ty from . import expr @@ -141,3 +142,6 @@ # Feature Feature = feature.Feature + +# CallGraph +CallGraph = call_graph.CallGraph diff --git a/python/tvm/relay/call_graph.py b/python/tvm/relay/call_graph.py new file mode 100644 index 000000000000..8206f5dccd4c --- /dev/null +++ b/python/tvm/relay/call_graph.py @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import
+"""Call graph used in Relay."""
+
+from tvm.ir import IRModule
+from .base import Object
+from .expr import GlobalVar
+from . import _analysis
+
+
+class CallGraph(Object):
+    """Class to represent a call graph."""
+
+    def __init__(self, module):
+        """Construct a call graph.
+
+        Parameters
+        ----------
+        module : tvm.ir.IRModule
+            The IR module used to create a call graph
+
+        Returns
+        -------
+        call_graph: CallGraph
+            A constructed call graph.
+        """
+        self.__init_handle_by_constructor__(_analysis.CallGraph, module)
+
+    @property
+    def module(self):
+        """Return the contained Relay IR module.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        ret : tvm.ir.IRModule
+            The contained IRModule
+        """
+        return _analysis.GetModule(self)
+
+    def ref_count(self, var):
+        """Return the number of references to the global var.
+
+        Parameters
+        ----------
+        var : Union[String, tvm.relay.GlobalVar]
+            The name or global variable.
+
+        Returns
+        -------
+        ret : int
+            The number of references to the global var
+        """
+        var = self._get_global_var(var)
+        return _analysis.GetRefCountGlobalVar(self, var)
+
+    def global_call_count(self, var):
+        """Return the number of global function calls from a given global var.
+
+        Parameters
+        ----------
+        var : Union[String, tvm.relay.GlobalVar]
+            The name or global variable.
+
+        Returns
+        -------
+        ret : int
+            The number of global function calls from the given var.
+        """
+        var = self._get_global_var(var)
+        return _analysis.GetGlobalVarCallCount(self, var)
+
+    def is_recursive(self, var):
+        """Return whether the function corresponding to a var is a recursive
+        function.
+
+        Parameters
+        ----------
+        var : Union[String, tvm.relay.GlobalVar]
+            The name or global variable.
+
+        Returns
+        -------
+        ret : Boolean
+            True if the function corresponding to var is recursive.
+        """
+        var = self._get_global_var(var)
+        return _analysis.IsRecursive(self, var)
+
+    def _get_global_var(self, var):
+        """Return the global var using a given name or GlobalVar.
+
+        Parameters
+        ----------
+        var : Union[String, tvm.relay.GlobalVar]
+            The name or global variable.
+
+        Returns
+        -------
+        ret : tvm.relay.GlobalVar
+            The global var.
+        """
+        if isinstance(var, str):
+            mod = self.module
+            var = mod.get_global_var(var)
+
+        if isinstance(var, GlobalVar):
+            return var
+        else:
+            raise TypeError("var should be either a string or GlobalVar")
+
+    def print_var(self, var):
+        """Print the call graph of a global function by name or by variable.
+
+        Parameters
+        ----------
+        var: Union[String, tvm.relay.GlobalVar]
+            The name or global variable.
+
+        Returns
+        -------
+        ret : String
+            The call graph represented in string.
+        """
+        var = self._get_global_var(var)
+        return _analysis.PrintCallGraphGlobalVar(self, var)
+
+    def __str__(self):
+        """Print the call graph in the topological order."""
+        return _analysis.PrintCallGraph(self)
diff --git a/src/relay/pass/call_graph.cc b/src/relay/pass/call_graph.cc
new file mode 100644
index 000000000000..6b82801776dd
--- /dev/null
+++ b/src/relay/pass/call_graph.cc
@@ -0,0 +1,339 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/relay/pass/call_graph.cc
+ * \brief Implementation of APIs to handle the call graph of a Relay module.
+ */
+
+#include "call_graph.h"
+
+#include <tvm/relay/expr_functor.h>
+
+#include <algorithm>
+#include <memory>
+#include <sstream>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+CallGraph::CallGraph(IRModule module) {
+  auto n = make_object<CallGraphNode>();
+  n->module = std::move(module);
+  auto gvar_funcs = n->module->functions;
+  for (const auto& it : gvar_funcs) {
+    if (const auto* fn = it.second.as<FunctionNode>()) {
+      auto func = GetRef<Function>(fn);
+      // Add the global function to gradually build up the call graph.
+      n->AddToCallGraph(it.first, func);
+    }
+  }
+  data_ = std::move(n);
+}
+
+void CallGraphNode::AddToCallGraph(const GlobalVar& gv, const Function& func) {
+  CHECK(func.defined() && gv.defined());
+  // Add the current global function as an entry to the call graph.
+  CallGraphEntry* cg_node = LookupGlobalVar(gv);
+
+  // Only GlobalVar nodes need to be handled in a function. It indicates that
+  // the global function of a callee is called by the function that is being
+  // processed. An edge will be added from the current global function, cg_node,
+  // to the node that contains the found callee GlobalVarNode.
+  //
+  // This is the major overhead for constructing a call graph because the
+  // post-order visitor will visit each AST node of the current function to
+  // figure out the dependencies between functions.
+  PostOrderVisit(func, [&](const Expr& expr) {
+    if (const GlobalVarNode* gvn = expr.as<GlobalVarNode>()) {
+      auto callee = GetRef<GlobalVar>(gvn);
+      cg_node->AddCalledGlobal(LookupGlobalVar(callee));
+    }
+  });
+}
+
+const CallGraphEntry* CallGraphNode::operator[](const GlobalVar& gv) const {
+  const_iterator cit = call_graph_.find(gv);
+  CHECK(cit != call_graph_.end())
+      << "GlobalVar " << gv->name_hint << " not found in the call graph!";
+  return cit->second.get();
+}
+
+CallGraphEntry* CallGraphNode::operator[](const GlobalVar& gv) {
+  const_iterator cit = call_graph_.find(gv);
+  CHECK(cit != call_graph_.end())
+      << "GlobalVar " << gv->name_hint << " not found in the call graph!";
+  return cit->second.get();
+}
+
+// Query the existence of a GlobalVar in the call graph. It creates an entry if
+// there is no such node available.
+CallGraphEntry* CallGraphNode::LookupGlobalVar(const GlobalVar& gv) {
+  CHECK(gv.defined());
+
+  // This inserts an element to the call graph if it is not there yet.
+  auto& call_graph_node = call_graph_[gv];
+  if (call_graph_node) return call_graph_node.get();
+
+  CHECK(module->ContainGlobalVar(gv->name_hint))
+      << "GlobalVar " << gv->name_hint << " not found in the current ir module";
+
+  // Create the node for the inserted entry.
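+  // Ownership of the new entry stays in `call_graph_`; the raw pointer
+  // handed out below remains valid for the lifetime of this CallGraphNode.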
+  call_graph_node = std::unique_ptr<CallGraphEntry>(new CallGraphEntry(gv));
+  return call_graph_node.get();
+}
+
+void CallGraphNode::Print(std::ostream& os) const {
+  // Print the call graph in the topological order.
+  std::vector<CallGraphEntry*> nodes = TopologicalOrder();
+  for (const auto* cgn : nodes) {
+    cgn->Print(os);
+  }
+}
+
+GlobalVar CallGraphNode::RemoveGlobalVarFromModule(CallGraphEntry* cg_node,
+                                                   bool update_call_graph) {
+  CHECK(cg_node->empty() || (cg_node->IsRecursive() && cg_node->size() == 1))
+      << "Cannot remove global var " << cg_node->GetNameHint()
+      << " from call graph, because it still calls "
+      << cg_node->size() << " other global functions";
+
+  if (update_call_graph) {
+    // Update the call graph by removing all edges that point to the node
+    // `cg_node`.
+    for (auto& it : *this) {
+      it.second->RemoveAllCallTo(cg_node);
+    }
+  }
+  GlobalVar gv = cg_node->GetGlobalVar();
+  call_graph_.erase(gv);
+  // Update the IR module.
+  module->Remove(gv);
+  return gv;
+}
+
+std::vector<CallGraphEntry*> CallGraphNode::GetEntryGlobals() const {
+  std::vector<CallGraphEntry*> ret;
+  // An entry function in Relay is a function that is never called by other
+  // functions, or is only called by itself.
+  for (const auto& it : *this) {
+    if (it.second->GetRefCount() == 0 || it.second->IsRecursiveEntry()) {
+      ret.push_back(it.second.get());
+    }
+  }
+  return ret;
+}
+
+std::vector<CallGraphEntry*> CallGraphNode::TopologicalOrder() const {
+  std::vector<CallGraphEntry*> ret;
+  // Collect all entry nodes.
+  std::vector<CallGraphEntry*> entries = GetEntryGlobals();
+  CallGraphEntry::CallGraphEntrySet visited;
+
+  for (const auto& it : entries) {
+    // Keep tracking the nodes that have been visited.
+    auto topo = it->TopologicalOrder(&visited);
+    // Prepend the collected items. The intermediate nodes that are shared by
+    // multiple entries are guaranteed to be collected when visiting the
+    // previous entries. Therefore, the topological order remains.
+    ret.insert(ret.begin(), topo.begin(), topo.end());
+  }
+
+  // Find out the missing global functions if there are any to help debugging.
+  if (ret.size() != module->functions.size()) {
+    for (auto it : module->functions) {
+      if (visited.find((*this)[it.first]) == visited.end()) {
+        LOG(WARNING) << "Missing global:" << it.first->name_hint
+                     << " with # refs = " << (*this)[it.first]->GetRefCount();
+      }
+    }
+    LOG(FATAL) << "Expected " << module->functions.size()
+               << " globals, but received "
+               << ret.size();
+  }
+
+  return ret;
+}
+
+// A BFS traversal is used to collect the nodes in a CallGraphEntry. The nodes
+// that were visited by previous CallGraphEntry entries are memoized. This
+// helps us to make sure no entry will be visited multiple times when
+// collecting the nodes for an entire call graph.
+std::vector<CallGraphEntry*> CallGraphEntry::TopologicalOrder(
+    CallGraphEntrySet* visited) const {
+  std::vector<CallGraphEntry*> ret;
+  std::vector<CallGraphEntry*> current_nodes;
+  if (visited->find(this) == visited->end()) {
+    visited->emplace(this);
+    current_nodes.emplace_back(const_cast<CallGraphEntry*>(this));
+  }
+
+  std::vector<CallGraphEntry*> next_nodes;
+  while (!current_nodes.empty()) {
+    for (const auto& node : current_nodes) {
+      ret.push_back(node);
+      // Iterate through the called entries.
+      for (auto git = node->begin(); git != node->end(); ++git) {
+        if (visited->find(git->second) == visited->end()) {
+          next_nodes.push_back(git->second);
+          visited->emplace(git->second);
+        }
+      }
+    }
+    // Update the current level and clean the next level.
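+    // Copy-then-clear advances one BFS level per iteration; a std::swap
+    // would avoid the copy, at a small cost in clarity.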
+    current_nodes = next_nodes;
+    next_nodes.clear();
+  }
+  return ret;
+}
+
+void CallGraphEntry::CleanCallGraphEntries() {
+  while (!called_globals_.empty()) {
+    // Decrement the reference counter
+    called_globals_.back().second->DecRef();
+    called_globals_.pop_back();
+  }
+}
+
+inline void CallGraphEntry::AddCalledGlobal(CallGraphEntry* cg_node) {
+  called_globals_.emplace_back(global_, cg_node);
+  // Increment the reference to indicate that another call site is found for
+  // the callee in `cg_node`.
+  cg_node->IncRef();
+  // Mark the global function as recursive if it calls itself.
+  if (global_ == cg_node->GetGlobalVar()) {
+    cg_node->is_recursive_ = true;
+  }
+}
+
+// Remove an edge from the current global function to the callee.
+void CallGraphEntry::RemoveCallTo(const GlobalVar& callee) {
+  for (auto it = begin();; ++it) {
+    CHECK(it != end()) << "Cannot find global function "
+                       << callee->name_hint << " to remove!";
+    if (it->second->GetGlobalVar() == callee) {
+      // Only remove one occurrence of the call site.
+      it->second->DecRef();
+      *it = called_globals_.back();
+      called_globals_.pop_back();
+      return;
+    }
+  }
+}
+
+// Remove all edges from the current global function to the callee.
+void CallGraphEntry::RemoveAllCallTo(CallGraphEntry* callee) {
+  for (uint32_t i = 0, e = size(); i != e;) {
+    if (called_globals_[i].second == callee) {
+      callee->DecRef();
+      called_globals_[i] = called_globals_.back();
+      called_globals_.pop_back();
+      --e;
+    } else {
+      ++i;
+    }
+  }
+  // Make sure all references to the callee are removed.
+  CHECK_EQ(callee->GetRefCount(), 0U)
+      << "All references to " << callee->GetNameHint()
+      << " should have been removed";
+}
+
+void CallGraphEntry::Print(std::ostream& os) const {
+  if (!global_.defined()) {
+    os << "GlobalVar is not defined\n";
+    return;
+  }
+
+  os << "Call graph node: " << global_->name_hint;
+  os << " at: " << this << ", #refs = " << GetRefCount() << "\n";
+
+  for (const auto& it : *this) {
+    os << "  call site: <" << it.first->name_hint << "> calls ";
+    os << it.second->GetNameHint() << "\n";
+  }
+  os << "\n";
+}
+
+std::ostream& operator<<(std::ostream& os, const CallGraph& cg) {
+  cg->Print(os);
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const CallGraphEntry& cgn) {
+  cgn.Print(os);
+  return os;
+}
+
+TVM_REGISTER_NODE_TYPE(CallGraphNode);
+
+TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable)
+.set_dispatch<CallGraphNode>([](const ObjectRef& ref, ReprPrinter* p) {
+  auto* node = static_cast<const CallGraphNode*>(ref.get());
+  CHECK(node);
+  p->stream << "CallGraph: \n" << GetRef<CallGraph>(node);
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.CallGraph")
+.set_body_typed([](IRModule module) {
+  return CallGraph(module);
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.PrintCallGraph")
+.set_body_typed([](CallGraph call_graph) {
+  std::stringstream ss;
+  ss << call_graph;
+  return ss.str();
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.GetModule")
+.set_body_typed([](CallGraph call_graph) {
+  return call_graph->GetModule();
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.PrintCallGraphGlobalVar")
+.set_body_typed([](CallGraph call_graph, GlobalVar var) {
+  const auto* entry_node = call_graph[var];
+  std::stringstream ss;
+  ss << *entry_node;
+  return ss.str();
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.GetRefCountGlobalVar")
+.set_body_typed([](CallGraph call_graph, GlobalVar var) {
+  const auto* entry_node = call_graph[var];
+  return static_cast<uint32_t>(entry_node->GetRefCount());
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.GetGlobalVarCallCount")
+.set_body_typed([](CallGraph call_graph, GlobalVar var) {
+  const auto* entry_node = call_graph[var];
+  return static_cast<uint32_t>(entry_node->size());
+});
+
+TVM_REGISTER_GLOBAL("relay._analysis.IsRecursive")
+.set_body_typed([](CallGraph call_graph, GlobalVar var) {
+  const auto* entry_node = call_graph[var];
+  return entry_node->IsRecursive();
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/call_graph.h b/src/relay/pass/call_graph.h
new file mode 100644
index 000000000000..340ee30bc5d2
--- /dev/null
+++ b/src/relay/pass/call_graph.h
@@ -0,0 +1,509 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file tvm/relay/pass/call_graph.h
+ * \brief Define data structures for the call graph of an IRModule. It borrows
+ * the idea of how LLVM constructs its CallGraph.
+ *
+ * https://llvm.org/doxygen/CallGraph_8h_source.html
+ */
+
+#ifndef TVM_RELAY_PASS_CALL_GRAPH_H_
+#define TVM_RELAY_PASS_CALL_GRAPH_H_
+
+#include <tvm/ir/module.h>
+#include <tvm/relay/expr.h>
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+class CallGraphEntry;
+class CallGraph;
+
+class CallGraphNode : public Object {
+  using CallGraphMap =
+      std::unordered_map<GlobalVar, std::unique_ptr<CallGraphEntry>, ObjectHash,
+                         ObjectEqual>;
+  // Create an iterator alias for a CallGraphNode object.
+  using iterator = CallGraphMap::iterator;
+  using const_iterator = CallGraphMap::const_iterator;
+
+ public:
+  /*! \brief The IR module for creating a CallGraphNode. */
+  IRModule module;
+
+  /*! \brief Default constructor. */
+  CallGraphNode() {}
+
+  void VisitAttrs(AttrVisitor* v) {
+    v->Visit("module", &module);
+  }
+
+  /*!
+   * \brief Print the call graph.
+   *
+   * \param os The stream for printing.
+   */
+  void Print(std::ostream& os) const;
+
+  /*! \return The begin iterator. */
+  iterator begin() {
+    return call_graph_.begin();
+  }
+  /*! \return The end iterator. */
+  iterator end() {
+    return call_graph_.end();
+  }
+  /*! \return The begin iterator. */
+  const_iterator begin() const {
+    return call_graph_.begin();
+  }
+  /*! \return The end iterator. */
+  const_iterator end() const {
+    return call_graph_.end();
+  }
+
+  /*!
+   * \brief Get an element from the CallGraphNode using a GlobalVar.
+   *
+   * \param gv The GlobalVar used for indexing.
+   *
+   * \return The fetched element.
+   */
+  const CallGraphEntry* operator[](const GlobalVar& gv) const;
+  /*!
+   * \brief Get an element from the CallGraphNode using a GlobalVar.
+   *
+   * \param gv The GlobalVar used for indexing.
+   *
+   * \return The fetched element.
+   */
+  CallGraphEntry* operator[](const GlobalVar& gv);
+  /*!
+   * \brief Get an element from the CallGraphNode using the global function name.
+   *
+   * \param gvar_name The global function name used for indexing.
+   *
+   * \return The fetched element.
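+   * \note CHECK-fails if `gvar_name` does not name a global function in
+   * the module.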
+   */
+  const CallGraphEntry* operator[](const std::string& gvar_name) const {
+    return (*this)[module->GetGlobalVar(gvar_name)];
+  }
+  /*!
+   * \brief Get an element from the CallGraphNode using the global function name.
+   *
+   * \param gvar_name The global function name used for indexing.
+   *
+   * \return The fetched element.
+   */
+  CallGraphEntry* operator[](const std::string& gvar_name) {
+    return (*this)[module->GetGlobalVar(gvar_name)];
+  }
+
+  /*! \brief Return the IR module. */
+  IRModule GetModule() const {
+    return module;
+  }
+
+  /*!
+   * \brief Get the entries/root nodes of CallGraphNode.
+   *
+   *  Entry functions are never referenced by other functions.
+   *  Note these functions can be recursive as well.
+   *
+   * \return The list of CallGraphEntry that represent entry nodes.
+   */
+  std::vector<CallGraphEntry*> GetEntryGlobals() const;
+
+  /*!
+   * \brief Remove a GlobalVar in a given CallGraphEntry from the current
+   *        IR module.
+   *
+   * \param cg_node The CallGraphEntry that contains a global function to be
+   *        removed.
+   * \param update_call_graph Indicates whether the call graph will be updated
+   *        as well, since updating it is costly. We are only able to remove a
+   *        leaf function when update_call_graph is disabled, because the edges
+   *        pointing to functions being removed are not updated.
+   *
+   * \return The GlobalVar removed from the current module.
+   */
+  GlobalVar RemoveGlobalVarFromModule(CallGraphEntry* cg_node,
+                                      bool update_call_graph = false);
+
+  /*!
+   * \brief Lookup a GlobalVar for the CallGraphNode. It creates an entry for
+   *        the GlobalVar if it doesn't exist.
+   *
+   * \param gv The GlobalVar for query.
+   *
+   * \return The queried entry.
+   */
+  CallGraphEntry* LookupGlobalVar(const GlobalVar& gv);
+
+  /*!
+   * \brief Get the entries from the CallGraphNode in topological order.
+   *
+   * This is useful for various module-level optimizations/analyses. For example,
+   * inlining requires the correct order of the functions being processed, i.e.
+   * callees should always be handled before callers.
+   *
+   * \return The list of collected entries, sorted in topological order.
+   */
+  std::vector<CallGraphEntry*> TopologicalOrder() const;
+
+  static constexpr const char* _type_key = "relay.CallGraph";
+  TVM_DECLARE_FINAL_OBJECT_INFO(CallGraphNode, Object);
+
+ private:
+  /*!
+   * \brief Create a CallGraphEntry for a global function and add it to the
+   *        CallGraphNode.
+   *
+   * \param gv The global var.
+   * \param func The global function corresponding to `gv`.
+   */
+  void AddToCallGraph(const GlobalVar& gv, const Function& func);
+
+  /*! \brief A record that maps GlobalVars to CallGraphEntries. */
+  CallGraphMap call_graph_;
+
+  friend CallGraph;
+};
+
+/*!
+ * \brief The class that represents the call graph of a Relay IR module. It also
+ * provides a variety of utility functions for users to query, view, and update
+ * a call graph.
+ */
+class CallGraph : public ObjectRef {
+  using CallGraphMap =
+      std::unordered_map<GlobalVar, std::unique_ptr<CallGraphEntry>, ObjectHash,
+                         ObjectEqual>;
+  // Create an iterator alias for a CallGraph object.
+  using iterator = CallGraphMap::iterator;
+  using const_iterator = CallGraphMap::const_iterator;
+
+ public:
+  /*!
+   * \brief Construct a CallGraph from an IR module.
+   *
+   * \param module The IR module
+   */
+  explicit CallGraph(IRModule module);
+
+  /*!
+   * \brief Construct from an object pointer.
+   * \param n The object pointer.
+   */
+  explicit CallGraph(ObjectPtr<Object> n) : ObjectRef(n) {}
+
+  /*! \return The begin iterator. */
+  iterator begin() {
+    auto* n = operator->();
+    CHECK(n);
+    return n->begin();
+  }
+  /*! \return The end iterator. */
+  iterator end() {
+    auto* n = operator->();
+    CHECK(n);
+    return n->end();
+  }
+  /*! \return The begin iterator. */
+  const_iterator begin() const {
+    const auto* n = operator->();
+    CHECK(n);
+    return n->begin();
+  }
+  /*! \return The end iterator. */
+  const_iterator end() const {
+    const auto* n = operator->();
+    CHECK(n);
+    return n->end();
+  }
+
+  /*!
+   * \brief Get an element from the CallGraph using a GlobalVar.
+   *
+   * \param gv The GlobalVar used for indexing.
+   *
+   * \return The fetched element.
+   */
+  const CallGraphEntry* operator[](const GlobalVar& gv) const {
+    const auto* n = operator->();
+    CHECK(n);
+    return (*n)[gv];
+  }
+  /*!
+   * \brief Get an element from the CallGraph using a GlobalVar.
+   *
+   * \param gv The GlobalVar used for indexing.
+   *
+   * \return The fetched element.
+   */
+  CallGraphEntry* operator[](const GlobalVar& gv) {
+    auto* n = operator->();
+    CHECK(n);
+    return (*n)[gv];
+  }
+  /*!
+   * \brief Get an element from the CallGraph using the global function name.
+   *
+   * \param gvar_name The global function name used for indexing.
+   *
+   * \return The fetched element.
+   */
+  const CallGraphEntry* operator[](const std::string& gvar_name) const {
+    const auto* n = operator->();
+    CHECK(n);
+    return (*n)[gvar_name];
+  }
+  /*!
+   * \brief Get an element from the CallGraph using the global function name.
+   *
+   * \param gvar_name The global function name used for indexing.
+   *
+   * \return The fetched element.
+   */
+  CallGraphEntry* operator[](const std::string& gvar_name) {
+    auto* n = operator->();
+    CHECK(n);
+    return (*n)[gvar_name];
+  }
+
+  /*! \return A mutable pointer to the node. */
+  CallGraphNode* operator->() const {
+    auto* ptr = get_mutable();
+    CHECK(ptr != nullptr);
+    return static_cast<CallGraphNode*>(ptr);
+  }
+
+ private:
+  /*! \brief Overload the << operator to print a call graph. */
+  friend std::ostream& operator<<(std::ostream& os, const CallGraph&);
+};
+
+/*!
+ * \brief A node in the call graph. It maintains the edges from a caller to
+ * all callees.
+ */
+class CallGraphEntry {
+ public:
+  using CallGraphEntryPair = std::pair<GlobalVar, CallGraphEntry*>;
+  using CallGraphEntryVector = std::vector<CallGraphEntryPair>;
+  using CallGraphEntrySet = std::unordered_set<const CallGraphEntry*>;
+  // Create iterator aliases for a CallGraphEntry object.
+  using iterator = std::vector<CallGraphEntryPair>::iterator;
+  using const_iterator = std::vector<CallGraphEntryPair>::const_iterator;
+
+  /*!
+   * \brief Construct from a GlobalVar.
+   *
+   * \param gv The GlobalVar to create a CallGraphEntry.
+   */
+  explicit CallGraphEntry(const GlobalVar& gv) : global_(gv) {}
+  /*!
+   * \brief Delete the copy constructor.
+   */
+  CallGraphEntry(const CallGraphEntry&) = delete;
+  /*! \brief Delete assignment. */
+  CallGraphEntry& operator=(const CallGraphEntry&) = delete;
+
+  /*! \return The begin iterator */
+  iterator begin() {
+    return called_globals_.begin();
+  }
+  /*! \return The end iterator */
+  iterator end() {
+    return called_globals_.end();
+  }
+  /*! \return The const begin iterator */
+  const_iterator begin() const {
+    return called_globals_.begin();
+  }
+  /*! \return The const end iterator */
+  const_iterator end() const {
+    return called_globals_.end();
+  }
+
+  /*!
+   * \brief Return if the list of called nodes is empty.
+   *
+   * \return true if the list is empty. Otherwise, false.
+   */
+  bool empty() const {
+    return called_globals_.empty();
+  }
+
+  /*!
+   * \brief Return the number of nodes called by the current node.
+   *
+   * \return The number of called nodes.
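+   * \note Each call site is counted separately, so a callee invoked twice
+   * contributes two entries to the list.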
+   */
+  uint32_t size() const {
+    return static_cast<uint32_t>(called_globals_.size());
+  }
+
+  /*!
+   * \brief Fetch the i-th CallGraphEntry from the list of nodes that are called
+   * by the current function.
+   *
+   * \param i The index.
+   *
+   * \return The fetched CallGraphEntry.
+   */
+  CallGraphEntry* operator[](size_t i) const {
+    CHECK_LT(i, called_globals_.size()) << "Invalid Index";
+    return called_globals_[i].second;
+  }
+
+  /*!
+   * \brief Print the call graph that stems from the current CallGraphEntry.
+   *
+   * \param os The stream for printing.
+   */
+  void Print(std::ostream& os) const;
+
+  /*!
+   * \brief Return the number of times the global function is referenced.
+   *
+   * \return The count.
+   */
+  uint32_t GetRefCount() const {
+    return ref_cnt_;
+  }
+
+  /*!
+   * \brief Return the GlobalVar stored in the current CallGraphEntry.
+   *
+   * \return The GlobalVar.
+   */
+  GlobalVar GetGlobalVar() const {
+    return global_;
+  }
+
+  /*!
+   * \brief Return the name hint of the GlobalVar stored in the CallGraphEntry.
+   *
+   * \return The name hint of the global function.
+   */
+  std::string GetNameHint() const {
+    return global_->name_hint;
+  }
+
+  /*!
+   * \brief Return if the global function corresponding to the current
+   * CallGraphEntry is a recursive function.
+   *
+   * \return true if it is recursive. Otherwise, false.
+   */
+  bool IsRecursive() const {
+    return is_recursive_;
+  }
+
+  /*!
+   * \brief Return if the global function corresponding to the current
+   * CallGraphEntry is both a recursive function and an entry function. This type
+   * of function only has one reference, which is the call to itself.
+   *
+   * \return true if it is both a recursive function and an entry. Otherwise, false.
+   */
+  bool IsRecursiveEntry() const {
+    return GetRefCount() == 1 && IsRecursive();
+  }
+
+  /*!
+   * \brief Return the topological order of the CallGraphEntry.
+   *
+   * \param visited A set of CallGraphEntry objects that have been visited.
+   *
+   * \return The list of CallGraphEntry, represented in topological order.
+   */
+  std::vector<CallGraphEntry*> TopologicalOrder(
+      CallGraphEntrySet* visited = new CallGraphEntrySet()) const;
+
+  /*!
+   * \brief Remove all edges from the current CallGraphEntry to any global
+   * function it calls.
+   */
+  void CleanCallGraphEntries();
+
+  /*!
+   * \brief Add a node to the list of nodes that are being called by the current
+   * global function.
+   *
+   * \param cg_node The CallGraphEntry that will be added to the call list.
+   */
+  void AddCalledGlobal(CallGraphEntry* cg_node);
+
+  /*!
+   * \brief Remove a call edge to the global function from the current
+   * function.
+   *
+   * \param callee The function that is being called.
+   */
+  void RemoveCallTo(const GlobalVar& callee);
+
+  /*!
+   * \brief Remove all edges that represent calls to the global function
+   * stored in a given CallGraphEntry.
+   *
+   * \param callee The function that is being called.
+   */
+  void RemoveAllCallTo(CallGraphEntry* callee);
+
+ private:
+  /*! \brief Decrement the reference counter by 1. */
+  void DecRef() {
+    CHECK_GT(ref_cnt_, 0);
+    --ref_cnt_;
+  }
+  /*! \brief Increment the reference counter by 1. */
+  void IncRef() { ++ref_cnt_; }
+
+  /*!
+   * \brief Mark whether the global function stored in the CallGraphEntry is
+   * a recursive function.
+   */
+  bool is_recursive_{false};
+  /*! \brief Count the number of times the global function is referenced. */
+  uint32_t ref_cnt_{0};
+  /*! \brief The GlobalVar stored in the current CallGraphEntry. */
+  GlobalVar global_;
+  /*!
\brief The list of entries called by the current CallGraphEntry. */ + CallGraphEntryVector called_globals_; + + friend class CallGraph; + /*! \brief Overload the << operator to print a call graph node. */ + friend std::ostream& operator<<(std::ostream& os, const CallGraphEntry&); +}; + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_PASS_CALL_GRAPH_H_ diff --git a/tests/python/relay/test_call_graph.py b/tests/python/relay/test_call_graph.py new file mode 100644 index 000000000000..fbbda678b102 --- /dev/null +++ b/tests/python/relay/test_call_graph.py @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name +import pytest +import tvm +from tvm import relay + + +def test_callgraph_construct(): + mod = tvm.IRModule({}) + x = relay.var("x", shape=(2, 3)) + y = relay.var("y", shape=(2, 3)) + mod["g1"] = relay.Function([x, y], x + y) + call_graph = relay.CallGraph(mod) + assert "g1" in str(call_graph) + assert relay.alpha_equal(mod, call_graph.module) + + +def test_print_element(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(2, 3)) + y0 = relay.var("y0", shape=(2, 3)) + mod["g0"] = relay.Function([x0, y0], x0 + y0) + x1 = relay.var("x1", shape=(2, 3)) + y1 = relay.var("y1", shape=(2, 3)) + mod["g1"] = relay.Function([x1, y1], x1 - y1) + call_graph = relay.CallGraph(mod) + + assert "#refs = 0" in str(call_graph.print_var("g0")) + assert "#refs = 0" in str(call_graph.print_var("g1")) + + +def test_global_call_count(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(2, 3)) + y0 = relay.var("y0", shape=(2, 3)) + g0 = relay.GlobalVar("g0") + mod[g0] = relay.Function([x0, y0], x0 + y0) + x1 = relay.var("x1", shape=(2, 3)) + y1 = relay.var("y1", shape=(2, 3)) + g1 = relay.GlobalVar("g1") + mod[g1] = relay.Function([x1, y1], g0(x1, y1)) + call_graph = relay.CallGraph(mod) + + p0 = relay.var("p0", shape=(2, 3)) + p1 = relay.var("p1", shape=(2, 3)) + func = relay.Function([p0, p1], g0(p0, p1) * g1(p0, p1)) + mod["main"] = func + call_graph = relay.CallGraph(mod) + + assert call_graph.global_call_count(g0) == 0 + assert call_graph.global_call_count(g1) == 1 + assert call_graph.global_call_count("main") == 2 + + +def test_ref_count(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(2, 3)) + y0 = relay.var("y0", shape=(2, 3)) + g0 = relay.GlobalVar("g0") + mod[g0] = relay.Function([x0, y0], x0 + y0) + x1 = relay.var("x1", shape=(2, 3)) + y1 = relay.var("y1", shape=(2, 3)) + g1 = relay.GlobalVar("g1") + mod[g1] = relay.Function([x1, y1], x1 - y1) + call_graph = relay.CallGraph(mod) + + p0 = relay.var("p0", shape=(2, 3)) + p1 = relay.var("p1", shape=(2, 3)) + func = relay.Function([p0, p1], g0(p0, p1) * g1(p0, p1)) + mod["main"] = func + call_graph = 
relay.CallGraph(mod) + + assert call_graph.ref_count(g0) == 1 + assert call_graph.ref_count(g1) == 1 + assert call_graph.ref_count("main") == 0 + + +def test_nested_ref(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(2, 3)) + y0 = relay.var("y0", shape=(2, 3)) + g0 = relay.GlobalVar("g0") + mod[g0] = relay.Function([x0, y0], x0 + y0) + x1 = relay.var("x1", shape=(2, 3)) + y1 = relay.var("y1", shape=(2, 3)) + g1 = relay.GlobalVar("g1") + mod[g1] = relay.Function([x1, y1], g0(x1, y1)) + call_graph = relay.CallGraph(mod) + + p0 = relay.var("p0", shape=(2, 3)) + p1 = relay.var("p1", shape=(2, 3)) + func = relay.Function([p0, p1], g0(p0, p1) * g1(p0, p1)) + mod["main"] = func + call_graph = relay.CallGraph(mod) + + assert call_graph.ref_count(g0) == 2 + assert call_graph.ref_count(g1) == 1 + assert call_graph.ref_count("main") == 0 + + +def test_recursive_func(): + mod = tvm.IRModule({}) + + x = relay.var('x', shape=[], dtype='int32') + fn0 = relay.Function([x], x) + gx = relay.GlobalVar("gx") + mod[gx] = fn0 + + sum_up = relay.GlobalVar('sum_up') + i = relay.var('i', shape=[], dtype='int32') + sb = relay.ScopeBuilder() + with sb.if_scope(relay.equal(i, relay.const(0, dtype='int32'))): + sb.ret(i) + with sb.else_scope(): + one_less = relay.subtract(i, relay.const(1, dtype='int32')) + global_call = gx(i) + rec_call = relay.Call(sum_up, [one_less]) + global_call + sb.ret(relay.add(rec_call, i)) + func = relay.Function([i], + sb.get(), + ret_type=relay.TensorType([], 'int32')) + func = func.set_attribute("Compiler", tvm.tir.StringImm("a")) + mod[sum_up] = func + iarg = relay.var('i', shape=[], dtype='int32') + mod["main"] = relay.Function([iarg], sum_up(iarg)) + call_graph = relay.CallGraph(mod) + + assert call_graph.is_recursive(sum_up) + assert call_graph.ref_count(sum_up) == 2 + assert call_graph.ref_count(gx) == 1 + assert call_graph.ref_count("main") == 0 + + +if __name__ == "__main__": + pytest.main() From 502ab605e399f0cdfeec5611b48af94efe436b60 Mon Sep 17 00:00:00 2001 From: Nick Hynes Date: Wed, 26 Feb 2020 13:44:32 -0800 Subject: [PATCH 44/73] Remove SGX toolchain installation from CI Dockerfile (#4948) --- docker/Dockerfile.ci_cpu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu index 6484d1ea7053..10c8c62d970b 100644 --- a/docker/Dockerfile.ci_cpu +++ b/docker/Dockerfile.ci_cpu @@ -33,11 +33,6 @@ RUN bash /install/ubuntu_install_python_package.sh COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh RUN bash /install/ubuntu_install_llvm.sh -# SGX deps (build early; changes infrequently) -COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh -RUN bash /install/ubuntu_install_sgx.sh -ENV LD_LIBRARY_PATH /opt/sgxsdk/lib64:${LD_LIBRARY_PATH} - # Rust env (build early; takes a while) COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh RUN bash /install/ubuntu_install_rust.sh From 8e7e7792aafb9333a3f2b3fe9527249b9c987e54 Mon Sep 17 00:00:00 2001 From: Ina Dobreva <55383260+inadob@users.noreply.github.com> Date: Thu, 27 Feb 2020 01:43:11 +0200 Subject: [PATCH 45/73] [Frontend][TFLite] Add parser support for 'square' operator (#4915) * [Frontend][TFLite] Add parser support for square operator * Add parser implementation * Add relevant tests * Note: 'square' is an unary elemwise operator but it's added separately in the parser since there is no Relay 'square' op and instead we have to use 'multiply' * Change relay operation from 'multiply' to 'power' * Remove a redundant line as 
requested --- python/tvm/relay/frontend/tflite.py | 29 +++++++++++++++++++- tests/python/frontend/tflite/test_forward.py | 7 +++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 352bc6302ee0..e132d4ca3585 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -121,7 +121,8 @@ def __init__(self, model, subgraph, exp_tab): 'SQUARED_DIFFERENCE': self.convert_squared_difference, 'LOGICAL_AND': self.convert_logical_and, 'LOGICAL_OR': self.convert_logical_or, - 'DETECTION_POSTPROCESS': self.convert_detection_postprocess + 'DETECTION_POSTPROCESS': self.convert_detection_postprocess, + 'SQUARE': self.convert_square, } def check_unsupported_ops(self): @@ -636,6 +637,32 @@ def convert_neg(self, op): 'TFlite quantized NEG operator is not supported yet.') return self._convert_unary_elemwise(_op.negative, op) + def convert_square(self, op): + """Convert TFLite SQUARE""" + try: + from tflite.Operator import Operator + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 1, "input tensors length should be 1" + input_tensor = input_tensors[0] + in_expr = self.get_expr(input_tensor.tensor_idx) + + output_tensors = self.get_output_tensors(op) + assert len(output_tensors) == 1, "output tensors length should be 1" + output_tensor = output_tensors[0] + + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized SQUARE operator is not supported yet.') + + exp_type = self.get_tensor_type_str(output_tensor.tensor.Type()) + out = _op.power(in_expr, relay.const(2, exp_type)) + + return out + def _convert_elemwise(self, relay_op, op): """Generic method to Convert TFLite elemwise""" try: diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 427d4bfe2810..f4b7ee0cd8b1 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -742,6 +742,12 @@ def _test_neg(data): """ One iteration of neg """ return _test_unary_elemwise(math_ops.neg, data) ####################################################################### +# Square +# ------ + +def _test_square(data): + """ One iteration of square """ + return _test_unary_elemwise(math_ops.square, data) def _test_forward_unary_elemwise(test_op): # functions that need positive input @@ -759,6 +765,7 @@ def test_all_unary_elemwise(): _test_forward_unary_elemwise(_test_sqrt) _test_forward_unary_elemwise(_test_rsqrt) _test_forward_unary_elemwise(_test_neg) + _test_forward_unary_elemwise(_test_square) # ceil and cos come with TFLite 1.14.0.post1 fbs schema if package_version.parse(tf.VERSION) >= package_version.parse('1.14.0'): _test_forward_unary_elemwise(_test_ceil) From 09c55fd1f3354d2280bb792a252590ac6bd68e58 Mon Sep 17 00:00:00 2001 From: Hua Jiang Date: Wed, 26 Feb 2020 15:52:28 -0800 Subject: [PATCH 46/73] [VTA] YoloV3 Support (#4887) * [VTA] YoloV3 Support Issue: YoloV3 use some operator and logic that not get good support by existing vta logic, like nn.pad, upsample, and 255 output channel. Solution: add related logic to let darknet YoloV3 can running on VTA * Fix small(0, or 1 heigh/width) detect frame issue. * add yolov3-tiny turtorial * add os import * address review comments. * rename tutorial file with a short name. 
* rename deploy_vision_on_vta.py into deploy_classification.py.
* address review comments, fix pylint errors in deploy_detection.py
---
 vta/python/vta/top/graphpack.py               | 117 ++++++-
 ...ion_on_vta.py => deploy_classification.py} |   0
 vta/tutorials/frontend/deploy_detection.py    | 330 ++++++++++++++++++
 3 files changed, 439 insertions(+), 8 deletions(-)
 rename vta/tutorials/frontend/{deploy_vision_on_vta.py => deploy_classification.py} (100%)
 create mode 100644 vta/tutorials/frontend/deploy_detection.py

diff --git a/vta/python/vta/top/graphpack.py b/vta/python/vta/top/graphpack.py
index 76b3dc54b113..2689fbcb6ec7 100644
--- a/vta/python/vta/top/graphpack.py
+++ b/vta/python/vta/top/graphpack.py
@@ -31,6 +31,8 @@ def run_opt_pass(expr, opt_pass):
     return entry if isinstance(expr, relay.Function) else entry.body
 
 def _to_shape(shape):
+    """ Convert a shape into a tuple.
+    """
     return tuple(int(sh) for sh in shape)
 
 def _pack_batch_channel(data, dshape, bfactor, cfactor):
@@ -55,6 +57,49 @@ def _unpack_batch_channel(data, old_shape):
     return data
 
 
+def _const_shape_match(data, dshape, cfactor_out):
+    """ Pad the constant if shape[0] is not divisible by cfactor_out.
+    """
+    assert len(dshape) == 3
+    pad_width = int(dshape[0]) % cfactor_out
+    if pad_width != 0:
+        pad_width = cfactor_out - pad_width
+        data = op.nn.pad(data, [[0, pad_width], [0, 0], [0, 0]])
+        dshape = tuple([dshape[0] + pad_width, dshape[1], dshape[2]])
+    return data, dshape
+
+def _weight_shape_match(data, dshape, channels, cfactor_out, transpose=False):
+    """ Pad the weight if shape[0] is not divisible by cfactor_out.
+    """
+    assert len(dshape) == 4
+    pad_width = int(dshape[0]) % cfactor_out
+    channels_pad = int(channels) % cfactor_out
+    if pad_width != 0:
+        pad_width = cfactor_out - pad_width
+        data = op.nn.pad(data, [[0, pad_width], [0, 0], [0, 0], [0, 0]])
+        dshape = tuple([dshape[0] + pad_width, dshape[1], dshape[2], dshape[3]])
+
+    if channels_pad != 0:
+        channels = channels + (cfactor_out - channels_pad)
+
+    return data, dshape, channels
+
+def _weight_shape_match_transpose(data, dshape, channels, cfactor_out):
+    """ Pad the weight if shape[1] is not divisible by cfactor_out.
+    """
+    assert len(dshape) == 4
+    pad_width = int(dshape[1]) % cfactor_out
+    channels_pad = int(channels) % cfactor_out
+    if pad_width != 0:
+        pad_width = cfactor_out - pad_width
+        data = op.nn.pad(data, [[0, 0], [0, pad_width], [0, 0], [0, 0]])
+        dshape = tuple([dshape[0], dshape[1] + pad_width, dshape[2], dshape[3]])
+
+    if channels_pad != 0:
+        channels = channels + (cfactor_out - channels_pad)
+
+    return data, dshape, channels
+
 def _pack_weight(data, dshape, cfactor):
     """Pack the weight into packed format.
     """
@@ -106,10 +151,19 @@ def _pack_const(data, dshape, dtype, bfactor, cfactor):
     return data
 
 
-def _get_shape(node):
-    """Get the shape of a node.
+def _get_tensor_shape(node):
+    """Get node shape.
     """
-    return _to_shape(node.checked_type.shape)
+    if isinstance(node.checked_type, relay.ty.TensorType):
+        return _to_shape(node.checked_type.shape)
+    return []
+
+def _get_tensor_type(node):
+    """Get node type.
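+
+    Falls back to "float32" when the node is not a TensorType.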
+ """ + if isinstance(node.checked_type, relay.ty.TensorType): + return node.checked_type.dtype + return "float32" def _operator_idx_inc(expr, count_meta, operator_current_idx): """Increase operator index @@ -136,14 +190,17 @@ def __init__(self, bfactor, cfactor, weight_bits): self.add = op.op.get("add") self.multiply = op.op.get("multiply") self.bias_add = op.op.get("nn.bias_add") + self.pad = op.op.get("nn.pad") + self.upsampling = op.op.get("nn.upsampling") + self.reshape = op.op.get("reshape") self.number_of_conv2d = 0 super().__init__() def visit_call(self, call): """ Visit the children. """ # First visit the children. - oshape = _get_shape(call) - odtype = call.checked_type.dtype + oshape = _get_tensor_shape(call) + odtype = _get_tensor_type(call) input_types = [arg.checked_type for arg in call.args] args = [self.visit(arg) for arg in call.args] @@ -156,7 +213,7 @@ def visit_call(self, call): if self.start_pack: self.start_pack = False data = args[0] - data_shape = _get_shape(call.args[0]) + data_shape = _get_tensor_shape(call.args[0]) return _unpack_batch_channel(data, data_shape) if self.start_pack: # Operator cases @@ -169,11 +226,17 @@ def visit_call(self, call): data, weight = args data_shape = _to_shape(input_types[0].shape) kernel_shape = _to_shape(input_types[1].shape) + channels = call.attrs.channels + weight, kernel_shape, channels = _weight_shape_match(weight, + kernel_shape, + channels, + self.cfactor) kernel = _pack_weight(weight, kernel_shape, self.cfactor) # insert bit packing when necessary if w_lanes != 1: assert 8 % w_lanes == 0 kernel = op.bitpack(kernel, lanes=w_lanes) + conv2d = op.nn.conv2d( data, kernel, @@ -181,7 +244,7 @@ def visit_call(self, call): padding=call.attrs.padding, dilation=call.attrs.dilation, groups=call.attrs.groups, - channels=call.attrs.channels, + channels=channels, kernel_size=call.attrs.kernel_size, data_layout=data_layout, kernel_layout=kernel_layout, @@ -198,6 +261,11 @@ def visit_call(self, call): data, weight = args data_shape = _to_shape(input_types[0].shape) kernel_shape = _to_shape(input_types[1].shape) + channels = call.attrs.channels + weight, kernel_shape, channels = _weight_shape_match_transpose(weight, + kernel_shape, + channels, + self.cfactor) kernel = _pack_weight_conv2d_transpose(weight, kernel_shape, self.cfactor) conv2d = op.nn.conv2d_transpose( data, @@ -218,8 +286,11 @@ def visit_call(self, call): pass elif call.op == self.add and len(input_types[1].shape) == 3: data, const = args + const, input_shape = _const_shape_match(const, + input_types[1].shape, + self.cfactor) const = _pack_const(const, - _to_shape(input_types[1].shape), + _to_shape(input_shape), input_types[1].dtype, self.bfactor, self.cfactor) @@ -247,6 +318,36 @@ def visit_call(self, call): input_types[0].dtype == 'int32': cast = relay.Call(op.op.get('cast'), [args[0]], call.attrs) return relay.Call(op.op.get('copy'), [cast]) + elif call.op == self.pad: + pad_width = call.attrs.pad_width + if len(pad_width) == 6: + pass + elif len(pad_width) == 4: + data, = args + new_pad_width = [] + new_pad_width.extend(pad_width) + for _ in range(2): + new_pad_width.append([0, 0]) + return op.nn.pad(data, + pad_value=call.attrs.pad_value, + pad_width=new_pad_width) + elif call.op == self.upsampling: + data, = args + scale_h = call.attrs.scale_h + scale_w = call.attrs.scale_w + data_layout = "NCHW%dn%dc" % (self.bfactor, self.cfactor) + method = call.attrs.method + align_corners = call.attrs.align_corners + return op.nn.upsampling(data, + scale_h, + scale_w, + data_layout, 
+                                    method,
+                                    align_corners)
+        elif call.op == self.reshape and len(input_types[0].shape) == 4:
+            data, = args
+            data = op.transpose(data, axes=(0, 4, 1, 5, 2, 3))
+            return op.reshape(data, input_types[0].shape)
 
         return relay.Call(
             self.visit(call.op),
diff --git a/vta/tutorials/frontend/deploy_vision_on_vta.py b/vta/tutorials/frontend/deploy_classification.py
similarity index 100%
rename from vta/tutorials/frontend/deploy_vision_on_vta.py
rename to vta/tutorials/frontend/deploy_classification.py
diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py
new file mode 100644
index 000000000000..09d8465f3da5
--- /dev/null
+++ b/vta/tutorials/frontend/deploy_detection.py
@@ -0,0 +1,330 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Deploy Pretrained Vision Detection Model from Darknet on VTA
+============================================================
+**Author**: `Hua Jiang <https://github.com/huajsj>`_
+
+This tutorial provides an end-to-end demo of how to run Darknet YoloV3-tiny
+inference on the VTA accelerator design to perform image detection tasks.
+It showcases Relay as a front-end compiler that can perform quantization (VTA
+only supports int8/32 inference) as well as graph packing (in order to enable
+tensorization in the core) to massage the compute graph for the hardware target.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install "Pillow<7"
+#
+# The YOLO-V3-tiny model with Darknet parsing depends on the CFFI and CV2
+# libraries, so we need to install them before executing this script:
+#
+# .. code-block:: bash
+#
+#   pip3 install cffi
+#   pip3 install opencv-python
+#
+# Now return to the python code. Import packages.
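+#
+# If you run this script on a headless machine, matplotlib may need a
+# non-interactive backend (an assumption about your environment, not a
+# requirement of the tutorial), e.g.:
+#
+# .. code-block:: python
+#
+#   import matplotlib
+#   matplotlib.use('Agg')  # must be called before pyplot is imported
+#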
+
+from __future__ import absolute_import, print_function
+
+import sys
+import os
+import time
+import matplotlib.pyplot as plt
+import numpy as np
+import tvm
+import vta
+from tvm import rpc, autotvm, relay
+from tvm.relay.testing import yolo_detection, darknet
+from tvm.relay.testing.darknet import __darknetffi__
+from tvm.contrib import graph_runtime, util
+from tvm.contrib.download import download_testdata
+from vta.testing import simulator
+from vta.top import graph_pack
+# Make sure that TVM was compiled with RPC=1
+assert tvm.runtime.enabled("rpc")
+
+##############################################################################
+# Download the yolo net config file, weight file, and darknet library file
+# based on the model name
+# ----------------------------------------------------------------------------
+MODEL_NAME = 'yolov3-tiny'
+REPO_URL = 'https://github.com/dmlc/web-data/blob/master/darknet/'
+
+cfg_path = download_testdata('https://github.com/pjreddie/darknet/blob/master/cfg/' +
+                             MODEL_NAME + '.cfg' + '?raw=true',
+                             MODEL_NAME + '.cfg',
+                             module="darknet")
+weights_path = download_testdata('https://pjreddie.com/media/files/' +
+                                 MODEL_NAME + '.weights' + '?raw=true',
+                                 MODEL_NAME + '.weights',
+                                 module="darknet")
+
+if sys.platform in ['linux', 'linux2']:
+    darknet_lib_path = download_testdata(REPO_URL + 'lib/' + 'libdarknet2.0.so' + '?raw=true',
+                                         'libdarknet2.0.so',
+                                         module="darknet")
+elif sys.platform == 'darwin':
+    darknet_lib_path = download_testdata(REPO_URL + 'lib_osx/' + 'libdarknet_mac2.0.so' + '?raw=true',
+                                         'libdarknet_mac2.0.so',
+                                         module="darknet")
+else:
+    raise NotImplementedError("Darknet lib is not supported on {} platform"
+                              .format(sys.platform))
+
+##################################################
+# Download yolo categories and illustration font.
+# ------------------------------------------------
+coco_path = download_testdata(REPO_URL + 'data/' + 'coco.names' + '?raw=true',
+                              'coco.names',
+                              module='data')
+font_path = download_testdata(REPO_URL + 'data/' + 'arial.ttf' + '?raw=true',
+                              'arial.ttf',
+                              module='data')
+with open(coco_path) as f:
+    content = f.readlines()
+names = [x.strip() for x in content]
+
+########################################
+# Define the platform and model targets.
+# --------------------------------------
+# Execute on CPU vs. VTA, and define the model.
+
+# Load VTA parameters from the vta/config/vta_config.json file
+env = vta.get_env()
+# Set ``device=arm_cpu`` to run inference on the CPU
+# or ``device=vta`` to run inference on the FPGA.
+device = "vta"
+target = env.target if device == "vta" else env.target_vta_cpu
+
+pack_dict = {
+    "yolov3-tiny": ["nn.max_pool2d", "cast", 4, 185],
+}
+
+# Name of the Darknet model to compile.
+# The ``start_pack`` and ``stop_pack`` labels indicate where
+# to start and end the graph packing relay pass: in other words
+# where to start and finish offloading to VTA.
+# The number 4 indicates that the ``start_pack`` index is 4 and the
+# number 185 indicates that the ``stop_pack`` index is 185. Using the
+# name together with the index number, we can locate the correct place
+# to start/end when there are multiple ``nn.max_pool2d`` or ``cast``
+# operators; print(mod.astext(show_meta_data=False)) can help
+# to find the operator name and index information.
+assert MODEL_NAME in pack_dict
+
+#############################
+# Obtain an execution remote.
+# ---------------------------
+# When the target is 'pynq' or another FPGA backend, reconfigure the FPGA and runtime.
+# Otherwise, if the target is 'sim', execute locally.
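+#
+# For example, assuming a tracker is already running on your host machine
+# (a sketch of one possible setup, not a requirement):
+#
+# .. code-block:: bash
+#
+#   export TVM_TRACKER_HOST=0.0.0.0
+#   export TVM_TRACKER_PORT=9190
+#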
+ +if env.TARGET not in ["sim", "tsim"]: + # Get remote from tracker node if environment variable is set. + # To set up the tracker, you'll need to follow the "Auto-tuning + # a convolutional network for VTA" tutorial. + tracker_host = os.environ.get("TVM_TRACKER_HOST", None) + tracker_port = os.environ.get("TVM_TRACKER_PORT", None) + # Otherwise if you have a device you want to program directly from + # the host, make sure you've set the variables below to the IP of + # your board. + device_host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") + device_port = os.environ.get("VTA_PYNQ_RPC_PORT", "9091") + if not tracker_host or not tracker_port: + remote = rpc.connect(device_host, int(device_port)) + else: + remote = autotvm.measure.request_remote(env.TARGET, + tracker_host, + int(tracker_port), + timeout=10000) + # Reconfigure the JIT runtime and FPGA. + # You can program the FPGA with your own custom bitstream + # by passing the path to the bitstream file instead of None. + reconfig_start = time.time() + vta.reconfig_runtime(remote) + vta.program_fpga(remote, bitstream=None) + reconfig_time = time.time() - reconfig_start + print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) + +# In simulation mode, host the RPC server locally. +else: + remote = rpc.LocalSession() + +# Get execution context from remote +ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) + +#################################### +# Build the inference graph runtime. +# ---------------------------------- +# Using Darknet library load downloaded vision model and compile with Relay. +# The compilation steps are: +# +# 1. Front end translation from Darknet into Relay module. +# 2. Apply 8-bit quantization: here we skip the first conv layer, +# and dense layer which will both be executed in fp32 on the CPU. +# 3. Perform graph packing to alter the data layout for tensorization. +# 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). +# 5. Perform relay build to object file. +# 6. Load the object file onto remote (FPGA device). +# 7. Generate graph runtime, `m`. 
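+#
+# Note that steps 2-4 apply only when targeting VTA; for ``arm_cpu`` the
+# Relay module is compiled directly.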
+# + +# Load pre-configured AutoTVM schedules +with autotvm.tophub.context(target): + net = __darknetffi__.dlopen(darknet_lib_path).load_network(cfg_path.encode('utf-8'), + weights_path.encode('utf-8'), + 0) + dshape = (env.BATCH, net.c, net.h, net.w) + dtype = 'float32' + + # Measure build start time + build_start = time.time() + + # Start front end compilation + mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=dshape) + + if target.device_name == "vta": + # Perform quantization in Relay + # Note: We set opt_level to 3 in order to fold batch norm + with relay.build_config(opt_level=3): + with relay.quantize.qconfig(global_scale=33.0, + skip_conv_layers=[0], + store_lowbit_output=True, + round_for_shift=True): + mod = relay.quantize.quantize(mod, params=params) + # Perform graph packing and constant folding for VTA target + mod = graph_pack( + mod["main"], + env.BATCH, + env.BLOCK_OUT, + env.WGT_WIDTH, + start_name=pack_dict[MODEL_NAME][0], + stop_name=pack_dict[MODEL_NAME][1], + start_name_idx=pack_dict[MODEL_NAME][2], + stop_name_idx=pack_dict[MODEL_NAME][3]) + else: + mod = mod["main"] + + # Compile Relay program with AlterOpLayout disabled + with vta.build_config(disabled_pass={"AlterOpLayout"}): + graph, lib, params = relay.build( + mod, + target=target, + params=params, + target_host=env.target_host) + + # Measure Relay build time + build_time = time.time() - build_start + print(MODEL_NAME + " inference graph built in {0:.2f}s!".format(build_time)) + + # Send the inference library over to the remote RPC server + temp = util.tempdir() + lib.save(temp.relpath("graphlib.o")) + remote.upload(temp.relpath("graphlib.o")) + lib = remote.load_module("graphlib.o") + + # Graph runtime + m = graph_runtime.create(graph, lib, ctx) + +#################################### +# Perform image detection inference. 
+# ---------------------------------- +# We run detect on an downloaded image +# Download test image +[neth, netw] = dshape[2:] +test_image = 'person.jpg' +img_url = REPO_URL + 'data/' + test_image + '?raw=true' +img_path = download_testdata(img_url, test_image, "data") +data = darknet.load_image(img_path, neth, netw).transpose(1, 2, 0) + +# Prepare test image for inference +plt.imshow(data) +plt.show() +data = data.transpose((2, 0, 1)) +data = data[np.newaxis, :] +data = np.repeat(data, env.BATCH, axis=0) + +# Set the network parameters and inputs +m.set_input('data', data) +m.set_input(**params) + +# Perform inference and gather execution statistics +# More on: https://docs.tvm.ai/api/python/module.html#tvm.runtime.Module.time_evaluator +num = 4 # number of times we run module for a single measurement +rep = 3 # number of measurements (we derive std dev from this) +timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) + +if env.TARGET in ["sim", "tsim"]: + simulator.clear_stats() + timer() + sim_stats = simulator.stats() + print("\nExecution statistics:") + for k, v in sim_stats.items(): + # Since we execute the workload many times, we need to normalize stats + # Note that there is always one warm up run + # Therefore we divide the overall stats by (num * rep + 1) + print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) +else: + tcost = timer() + std = np.std(tcost.results) * 1000 + mean = tcost.mean * 1000 + print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) + print("Average per sample inference time: %.2fms" % (mean/env.BATCH)) + +# Get detection results from out +thresh = 0.5 +nms_thresh = 0.45 +tvm_out = [] +for i in range(2): + layer_out = {} + layer_out['type'] = 'Yolo' + # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) + layer_attr = m.get_output(i*4+3).asnumpy() + layer_out['biases'] = m.get_output(i*4+2).asnumpy() + layer_out['mask'] = m.get_output(i*4+1).asnumpy() + out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0], + layer_attr[2], layer_attr[3]) + layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape) + layer_out['classes'] = layer_attr[4] + tvm_out.append(layer_out) + thresh = 0.560 + +# Show detection results +img = darknet.load_image_color(img_path) +_, im_h, im_w = img.shape +dets = yolo_detection.fill_network_boxes((netw, neth), + (im_w, im_h), + thresh, + 1, + tvm_out) +last_layer = net.layers[net.n - 1] +yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) +yolo_detection.draw_detections(font_path, + img, + dets, + thresh, + names, + last_layer.classes) +plt.imshow(img.transpose(1, 2, 0)) +plt.show() From c9be16bd51bbc1fcba0246bb560473bc65692bb0 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 26 Feb 2020 16:44:09 -0800 Subject: [PATCH 47/73] [TUTORIAL] Fix tedd tutorial after strategy change (#4947) * [TUTORIAL] Fix tedd tutorial after strategy change * Remove scale, remove link to external gdoc --- tutorials/language/tedd.py | 169 ++++++++++++++++++------------------- 1 file changed, 82 insertions(+), 87 deletions(-) diff --git a/tutorials/language/tedd.py b/tutorials/language/tedd.py index aa3fa152a519..36146880d150 100644 --- a/tutorials/language/tedd.py +++ b/tutorials/language/tedd.py @@ -1,42 +1,40 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Use Tensor Expression Debug Display (TEDD) for Visualization
-============================================================
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Use Tensor Expression Debug Display (TEDD) for Visualization
+============================================================
 **Author**: `Yongfeng Gu `_

-This is an introduction about using TEDD to visualize tensor expressions.
+This is an introduction to using TEDD to visualize tensor expressions.

-Tensor Expressions are scheduled with primitives. Although individual
-primitives are usually easy to understand, they become complicated quickly
-when you put them together. We have introduced an operational model of
-schedule primitives in Tensor Expression in this document
-(https://docs.google.com/document/d/1nmz00_n4Ju-SpYN0QFl3abTHTlR_P0dRyo5zsWC0Q1k/edit?usp=sharing)
-to make it easier to understand
+Tensor Expressions are scheduled with primitives. Although individual
+primitives are usually easy to understand, they become complicated quickly
+when you put them together. We have introduced an operational model of
+schedule primitives in Tensor Expression to make it easier to understand

-* the interactions between different schedule primitives,
-* the impact of the schedule primitives on the final code generation.
+* the interactions between different schedule primitives,
+* the impact of the schedule primitives on the final code generation.

-The operational model is based on a Dataflow Graph, a Schedule Tree and an
-IterVar Relationship Graph. Schedule primitives perform operations on these
+The operational model is based on a Dataflow Graph, a Schedule Tree and an
+IterVar Relationship Graph. Schedule primitives perform operations on these
 graphs.

-TEDD renders these three graphs from a given schedule. This tutorial demonstrates
-how to use TEDD and how to interpret the rendered graphs.
+TEDD renders these three graphs from a given schedule. This tutorial demonstrates
+how to use TEDD and how to interpret the rendered graphs.
""" from __future__ import absolute_import, print_function @@ -63,39 +61,38 @@ A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A') W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') B = tvm.placeholder((1, num_filter, 1), name='bias') -with tvm.target.create("cuda"): +with tvm.target.create("llvm"): t_conv = topi.nn.conv2d(A, W, stride, padding, dilation, layout='HWCN') t_bias = topi.add(t_conv, B) t_relu = topi.nn.relu(t_bias) - s = topi.generic.schedule_conv2d_hwcn([t_relu]) - -###################################################################### + s = topi.generic.schedule_conv2d_hwcn([t_relu]) + +###################################################################### # Render Graphs with TEDD # ----------------------- -# We render graphs to see the computation -# and how it is scheduled. -# If you run the tutorial in a Jupyter notebook, you can use the following commented lines +# We render graphs to see the computation +# and how it is scheduled. +# If you run the tutorial in a Jupyter notebook, you can use the following commented lines # to render SVG figures showing in notebook directly. # -tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/dfg.dot') -#tedd.viz_dataflow_graph(s, show_svg = True) +tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/dfg.dot') +#tedd.viz_dataflow_graph(s, show_svg = True) ###################################################################### # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_dfg.png # :align: center -# :scale: 100% # -# The first one is a dataflow graph. Every node represents a stage with name and memory -# scope shown in the middle and inputs/outputs information on the sides. -# Edges show nodes' dependency. +# The first one is a dataflow graph. Every node represents a stage with name and memory +# scope shown in the middle and inputs/outputs information on the sides. +# Edges show nodes' dependency. # -tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree.dot') -#tedd.viz_schedule_tree(s, show_svg = True) +tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree.dot') +#tedd.viz_schedule_tree(s, show_svg = True) ###################################################################### -# We just rendered the schedule tree graph. You may notice an warning about ranges not +# We just rendered the schedule tree graph. You may notice an warning about ranges not # available. # The message also suggests to call normalize() to infer range information. We will # skip inspecting the first schedule tree and encourage you to compare the graphs before @@ -103,62 +100,60 @@ # s = s.normalize() -tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree2.dot') -#tedd.viz_schedule_tree(s, show_svg = True) +tedd.viz_schedule_tree(s, dot_file_path = '/tmp/scheduletree2.dot') +#tedd.viz_schedule_tree(s, show_svg = True) ###################################################################### # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_st.png # :align: center -# :scale: 100% # -# Now, let us take a close look at the second schedule tree. Every block under ROOT -# represents a -# stage. Stage name shows in the top row and compute shows in the bottom row. -# The middle rows are for IterVars, the higher the outer, the lower the inner. -# An IterVar row contains its index, name, type, and other optional information. -# Let's use the W.shared stage as an example. The top row tells -# its name, "W.shared", and memory scope, "Shared". 
-# :code:`W(ax0, ax1, ax2, ax3)`.
-# Its outer most loop IterVar is ax0.ax1.fused.ax2.fused.ax3.fused.outer,
-# indexed with 0, of kDataPar, bound to threadIdx.y, and with range(min=0, ext=8).
-# You can also tell
-# IterVar type with the index box color, shown in the legend.
-#
-# If a stage doesn't compute_at any other stage, it has an edge directly to the
-# ROOT node. Otherwise, it has an edge pointing to the IterVar it attaches to,
-# such as W.shared attaches to rx.outer in the middle compute stage.
+# Now, let us take a close look at the second schedule tree. Every block under ROOT
+# represents a
+# stage. Stage name shows in the top row and compute shows in the bottom row.
+# The middle rows are for IterVars, the higher the outer, the lower the inner.
+# An IterVar row contains its index, name, type, and other optional information.
+# Let's use the W.shared stage as an example. The top row tells
+# its name, "W.shared", and memory scope, "Shared". Its compute is
+# :code:`W(ax0, ax1, ax2, ax3)`.
+# Its outermost loop IterVar is ax0.ax1.fused.ax2.fused.ax3.fused.outer,
+# indexed with 0, of kDataPar, bound to threadIdx.y, and with range(min=0, ext=8).
+# You can also tell an
+# IterVar's type from its index box color, shown in the legend.
+#
+# If a stage doesn't compute_at any other stage, it has an edge directly to the
+# ROOT node. Otherwise, it has an edge pointing to the IterVar it attaches to,
+# such as W.shared attaches to rx.outer in the middle compute stage.
 #

-######################################################################
-# .. note::
-#
-#   By definition, IterVars are internal nodes and computes are leaf nodes in
-#   a schedule tree. The edges among IterVars and compute within one stage are
+######################################################################
+# .. note::
+#
+#   By definition, IterVars are internal nodes and computes are leaf nodes in
+#   a schedule tree. The edges among IterVars and compute within one stage are
 #   omitted, making every stage a block, for better readability.
 #

-tedd.viz_itervar_relationship_graph(s, dot_file_path = '/tmp/itervar.dot')
-#tedd.viz_itervar_relationship_graph(s, show_svg = True)
+tedd.viz_itervar_relationship_graph(s, dot_file_path = '/tmp/itervar.dot')
+#tedd.viz_itervar_relationship_graph(s, show_svg = True)

######################################################################
# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tedd_itervar_rel.png
#      :align: center
-#      :scale: 100%
#
-# The last one is an IterVar Relationship Graph. Every subgraph represents a
-# stage and contains IterVar nodes and transformation nodes. For example,
-# W.shared has three split nodes and three fuse nodes. The rest are IterVar
-# nodes of the same format as the IterVar rows in Schedule Trees. Root
-# IterVars are those not driven by any transformation node, such as ax0; leaf
-# IterVars don't drive any transformation node and have non-negative indices,
+# The last one is an IterVar Relationship Graph. Every subgraph represents a
+# stage and contains IterVar nodes and transformation nodes. For example,
+# W.shared has three split nodes and three fuse nodes. The rest are IterVar
+# nodes of the same format as the IterVar rows in Schedule Trees. Root
+# IterVars are those not driven by any transformation node, such as ax0; leaf
+# IterVars don't drive any transformation node and have non-negative indices,
 # such as ax0.ax1.fused.ax2.fused.ax3.fused.outer with index of 0.
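+
+######################################################################
+# .. note::
+#
+#   A minimal sketch of where such transformation nodes come from, assuming
+#   the same imports as this tutorial; the names ``X``, ``Y``, ``sy``, ``xo``
+#   and ``xi`` are hypothetical. One split() plus one fuse() produces one
+#   split node and one fuse node in the rendered graph:
+#
+#   .. code-block:: python
+#
+#     X = tvm.placeholder((1024,), name='X')
+#     Y = tvm.compute(X.shape, lambda i: X[i] * 2.0, name='Y')
+#     sy = tvm.create_schedule(Y.op)
+#     # split() adds a split node and two child IterVars (outer and inner)
+#     xo, xi = sy[Y].split(Y.op.axis[0], factor=32)
+#     # fuse() adds a fuse node that merges the two IterVars back into one
+#     fused = sy[Y].fuse(xo, xi)
+#     tedd.viz_itervar_relationship_graph(sy, dot_file_path='/tmp/itervar_small.dot')
+#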
# -###################################################################### -# Summary -# ------- -# This tutorial demonstrates the usage of TEDD. We use an example built -# with TOPI to show the schedules under the hood. You can also use +###################################################################### +# Summary +# ------- +# This tutorial demonstrates the usage of TEDD. We use an example built +# with TOPI to show the schedules under the hood. You can also use # it before and after any schedule primitive to inspect its effect. -# \ No newline at end of file +# From 9816efc2df63cf6a14a6de46dc2adfafde58acc1 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 26 Feb 2020 21:10:47 -0800 Subject: [PATCH 48/73] [REFACTOR][PY][API-CHANGE] Remove legacy python files. (#4943) * [REFACTOR][PY][API-CHANGE] Remove legacy python files. Remove legacy python files. Use the te namespace for most of the tensor expression primitives. - tvm.create_schedule -> tvm.te.create_schedule - tvm.placeholder -> tvm.te.placeholder - tvm.compute -> tvm.te.compute * Remove top-level exposures. --- apps/android_rpc/tests/android_rpc_test.py | 21 +- apps/benchmark/arm_cpu_imagenet_bench.py | 1 + apps/benchmark/gpu_imagenet_bench.py | 1 + apps/benchmark/mobile_gpu_imagenet_bench.py | 1 + apps/bundle_deploy/build_model.py | 1 + apps/dso_plugin_module/test_plugin_module.py | 1 + apps/extension/python/tvm_ext/__init__.py | 1 + apps/extension/tests/test_ext.py | 23 +- apps/howto_deploy/prepare_test_libs.py | 9 +- apps/howto_deploy/python_deploy.py | 1 + apps/ios_rpc/tests/ios_rpc_test.py | 15 +- apps/sgx/enclave/src/build_model.py | 1 + apps/sgx/run_model.py | 1 + docs/api/python/te.rst | 1 + docs/api/python/tir.rst | 2 +- docs/conf.py | 1 + golang/sample/deploy.py | 11 +- jvm/core/src/test/scripts/test_add_cpu.py | 11 +- jvm/core/src/test/scripts/test_add_gpu.py | 15 +- .../src/test/scripts/test_graph_runtime.py | 7 +- python/tvm/__init__.py | 13 +- python/tvm/api.py | 38 -- python/tvm/arith/analyzer.py | 2 +- python/tvm/autotvm/feature.py | 25 +- .../autotvm/graph_tuner/base_graph_tuner.py | 5 +- python/tvm/autotvm/measure/measure_methods.py | 12 +- python/tvm/autotvm/task/code_hash.py | 4 +- python/tvm/autotvm/task/space.py | 34 +- python/tvm/autotvm/task/task.py | 29 +- python/tvm/autotvm/task/topi_integration.py | 6 +- python/tvm/autotvm/util.py | 2 +- python/tvm/contrib/binutil.py | 11 +- python/tvm/contrib/cblas.py | 6 +- python/tvm/contrib/cublas.py | 7 +- python/tvm/contrib/cublaslt.py | 4 +- python/tvm/contrib/cudnn.py | 13 +- python/tvm/contrib/debugger/debug_result.py | 1 + python/tvm/contrib/miopen.py | 9 +- python/tvm/contrib/mps.py | 7 +- python/tvm/contrib/nnpack.py | 14 +- python/tvm/contrib/nvcc.py | 5 +- python/tvm/contrib/peak.py | 27 +- python/tvm/contrib/random.py | 8 +- python/tvm/contrib/rocblas.py | 5 +- python/tvm/contrib/rocm.py | 10 +- python/tvm/contrib/sdaccel.py | 5 +- python/tvm/contrib/sparse.py | 13 +- python/tvm/contrib/tedd.py | 12 +- python/tvm/driver/build_module.py | 22 +- python/tvm/hybrid/calls.py | 25 +- python/tvm/hybrid/parser.py | 36 +- python/tvm/hybrid/util.py | 7 +- python/tvm/intrin.py | 19 - python/tvm/make.py | 52 -- python/tvm/relay/__init__.py | 2 +- python/tvm/relay/backend/_backend.py | 4 +- python/tvm/relay/backend/compile_engine.py | 17 +- .../relay/backend/graph_runtime_codegen.py | 2 +- python/tvm/relay/build_module.py | 2 +- python/tvm/relay/debug.py | 8 +- python/tvm/relay/frontend/coreml.py | 1 - python/tvm/relay/frontend/darknet.py | 1 - 
python/tvm/relay/frontend/mxnet.py | 2 - python/tvm/relay/frontend/pytorch.py | 2 +- python/tvm/relay/frontend/tensorflow.py | 5 +- python/tvm/relay/frontend/tflite.py | 1 - python/tvm/relay/op/_reduce.py | 2 +- python/tvm/relay/op/_tensor.py | 4 +- python/tvm/relay/op/_transform.py | 7 +- python/tvm/relay/op/algorithm.py | 2 +- python/tvm/relay/op/nn/_nn.py | 3 +- python/tvm/relay/op/op.py | 15 +- python/tvm/relay/param_dict.py | 6 +- python/tvm/relay/quantize/quantize.py | 6 +- python/tvm/relay/testing/__init__.py | 1 + python/tvm/relay/testing/config.py | 1 + python/tvm/relay/transform.py | 1 + python/tvm/runtime/vm.py | 1 + python/tvm/target/build_config.py | 2 +- python/tvm/target/generic_func.py | 2 + python/tvm/te/__init__.py | 3 + python/tvm/te/operation.py | 22 +- python/tvm/te/tag.py | 20 +- python/tvm/tir/__init__.py | 6 +- python/tvm/tir/buffer.py | 14 +- python/tvm/tir/expr.py | 10 +- python/tvm/tir/generic.py | 2 +- python/tvm/tir/ir_builder.py | 18 +- python/tvm/tir/op.py | 34 +- python/tvm/tir/stmt.py | 4 +- .../examples/resnet/src/build_resnet.py | 1 + rust/frontend/tests/basics/src/tvm_add.py | 15 +- rust/runtime/tests/build_model.py | 1 + .../tests/test_nn/src/build_test_graph.py | 1 + .../test_tvm_basic/src/build_test_lib.py | 11 +- .../tests/test_tvm_dso/src/build_test_lib.py | 11 +- tests/python/contrib/test_binutil.py | 1 + tests/python/contrib/test_cblas.py | 23 +- tests/python/contrib/test_cublas.py | 19 +- tests/python/contrib/test_cudnn.py | 13 +- tests/python/contrib/test_dlpack.py | 13 +- tests/python/contrib/test_edgetpu_runtime.py | 1 + tests/python/contrib/test_gemm_acc16.py | 17 +- tests/python/contrib/test_gemm_acc32_vnni.py | 13 +- tests/python/contrib/test_miopen.py | 5 +- tests/python/contrib/test_mps.py | 23 +- tests/python/contrib/test_mxnet_bridge.py | 9 +- tests/python/contrib/test_nnpack.py | 27 +- tests/python/contrib/test_random.py | 7 +- tests/python/contrib/test_rocblas.py | 7 +- tests/python/contrib/test_rpc_proxy.py | 1 + tests/python/contrib/test_rpc_tracker.py | 1 + tests/python/contrib/test_sort.py | 21 +- tests/python/contrib/test_sparse.py | 33 +- tests/python/contrib/test_tedd.py | 48 +- tests/python/contrib/test_tflite_runtime.py | 9 +- tests/python/frontend/caffe2/test_forward.py | 1 + tests/python/frontend/coreml/test_forward.py | 1 + tests/python/frontend/darknet/test_forward.py | 1 + tests/python/frontend/keras/test_forward.py | 1 + tests/python/frontend/mxnet/test_forward.py | 1 + tests/python/frontend/mxnet/test_graph.py | 1 + .../frontend/mxnet/test_qnn_ops_utils.py | 1 + tests/python/frontend/onnx/test_forward.py | 1 + tests/python/frontend/pytorch/test_forward.py | 3 +- .../frontend/tensorflow/test_forward.py | 3 +- tests/python/frontend/tflite/test_forward.py | 1 + tests/python/integration/test_dot.py | 31 +- tests/python/integration/test_ewise.py | 91 +-- tests/python/integration/test_ewise_fpga.py | 27 +- tests/python/integration/test_gemm.py | 23 +- tests/python/integration/test_reduce.py | 123 ++-- tests/python/integration/test_scan.py | 23 +- tests/python/integration/test_tuning.py | 41 +- .../integration/test_winograd_nnpack.py | 7 +- .../test_quantization_accuracy.py | 1 + .../python/relay/benchmarking/benchmark_vm.py | 3 +- tests/python/relay/test_adt.py | 1 + tests/python/relay/test_any.py | 1 + .../relay/test_backend_compile_engine.py | 7 +- .../relay/test_backend_graph_runtime.py | 1 + .../python/relay/test_backend_interpreter.py | 1 + tests/python/relay/test_change_batch.py | 1 + 
tests/python/relay/test_cpp_build_module.py | 1 + tests/python/relay/test_error_reporting.py | 1 + tests/python/relay/test_expr_functor.py | 1 + tests/python/relay/test_external_codegen.py | 1 + tests/python/relay/test_external_runtime.py | 1 + tests/python/relay/test_feature.py | 1 + tests/python/relay/test_ir_bind.py | 1 + tests/python/relay/test_ir_module.py | 1 + tests/python/relay/test_ir_nodes.py | 41 +- tests/python/relay/test_ir_parser.py | 1 + tests/python/relay/test_ir_text_printer.py | 3 +- tests/python/relay/test_ir_well_formed.py | 1 + tests/python/relay/test_json_compact.py | 1 + tests/python/relay/test_memory_alloc.py | 1 + tests/python/relay/test_op_grad_level1.py | 1 + tests/python/relay/test_op_grad_level2.py | 5 +- tests/python/relay/test_op_grad_level3.py | 1 + tests/python/relay/test_op_level1.py | 37 +- tests/python/relay/test_op_level10.py | 5 +- tests/python/relay/test_op_level2.py | 73 +-- tests/python/relay/test_op_level3.py | 31 +- tests/python/relay/test_op_level4.py | 7 +- tests/python/relay/test_op_level5.py | 23 +- tests/python/relay/test_op_level6.py | 1 + tests/python/relay/test_op_qnn_add.py | 1 + tests/python/relay/test_op_qnn_concatenate.py | 1 + tests/python/relay/test_op_qnn_conv2d.py | 1 + tests/python/relay/test_op_qnn_dense.py | 1 + tests/python/relay/test_op_qnn_dequantize.py | 1 + tests/python/relay/test_op_qnn_mul.py | 1 + tests/python/relay/test_op_qnn_quantize.py | 1 + tests/python/relay/test_op_qnn_requantize.py | 1 + tests/python/relay/test_param_dict.py | 1 + tests/python/relay/test_pass_alpha_equal.py | 93 +-- .../python/relay/test_pass_alter_op_layout.py | 1 + tests/python/relay/test_pass_annotation.py | 1 + tests/python/relay/test_pass_auto_quantize.py | 1 + .../relay/test_pass_canonicalize_cast.py | 1 + tests/python/relay/test_pass_check_kind.py | 57 +- .../test_pass_combine_parallel_conv2d.py | 1 + .../relay/test_pass_combine_parallel_dense.py | 1 + .../relay/test_pass_convert_op_layout.py | 1 + .../relay/test_pass_dead_code_elimination.py | 3 +- .../test_pass_eliminate_common_subexpr.py | 1 + tests/python/relay/test_pass_eta_expand.py | 1 + tests/python/relay/test_pass_fold_constant.py | 3 +- .../python/relay/test_pass_fold_scale_axis.py | 1 + tests/python/relay/test_pass_fuse_ops.py | 1 + tests/python/relay/test_pass_gradient.py | 1 + tests/python/relay/test_pass_lambda_lift.py | 1 + tests/python/relay/test_pass_legalize.py | 1 + tests/python/relay/test_pass_mac_count.py | 11 +- tests/python/relay/test_pass_manager.py | 1 + tests/python/relay/test_pass_partial_eval.py | 1 + .../python/relay/test_pass_partition_graph.py | 1 + tests/python/relay/test_pass_qnn_legalize.py | 1 + .../test_pass_remove_unused_functions.py | 1 + .../relay/test_pass_to_a_normal_form.py | 1 + tests/python/relay/test_pass_to_cps.py | 1 + .../relay/test_pass_to_graph_normal_form.py | 1 + .../python/relay/test_pass_unmatched_cases.py | 1 + tests/python/relay/test_pass_vars.py | 1 + tests/python/relay/test_py_converter.py | 1 + tests/python/relay/test_type_functor.py | 3 +- tests/python/relay/test_type_infer.py | 1 + tests/python/relay/test_type_solver.py | 1 + tests/python/relay/test_typecall.py | 1 + tests/python/relay/test_vm.py | 1 + tests/python/relay/test_vm_serialization.py | 1 + .../unittest/test_arith_canonical_simplify.py | 133 ++--- .../unittest/test_arith_const_int_bound.py | 53 +- .../unittest/test_arith_deduce_bound.py | 63 +- .../unittest/test_arith_detect_clip_bound.py | 19 +- .../test_arith_detect_linear_equation.py | 21 +- 
.../unittest/test_arith_domain_touched.py | 13 +- tests/python/unittest/test_arith_intset.py | 29 +- .../python/unittest/test_arith_modular_set.py | 37 +- .../unittest/test_arith_rewrite_simplify.py | 557 +++++++++--------- .../unittest/test_arith_stmt_simplify.py | 57 +- tests/python/unittest/test_autotvm_common.py | 21 +- tests/python/unittest/test_autotvm_feature.py | 41 +- .../unittest/test_autotvm_flop_calculator.py | 79 +-- tests/python/unittest/test_autotvm_measure.py | 1 + tests/python/unittest/test_autotvm_record.py | 1 + tests/python/unittest/test_autotvm_space.py | 11 +- .../unittest/test_autotvm_xgboost_model.py | 1 + tests/python/unittest/test_build_lower.py | 27 +- tests/python/unittest/test_codegen_arm.py | 33 +- tests/python/unittest/test_codegen_blob.py | 13 +- tests/python/unittest/test_codegen_bool.py | 17 +- tests/python/unittest/test_codegen_c_host.py | 45 +- .../unittest/test_codegen_cross_llvm.py | 11 +- tests/python/unittest/test_codegen_cuda.py | 105 ++-- tests/python/unittest/test_codegen_device.py | 55 +- tests/python/unittest/test_codegen_extern.py | 47 +- tests/python/unittest/test_codegen_llvm.py | 213 +++---- tests/python/unittest/test_codegen_opencl.py | 41 +- tests/python/unittest/test_codegen_rocm.py | 47 +- .../unittest/test_codegen_static_init.py | 31 +- .../python/unittest/test_codegen_vm_basic.py | 49 +- tests/python/unittest/test_codegen_vulkan.py | 39 +- tests/python/unittest/test_codegen_x86.py | 9 +- tests/python/unittest/test_container.py | 1 + .../test_custom_datatypes_mybfloat16.py | 17 +- .../python/unittest/test_graph_tuner_core.py | 15 +- .../python/unittest/test_graph_tuner_utils.py | 1 + tests/python/unittest/test_hybrid_script.py | 136 ++--- tests/python/unittest/test_ir_builder.py | 49 +- tests/python/unittest/test_lang_basic.py | 117 ++-- tests/python/unittest/test_lang_buffer.py | 129 ++-- .../python/unittest/test_lang_constructor.py | 33 +- tests/python/unittest/test_lang_container.py | 23 +- .../python/unittest/test_lang_data_layout.py | 21 +- tests/python/unittest/test_lang_group.py | 45 +- tests/python/unittest/test_lang_operator.py | 95 +-- tests/python/unittest/test_lang_reflection.py | 21 +- tests/python/unittest/test_lang_schedule.py | 161 ++--- tests/python/unittest/test_lang_tag.py | 64 +- tests/python/unittest/test_lang_target.py | 1 + tests/python/unittest/test_lang_tensor.py | 255 ++++---- .../unittest/test_lang_tensor_overload_op.py | 65 +- .../unittest/test_lang_verify_compute.py | 33 +- .../unittest/test_pass_attrs_hash_equal.py | 21 +- tests/python/unittest/test_pass_basic.py | 41 +- .../unittest/test_pass_bound_checkers.py | 267 ++++----- .../test_pass_combine_context_call.py | 19 +- .../test_pass_decorate_device_scope.py | 21 +- tests/python/unittest/test_pass_equal.py | 39 +- tests/python/unittest/test_pass_hoist_if.py | 55 +- .../unittest/test_pass_inject_copy_intrin.py | 109 ++-- .../test_pass_inject_double_buffer.py | 15 +- .../unittest/test_pass_inject_vthread.py | 39 +- tests/python/unittest/test_pass_inline.py | 25 +- .../python/unittest/test_pass_ir_transform.py | 17 +- .../unittest/test_pass_lift_attr_scope.py | 13 +- .../unittest/test_pass_loop_partition.py | 311 +++++----- .../python/unittest/test_pass_lower_intrin.py | 43 +- .../unittest/test_pass_lower_warp_memory.py | 15 +- tests/python/unittest/test_pass_makeapi.py | 25 +- .../python/unittest/test_pass_remove_no_op.py | 19 +- .../test_pass_rewrite_for_tensor_core.py | 71 +-- .../test_pass_rewrite_unsafe_select.py | 13 +- 
.../unittest/test_pass_split_host_device.py | 9 +- .../unittest/test_pass_storage_flatten.py | 73 +-- .../unittest/test_pass_storage_rewrite.py | 321 +++++----- .../python/unittest/test_pass_storage_sync.py | 59 +- tests/python/unittest/test_pass_unroll.py | 41 +- tests/python/unittest/test_pass_vectorize.py | 53 +- .../unittest/test_pass_verify_gpu_code.py | 75 +-- .../unittest/test_pass_verify_memory.py | 61 +- .../unittest/test_pass_virtual_thread.py | 27 +- tests/python/unittest/test_runtime_error.py | 1 + .../python/unittest/test_runtime_extension.py | 13 +- tests/python/unittest/test_runtime_graph.py | 7 +- .../unittest/test_runtime_graph_debug.py | 7 +- .../unittest/test_runtime_heterogeneous.py | 35 +- tests/python/unittest/test_runtime_measure.py | 5 +- tests/python/unittest/test_runtime_micro.py | 25 +- .../unittest/test_runtime_module_export.py | 19 +- .../unittest/test_runtime_module_load.py | 32 +- tests/python/unittest/test_runtime_ndarray.py | 7 +- .../unittest/test_runtime_packed_func.py | 103 ++-- tests/python/unittest/test_runtime_rpc.py | 21 +- .../unittest/test_runtime_vm_profiler.py | 1 + .../unittest/test_schedule_bound_inference.py | 315 +++++----- tests/python/unittest/test_schedule_graph.py | 119 ++-- tests/python/unittest/test_schedule_lstm.py | 51 +- .../unittest/test_schedule_schedule_ops.py | 433 +++++++------- .../unittest/test_schedule_tensor_core.py | 119 ++-- .../unittest/test_schedule_tensorize.py | 225 +++---- tests/python/unittest/test_testing.py | 1 + tests/python/unittest/test_tvm_intrin.py | 9 +- tests/web/prepare_test_libs.py | 9 +- tests/web/websock_rpc_test.py | 9 +- tests/webgl/test_local_gemm.py | 15 +- tests/webgl/test_local_multi_stage.py | 11 +- tests/webgl/test_local_save_load.py | 11 +- tests/webgl/test_local_topi_conv2d_nchw.py | 7 +- tests/webgl/test_local_topi_dense.py | 7 +- tests/webgl/test_local_topi_pooling.py | 5 +- tests/webgl/test_local_topi_softmax.py | 9 +- tests/webgl/test_remote_save_load.py | 11 +- tests/webgl/test_static_webgl_library.py | 9 +- topi/python/topi/argwhere.py | 24 +- topi/python/topi/arm_cpu/bitserial_conv2d.py | 170 +++--- topi/python/topi/arm_cpu/bitserial_dense.py | 39 +- topi/python/topi/arm_cpu/conv2d.py | 97 +-- topi/python/topi/arm_cpu/conv2d_alter_op.py | 19 +- topi/python/topi/arm_cpu/conv2d_int8.py | 15 +- .../topi/arm_cpu/conv2d_spatial_pack.py | 120 ++-- topi/python/topi/arm_cpu/conv2d_transpose.py | 51 +- topi/python/topi/arm_cpu/depthwise_conv2d.py | 87 +-- topi/python/topi/arm_cpu/injective.py | 13 +- topi/python/topi/arm_cpu/tensor_intrin.py | 47 +- topi/python/topi/bifrost/conv2d.py | 105 ++-- topi/python/topi/bifrost/dense.py | 21 +- topi/python/topi/bifrost/depthwise_conv2d.py | 19 +- topi/python/topi/bifrost/gemm.py | 89 ++- topi/python/topi/bifrost/transforms.py | 25 +- topi/python/topi/broadcast.py | 150 ++--- topi/python/topi/cuda/batch_matmul.py | 23 +- topi/python/topi/cuda/conv1d.py | 53 +- topi/python/topi/cuda/conv1d_transpose_ncw.py | 55 +- topi/python/topi/cuda/conv2d.py | 10 +- topi/python/topi/cuda/conv2d_alter_op.py | 23 +- topi/python/topi/cuda/conv2d_direct.py | 27 +- topi/python/topi/cuda/conv2d_hwcn.py | 31 +- topi/python/topi/cuda/conv2d_int8.py | 89 +-- .../python/topi/cuda/conv2d_transpose_nchw.py | 81 +-- topi/python/topi/cuda/conv2d_winograd.py | 107 ++-- topi/python/topi/cuda/conv3d.py | 28 +- topi/python/topi/cuda/conv3d_direct.py | 29 +- topi/python/topi/cuda/deformable_conv2d.py | 31 +- topi/python/topi/cuda/dense.py | 79 ++- topi/python/topi/cuda/depthwise_conv2d.py 
| 63 +- topi/python/topi/cuda/group_conv2d_nchw.py | 127 ++-- topi/python/topi/cuda/injective.py | 15 +- topi/python/topi/cuda/nms.py | 245 ++++---- topi/python/topi/cuda/pooling.py | 37 +- topi/python/topi/cuda/rcnn/proposal.py | 139 ++--- topi/python/topi/cuda/reduction.py | 17 +- topi/python/topi/cuda/softmax.py | 10 +- topi/python/topi/cuda/sort.py | 156 ++--- topi/python/topi/cuda/ssd/multibox.py | 136 +++-- topi/python/topi/cuda/tensor_intrin.py | 25 +- topi/python/topi/cuda/vision.py | 5 +- topi/python/topi/generic/__init__.py | 2 +- topi/python/topi/generic/conv2d.py | 11 +- topi/python/topi/generic/extern.py | 2 - topi/python/topi/generic/injective.py | 7 +- topi/python/topi/generic/nn.py | 12 +- topi/python/topi/generic/vision.py | 7 +- topi/python/topi/generic_op_impl.py | 20 +- topi/python/topi/hls/injective.py | 9 +- topi/python/topi/hls/nn.py | 59 +- topi/python/topi/image/resize.py | 165 +++--- topi/python/topi/intel_graphics/conv2d.py | 113 ++-- .../topi/intel_graphics/conv2d_alter_op.py | 9 +- .../topi/intel_graphics/depthwise_conv2d.py | 61 +- topi/python/topi/mali/conv2d.py | 138 ++--- topi/python/topi/mali/dense.py | 21 +- topi/python/topi/mali/depthwise_conv2d.py | 31 +- topi/python/topi/math.py | 196 +++--- topi/python/topi/nn/batch_matmul.py | 17 +- topi/python/topi/nn/bitserial_conv2d.py | 70 +-- topi/python/topi/nn/bitserial_dense.py | 25 +- topi/python/topi/nn/bitserial_util.py | 8 +- topi/python/topi/nn/bnn.py | 31 +- topi/python/topi/nn/conv1d.py | 31 +- topi/python/topi/nn/conv1d_transpose.py | 23 +- topi/python/topi/nn/conv2d.py | 241 ++++---- topi/python/topi/nn/conv2d_transpose.py | 24 +- topi/python/topi/nn/conv3d.py | 39 +- topi/python/topi/nn/deformable_conv2d.py | 43 +- topi/python/topi/nn/dense.py | 27 +- topi/python/topi/nn/depth_to_space.py | 19 +- topi/python/topi/nn/depthwise_conv2d.py | 95 +-- topi/python/topi/nn/dilate.py | 21 +- topi/python/topi/nn/elemwise.py | 33 +- topi/python/topi/nn/fifo_buffer.py | 129 ++-- topi/python/topi/nn/flatten.py | 13 +- topi/python/topi/nn/local_response_norm.py | 4 +- topi/python/topi/nn/mapping.py | 25 +- topi/python/topi/nn/pad.py | 33 +- topi/python/topi/nn/pooling.py | 26 +- topi/python/topi/nn/softmax.py | 51 +- topi/python/topi/nn/space_to_depth.py | 19 +- topi/python/topi/nn/sparse.py | 47 +- topi/python/topi/nn/upsampling.py | 31 +- topi/python/topi/nn/util.py | 4 +- topi/python/topi/nn/winograd_util.py | 2 +- topi/python/topi/opengl/conv2d_nchw.py | 9 +- topi/python/topi/opengl/dense.py | 9 +- topi/python/topi/opengl/injective.py | 8 +- topi/python/topi/opengl/pooling.py | 16 +- topi/python/topi/opengl/softmax.py | 6 +- topi/python/topi/reduction.py | 32 +- topi/python/topi/rocm/conv2d.py | 8 +- topi/python/topi/rocm/dense.py | 35 +- topi/python/topi/sort.py | 89 +-- topi/python/topi/sparse/csrmm.py | 35 +- topi/python/topi/sparse/csrmv.py | 35 +- topi/python/topi/sparse/dense.py | 79 +-- topi/python/topi/tensor.py | 10 +- .../topi/testing/conv2d_transpose_python.py | 4 +- .../topi/testing/conv3d_ncdhw_python.py | 2 +- .../topi/testing/conv3d_ndhwc_python.py | 2 +- .../topi/testing/depthwise_conv2d_python.py | 16 +- topi/python/topi/testing/pool3d_python.py | 4 +- topi/python/topi/testing/pool_grad_python.py | 2 +- topi/python/topi/testing/roi_align_python.py | 4 +- topi/python/topi/transform.py | 159 ++--- topi/python/topi/util.py | 85 +-- topi/python/topi/vision/nms.py | 95 +-- topi/python/topi/vision/rcnn/proposal.py | 115 ++-- topi/python/topi/vision/rcnn/roi_align.py | 41 +- 
topi/python/topi/vision/rcnn/roi_pool.py | 51 +- topi/python/topi/vision/reorg.py | 4 +- topi/python/topi/vision/ssd/multibox.py | 53 +- topi/python/topi/x86/batch_matmul.py | 23 +- topi/python/topi/x86/binarize_pack.py | 7 +- topi/python/topi/x86/binary_dense.py | 9 +- topi/python/topi/x86/bitserial_conv2d.py | 89 +-- topi/python/topi/x86/bitserial_dense.py | 41 +- topi/python/topi/x86/conv1d.py | 19 +- topi/python/topi/x86/conv2d.py | 33 +- topi/python/topi/x86/conv2d_alter_op.py | 33 +- topi/python/topi/x86/conv2d_avx_1x1.py | 35 +- topi/python/topi/x86/conv2d_avx_common.py | 5 +- topi/python/topi/x86/conv2d_int8.py | 25 +- topi/python/topi/x86/conv2d_transpose.py | 4 +- topi/python/topi/x86/conv3d.py | 79 +-- topi/python/topi/x86/dense.py | 79 +-- topi/python/topi/x86/depthwise_conv2d.py | 33 +- topi/python/topi/x86/injective.py | 15 +- topi/python/topi/x86/nn.py | 7 +- topi/python/topi/x86/pooling.py | 16 +- topi/python/topi/x86/reduction.py | 10 +- topi/python/topi/x86/roi_align.py | 56 +- topi/python/topi/x86/sparse.py | 4 +- topi/python/topi/x86/tensor_intrin.py | 187 +++--- topi/python/topi/x86/util.py | 2 +- topi/recipe/broadcast/test_broadcast_map.py | 7 +- topi/recipe/conv/depthwise_conv2d_test.py | 21 +- topi/recipe/conv/test_conv2d_hwcn_map.py | 7 +- topi/recipe/conv/test_conv_int8_arm.py | 7 +- topi/recipe/conv/test_conv_int8_intel.py | 7 +- topi/recipe/gemm/android_gemm_square.py | 27 +- topi/recipe/gemm/cuda_gemm_square.py | 31 +- topi/recipe/gemm/gemm_int8.py | 23 +- topi/recipe/reduce/test_reduce_map.py | 5 +- topi/recipe/rnn/lstm.py | 53 +- topi/recipe/rnn/matexp.py | 31 +- topi/tests/python/common.py | 1 + topi/tests/python/test_fifo_buffer.py | 17 +- topi/tests/python/test_topi_basic.py | 9 +- topi/tests/python/test_topi_batch_matmul.py | 5 +- .../python/test_topi_bitserial_conv2d.py | 9 +- .../python/test_topi_bitserial_conv2d_rasp.py | 5 +- .../tests/python/test_topi_bitserial_dense.py | 5 +- topi/tests/python/test_topi_bnn.py | 9 +- topi/tests/python/test_topi_broadcast.py | 35 +- topi/tests/python/test_topi_clip.py | 5 +- topi/tests/python/test_topi_conv1d.py | 5 +- .../python/test_topi_conv1d_transpose_ncw.py | 5 +- topi/tests/python/test_topi_conv2d_NCHWc.py | 7 +- topi/tests/python/test_topi_conv2d_hwcn.py | 7 +- topi/tests/python/test_topi_conv2d_int8.py | 7 +- topi/tests/python/test_topi_conv2d_nchw.py | 7 +- topi/tests/python/test_topi_conv2d_nhwc.py | 5 +- .../python/test_topi_conv2d_nhwc_pack_int8.py | 5 +- .../python/test_topi_conv2d_transpose_nchw.py | 5 +- .../tests/python/test_topi_conv2d_winograd.py | 7 +- topi/tests/python/test_topi_conv3d_ncdhw.py | 7 +- topi/tests/python/test_topi_conv3d_ndhwc.py | 5 +- .../python/test_topi_deformable_conv2d.py | 9 +- topi/tests/python/test_topi_dense.py | 13 +- topi/tests/python/test_topi_depth_to_space.py | 3 +- .../python/test_topi_depthwise_conv2d.py | 21 +- .../test_topi_depthwise_conv2d_back_input.py | 5 +- .../test_topi_depthwise_conv2d_back_weight.py | 5 +- topi/tests/python/test_topi_dilate.py | 5 +- topi/tests/python/test_topi_group_conv2d.py | 13 +- .../test_topi_group_conv2d_NCHWc_int8.py | 5 +- topi/tests/python/test_topi_image.py | 15 +- topi/tests/python/test_topi_lrn.py | 3 +- topi/tests/python/test_topi_math.py | 25 +- topi/tests/python/test_topi_matmul.py | 5 +- topi/tests/python/test_topi_pooling.py | 15 +- topi/tests/python/test_topi_reduce.py | 3 +- topi/tests/python/test_topi_relu.py | 13 +- topi/tests/python/test_topi_reorg.py | 3 +- topi/tests/python/test_topi_softmax.py | 11 +- 
topi/tests/python/test_topi_sort.py | 5 +- topi/tests/python/test_topi_space_to_depth.py | 3 +- topi/tests/python/test_topi_sparse.py | 71 +-- topi/tests/python/test_topi_tensor.py | 17 +- topi/tests/python/test_topi_transform.py | 85 +-- topi/tests/python/test_topi_upsampling.py | 11 +- topi/tests/python/test_topi_vision.py | 31 +- tutorials/autotvm/tune_conv2d_cuda.py | 31 +- tutorials/autotvm/tune_relay_arm.py | 1 + tutorials/autotvm/tune_relay_cuda.py | 1 + tutorials/autotvm/tune_relay_mobile_gpu.py | 1 + tutorials/autotvm/tune_relay_x86.py | 1 + tutorials/autotvm/tune_simple_template.py | 31 +- tutorials/cross_compilation_and_rpc.py | 15 +- tutorials/dev/low_level_custom_pass.py | 29 +- tutorials/dev/relay_pass_infra.py | 1 + tutorials/frontend/build_gcn.py | 1 + tutorials/frontend/deploy_model_on_android.py | 1 + tutorials/frontend/deploy_model_on_rasp.py | 1 + tutorials/frontend/deploy_quantized.py | 1 + tutorials/frontend/deploy_ssd_gluoncv.py | 3 +- tutorials/frontend/from_caffe2.py | 1 + tutorials/frontend/from_coreml.py | 1 + tutorials/frontend/from_darknet.py | 1 + tutorials/frontend/from_keras.py | 1 + tutorials/frontend/from_mxnet.py | 1 + tutorials/frontend/from_onnx.py | 1 + tutorials/frontend/from_tensorflow.py | 1 + tutorials/frontend/from_tflite.py | 1 + tutorials/frontend/using_external_lib.py | 1 + tutorials/language/extern_op.py | 29 +- tutorials/language/intrin_math.py | 61 +- tutorials/language/reduction.py | 67 +-- tutorials/language/scan.py | 63 +- tutorials/language/schedule_primitives.py | 85 +-- tutorials/language/tedd.py | 13 +- tutorials/language/tensorize.py | 71 +-- tutorials/language/tuple_inputs.py | 49 +- tutorials/optimize/opt_conv_cuda.py | 39 +- tutorials/optimize/opt_conv_tensorcore.py | 95 +-- tutorials/optimize/opt_gemm.py | 31 +- .../optimize/opt_matmul_auto_tensorcore.py | 43 +- tutorials/relay_quick_start.py | 1 + tutorials/tensor_expr_get_started.py | 17 +- tutorials/topi/intro_topi.py | 27 +- vta/apps/gemm/python/tsim.py | 1 + vta/apps/gemm/tests/python/chisel_accel.py | 23 +- vta/apps/tsim_example/python/tsim.py | 1 + .../tsim_example/tests/python/chisel_accel.py | 1 + .../tests/python/verilog_accel.py | 1 + vta/python/vta/build_module.py | 14 +- vta/python/vta/environment.py | 11 +- vta/python/vta/intrin.py | 47 +- vta/python/vta/ir_pass.py | 171 +++--- vta/python/vta/pkg_config.py | 24 +- vta/python/vta/top/bitpack.py | 7 +- vta/python/vta/top/op.py | 15 +- vta/python/vta/top/vta_conv2d.py | 23 +- vta/python/vta/top/vta_conv2d_transpose.py | 23 +- vta/python/vta/top/vta_dense.py | 15 +- vta/python/vta/top/vta_group_conv2d.py | 23 +- vta/scripts/tune_conv2d.py | 19 +- vta/scripts/tune_conv2d_transpose.py | 17 +- vta/scripts/tune_dense.py | 17 +- vta/scripts/tune_group_conv2d.py | 19 +- vta/scripts/tune_resnet.py | 15 +- .../python/integration/test_benchmark_gemm.py | 31 +- .../integration/test_benchmark_topi_conv2d.py | 17 +- .../test_benchmark_topi_conv2d_transpose.py | 15 +- .../integration/test_benchmark_topi_dense.py | 15 +- .../test_benchmark_topi_group_conv2d.py | 17 +- vta/tests/python/pynq/test_program_rpc.py | 1 + vta/tests/python/unittest/test_vta_insn.py | 101 ++-- vta/tutorials/autotvm/tune_relay_vta.py | 13 +- .../frontend/deploy_classification.py | 1 + vta/tutorials/matrix_multiply.py | 25 +- vta/tutorials/optimize/convolution_opt.py | 35 +- vta/tutorials/optimize/matrix_multiply_opt.py | 31 +- vta/tutorials/vta_get_started.py | 15 +- 595 files changed, 9038 insertions(+), 8687 deletions(-) delete mode 100644 python/tvm/api.py 
delete mode 100644 python/tvm/intrin.py delete mode 100644 python/tvm/make.py diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index 122d07faf9e5..32af005d7d4d 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -22,6 +22,7 @@ """ import tvm +from tvm import te import os from tvm import rpc from tvm.contrib import util, ndk @@ -44,9 +45,9 @@ def test_rpc_module(): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') a_np = np.random.uniform(size=1024).astype(A.dtype) temp = util.tempdir() @@ -56,7 +57,7 @@ def test_rpc_module(): session_timeout=60) # Compile the Graph for CPU target - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") @@ -79,10 +80,10 @@ def test_rpc_module(): # Compile the Graph for OpenCL target if test_opencl: - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") @@ -102,10 +103,10 @@ def test_rpc_module(): # Compile the Graph for Vulkan target if test_vulkan: - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) # Build the dynamic lib. 
# If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd") diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py index 5403e9610c32..53b616868bdd 100644 --- a/apps/benchmark/arm_cpu_imagenet_bench.py +++ b/apps/benchmark/arm_cpu_imagenet_bench.py @@ -22,6 +22,7 @@ import numpy as np import tvm +from tvm import te from tvm.contrib.util import tempdir import tvm.contrib.graph_runtime as runtime from tvm import relay diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py index fd96be6ad66c..dfb0445bf214 100644 --- a/apps/benchmark/gpu_imagenet_bench.py +++ b/apps/benchmark/gpu_imagenet_bench.py @@ -23,6 +23,7 @@ import numpy as np import tvm +from tvm import te import tvm.contrib.graph_runtime as runtime from tvm import relay diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py index d5d60a245be3..4f93a0d5e383 100644 --- a/apps/benchmark/mobile_gpu_imagenet_bench.py +++ b/apps/benchmark/mobile_gpu_imagenet_bench.py @@ -22,6 +22,7 @@ import numpy as np import tvm +from tvm import te from tvm.contrib.util import tempdir import tvm.contrib.graph_runtime as runtime from tvm import relay diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py index de9e73522ca2..37e302449016 100644 --- a/apps/bundle_deploy/build_model.py +++ b/apps/bundle_deploy/build_model.py @@ -20,6 +20,7 @@ import os from tvm import relay import tvm +from tvm import te import logging diff --git a/apps/dso_plugin_module/test_plugin_module.py b/apps/dso_plugin_module/test_plugin_module.py index 6304ef9573fb..0704dd0f5f2d 100644 --- a/apps/dso_plugin_module/test_plugin_module.py +++ b/apps/dso_plugin_module/test_plugin_module.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import os def test_plugin_module(): diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py index 31b149eb4913..377db7c1c6ea 100644 --- a/apps/extension/python/tvm_ext/__init__.py +++ b/apps/extension/python/tvm_ext/__init__.py @@ -21,6 +21,7 @@ import ctypes # Import TVM first to get library symbols import tvm +from tvm import te def load_lib(): """Load library, the functions will be registered into TVM""" diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index 257ecd684175..f7e17d2fdc62 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -16,6 +16,8 @@ # under the License. 
import tvm_ext import tvm +import tvm._ffi.registry +from tvm import te import numpy as np def test_bind_add(): @@ -26,9 +28,9 @@ def add(a, b): def test_ext_dev(): n = 10 - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) def check_llvm(): if not tvm.runtime.enabled("llvm"): return @@ -43,8 +45,8 @@ def check_llvm(): def test_sym_add(): - a = tvm.var('a') - b = tvm.var('b') + a = te.var('a') + b = te.var('b') c = tvm_ext.sym_add(a, b) assert c.a == a and c.b == b @@ -59,19 +61,20 @@ def ivec_cb(v2): assert(isinstance(v2, tvm_ext.IntVec)) assert v2[2] == 3 - tvm.convert(ivec_cb)(ivec) + tvm.runtime.convert(ivec_cb)(ivec) def test_extract_ext(): - fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare) + fdict = tvm._ffi.registry.extract_ext_funcs( + tvm_ext._LIB.TVMExtDeclare) assert fdict["mul"](3, 4) == 12 def test_extern_call(): n = 10 - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda *i: tvm.call_extern("float32", "TVMTestAddOne", A(*i)), name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda *i: tvm.tir.call_extern("float32", "TVMTestAddOne", A(*i)), name='B') + s = te.create_schedule(B.op) def check_llvm(): if not tvm.runtime.enabled("llvm"): diff --git a/apps/howto_deploy/prepare_test_libs.py b/apps/howto_deploy/prepare_test_libs.py index b620bc7a1d5f..88d9f8ed5902 100644 --- a/apps/howto_deploy/prepare_test_libs.py +++ b/apps/howto_deploy/prepare_test_libs.py @@ -16,13 +16,14 @@ # under the License. """Script to prepare test_addone.so""" import tvm +from tvm import te import os def prepare_test_libs(base_path): - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) # Compile library as dynamic library fadd_dylib = tvm.build(s, [A, B], "llvm", name="addone") dylib_path = os.path.join(base_path, "test_addone_dll.so") diff --git a/apps/howto_deploy/python_deploy.py b/apps/howto_deploy/python_deploy.py index 07a27fe9426c..2a443253a6ad 100644 --- a/apps/howto_deploy/python_deploy.py +++ b/apps/howto_deploy/python_deploy.py @@ -19,6 +19,7 @@ # file python_deploy.py import tvm +from tvm import te import numpy as np def verify(mod, fname): diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index ac3718f7ba8e..973c252be175 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -21,6 +21,7 @@ """ import tvm +from tvm import te import os import re import sys @@ -54,14 +55,14 @@ def compile_metal(src): def test_rpc_module(): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') temp = util.tempdir() - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) # Build the dynamic lib. 
# If we don't want to do metal and only use cpu, just set target to be target f = tvm.build(s, [A, B], "metal", target_host=target, name="myadd") @@ -70,7 +71,7 @@ def test_rpc_module(): arch=arch, sdk=sdk) xcode.codesign(path_dso1) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") diff --git a/apps/sgx/enclave/src/build_model.py b/apps/sgx/enclave/src/build_model.py index dff571668422..f8906d1a0e02 100644 --- a/apps/sgx/enclave/src/build_model.py +++ b/apps/sgx/enclave/src/build_model.py @@ -23,6 +23,7 @@ from tvm import relay from tvm.relay import testing import tvm +from tvm import te def main(): diff --git a/apps/sgx/run_model.py b/apps/sgx/run_model.py index fb39e34dc601..c7af96328ec6 100644 --- a/apps/sgx/run_model.py +++ b/apps/sgx/run_model.py @@ -17,6 +17,7 @@ import os.path as osp import numpy as np import tvm +from tvm import te CWD = osp.abspath(osp.dirname(__file__)) diff --git a/docs/api/python/te.rst b/docs/api/python/te.rst index dc3d3dacd2ca..1f70c4d384bb 100644 --- a/docs/api/python/te.rst +++ b/docs/api/python/te.rst @@ -23,6 +23,7 @@ tvm.te :members: :imported-members: :exclude-members: + any, all, min_value, max_value, trace, exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil, trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else, div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod, diff --git a/docs/api/python/tir.rst b/docs/api/python/tir.rst index d1017cdb46ef..ea1ac669b273 100644 --- a/docs/api/python/tir.rst +++ b/docs/api/python/tir.rst @@ -20,5 +20,5 @@ tvm.tir .. automodule:: tvm.tir :members: :imported-members: - :exclude-members: PrimExpr + :exclude-members: PrimExpr, const :autosummary: diff --git a/docs/conf.py b/docs/conf.py index 05f4cfc970d1..d882f75d83a7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,6 +61,7 @@ os.environ['TVM_BUILD_DOC'] = '1' # Version information. import tvm +from tvm import te version = tvm.__version__ release = tvm.__version__ diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py index 3b221369dbc4..d523b9c85ffe 100644 --- a/golang/sample/deploy.py +++ b/golang/sample/deploy.py @@ -21,6 +21,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np # Global declarations of environment. 
@@ -31,15 +32,15 @@ ###################################################################### # Describe the Computation # ------------------------ -n = tvm.var("n") -A = tvm.placeholder((n,), name='A') -B = tvm.placeholder((n,), name='B') -C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") +n = te.var("n") +A = te.placeholder((n,), name='A') +B = te.placeholder((n,), name='B') +C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") ###################################################################### # Schedule the Computation # ------------------------ -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) ###################################################################### # Compilation diff --git a/jvm/core/src/test/scripts/test_add_cpu.py b/jvm/core/src/test/scripts/test_add_cpu.py index dd7e4a8de73a..bda66f8c5c73 100644 --- a/jvm/core/src/test/scripts/test_add_cpu.py +++ b/jvm/core/src/test/scripts/test_add_cpu.py @@ -17,14 +17,15 @@ import os import tvm +from tvm import te from tvm.contrib import cc, util def test_add(target_dir): - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = tvm.create_schedule(C.op) + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + s = te.create_schedule(C.op) fadd = tvm.build(s, [A, B, C], "llvm", target_host="llvm", name="myadd") fadd.save(os.path.join(target_dir, "add_cpu.o")) diff --git a/jvm/core/src/test/scripts/test_add_gpu.py b/jvm/core/src/test/scripts/test_add_gpu.py index e3f4fbfedaf3..d520054a4c75 100644 --- a/jvm/core/src/test/scripts/test_add_gpu.py +++ b/jvm/core/src/test/scripts/test_add_gpu.py @@ -17,22 +17,23 @@ import os import tvm +from tvm import te from tvm.contrib import cc, util def test_add(target_dir): if not tvm.runtime.enabled("cuda"): print("skip %s because cuda is not enabled..." 
% __file__) return - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) fadd_cuda = tvm.build(s, [A, B, C], "cuda", target_host="llvm", name="myadd") fadd_cuda.save(os.path.join(target_dir, "add_gpu.o")) diff --git a/jvm/core/src/test/scripts/test_graph_runtime.py b/jvm/core/src/test/scripts/test_graph_runtime.py index 4d82973ae031..63a76d116923 100644 --- a/jvm/core/src/test/scripts/test_graph_runtime.py +++ b/jvm/core/src/test/scripts/test_graph_runtime.py @@ -17,14 +17,15 @@ import os import tvm +from tvm import te import json from tvm.contrib import graph_runtime def dump_graph_lib(target_dir): dim = 4 - A = tvm.placeholder((dim,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - sched = tvm.create_schedule(B.op) + A = te.placeholder((dim,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + sched = te.create_schedule(B.op) node0 = {"op": "null", "name": "x", "inputs": []} node1 = {"op": "tvm_op", "name": "add", diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 65cb67266de6..0c4ca139c631 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -24,7 +24,7 @@ # tvm._ffi from ._ffi.base import TVMError, __version__ from ._ffi.runtime_ctypes import TypeCode, DataType -from ._ffi.registry import register_object, register_func, register_extension +from ._ffi import register_object, register_func, register_extension, get_global_func # top-level alias # tvm.runtime @@ -47,10 +47,9 @@ # tvm.target from . import target -from .target import build_config # tvm.te -from .te import decl_tensor_intrin, create_schedule, tag_scope +from . import te # tvm.testing from . import testing @@ -64,14 +63,6 @@ # others from . import arith -# backward compact for topi, to be removed later -from .api import * -from .tir import expr, stmt, ir_builder, ir_pass, generic -from .te import tensor, schedule -from .tir.op import * -from . import intrin -from . import make - # Contrib initializers from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel diff --git a/python/tvm/api.py b/python/tvm/api.py deleted file mode 100644 index 9afaf03ee255..000000000000 --- a/python/tvm/api.py +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Functions defined in TVM.""" -# pylint: disable=invalid-name,unused-import,redefined-builtin -import tvm._ffi -import tvm.ir -import tvm.tir - -from tvm.runtime import convert, const, DataType -from tvm.ir import container as _container, Range -from tvm.tir import decl_buffer, layout, bijective_layout -from tvm.tir import min_value, max_value, indexdiv, indexmod, all, any -from tvm.te import placeholder, compute, scan, extern, var, size_var, thread_axis, reduce_axis - - -from ._ffi.base import string_types, TVMError -from ._ffi.registry import register_func, get_global_func, extract_ext_funcs - -from . import make as _make - -int8 = "int8" -int32 = "int32" -float32 = "float32" -handle = "handle" diff --git a/python/tvm/arith/analyzer.py b/python/tvm/arith/analyzer.py index 382a7e033e75..5a420ad81755 100644 --- a/python/tvm/arith/analyzer.py +++ b/python/tvm/arith/analyzer.py @@ -212,7 +212,7 @@ def constraint_scope(self, constraint): -------- .. code-block:: python - x = tvm.var("x") + x = te.var("x") analyzer = tvm.arith.Analyzer() with analzyer.constraint_scope(x % 3 == 0): # constraint in effect diff --git a/python/tvm/autotvm/feature.py b/python/tvm/autotvm/feature.py index 4ff1139d85f1..c576ffd76e56 100644 --- a/python/tvm/autotvm/feature.py +++ b/python/tvm/autotvm/feature.py @@ -28,8 +28,11 @@ import struct import numpy as np +import tvm._ffi -from tvm import schedule, ir_pass, get_global_func, target as _target +from tvm import target as _target +from tvm.tir import ir_pass +from tvm.te import schedule from tvm.driver import build_module def ana_lower(sch, args, @@ -49,10 +52,12 @@ def ana_lower(sch, args, return stmt try: - _get_buffer_curve_sample_flatten = get_global_func( + _get_buffer_curve_sample_flatten = tvm._ffi.get_global_func( "autotvm.feature.GetCurveSampleFeatureFlatten") - _get_itervar_feature = get_global_func("autotvm.feature.GetItervarFeature") - _get_itervar_feature_flatten = get_global_func("autotvm.feature.GetItervarFeatureFlatten") + _get_itervar_feature = tvm._ffi.get_global_func( + "autotvm.feature.GetItervarFeature") + _get_itervar_feature_flatten = tvm._ffi.get_global_func( + "autotvm.feature.GetItervarFeatureFlatten") except ValueError as e: def raise_error(*args, **kwargs): # pylint: disable=unused-argument raise RuntimeError("Cannot load autotvm c++ API") @@ -64,8 +69,8 @@ def get_itervar_feature(sch, args, take_log=False): Parameters ---------- - sch: tvm.schedule.Schedule - args: Array of tvm.tensor.Tensor + sch: tvm.te.schedule.Schedule + args: Array of te.tensor.Tensor the buffer args for lower take_log: bool whether take log of numerical statics @@ -112,8 +117,8 @@ def get_itervar_feature_flatten(sch, args, take_log=True): Parameters ---------- - sch: tvm.schedule.Schedule - args: Array of tvm.tensor.Tensor + sch: tvm.te.schedule.Schedule + args: Array of te.tensor.Tensor the buffer args for lower take_log: bool whether take log of numerical statics @@ -185,8 +190,8 @@ def get_buffer_curve_sample_flatten(sch, args, sample_n=30): Parameters ---------- - sch: tvm.schedule.Schedule - args: Array of tvm.tensor.Tensor + sch: tvm.te.schedule.Schedule + args: Array of te.tensor.Tensor the buffer args for lower sample_n: int number of sample points along one dimension diff --git a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py index 3e85e938fa82..c6b79fabdaf5 100644 --- a/python/tvm/autotvm/graph_tuner/base_graph_tuner.py +++ b/python/tvm/autotvm/graph_tuner/base_graph_tuner.py @@ -23,6 +23,7 @@ 
import topi import tvm +from tvm import te from tvm import autotvm, relay from tvm.autotvm.task import get_config from tvm.autotvm.record import encode, load_from_file @@ -301,8 +302,8 @@ def _iterate_layout_transform(self, callback): _, out_layout = o_input_info[0] else: _, out_layout = o_output_info[0] - data_placeholder = tvm.placeholder(in_shape, name="data", - dtype=self._dtype) + data_placeholder = te.placeholder(in_shape, name="data", + dtype=self._dtype) args = [data_placeholder, in_layout, out_layout] callback(i_idx, o_idx, m, n, args) diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 44e6de934649..698ddbc68dd7 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -33,9 +33,13 @@ import numpy as np -from ... import ir_pass, build, build_config, nd, TVMError, register_func, \ - rpc as _rpc, target as _target -from ...contrib import nvcc, ndk, tar +import tvm._ffi +from tvm import nd, rpc as _rpc, target as _target +from tvm.tir import ir_pass +from tvm.error import TVMError +from tvm.target import build_config +from tvm.driver import build +from tvm.contrib import nvcc, ndk, tar from ..util import get_const_tuple from ..env import AutotvmGlobalScope @@ -581,7 +585,7 @@ def _check(): return not t.is_alive() -@register_func +@tvm._ffi.register_func def tvm_callback_cuda_compile(code): """use nvcc to generate ptx code for better optimization""" curr_cuda_target_arch = AutotvmGlobalScope.current.cuda_target_arch diff --git a/python/tvm/autotvm/task/code_hash.py b/python/tvm/autotvm/task/code_hash.py index 9410f526c45f..3076970f84c9 100644 --- a/python/tvm/autotvm/task/code_hash.py +++ b/python/tvm/autotvm/task/code_hash.py @@ -22,7 +22,7 @@ import inspect import zlib -from tvm import schedule +from tvm.te import schedule def attach_code_hash(s): """Decorator for attaching a code hash to a schedule @@ -30,7 +30,7 @@ def attach_code_hash(s): Parameters ---------- s: Schedule - tvm.schedule.Schedule to attach the hash to + tvm.te.schedule.Schedule to attach the hash to """ def decorator(func): def wrapper(*args, **kwargs): diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index 47c227073677..fbf474fc4df7 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -32,7 +32,7 @@ from collections import namedtuple, OrderedDict import numpy as np -from tvm import schedule, thread_axis +from tvm.te import schedule, thread_axis from tvm.autotvm.util import get_const_int Axis = namedtuple('Axis', ['space', 'index']) @@ -57,7 +57,7 @@ class TransformSpace(object): .. note:: We can regard our schedule code as a transformation graph of axes. - Starting from raw axes in the definition of tvm.compute, we can transform these axes + Starting from raw axes in the definition of te.compute, we can transform these axes by some operators. The operator includes 'split', 'reorder' and 'annotate'. Each operator has some tunable parameters (e.g. the split factor). Then the tuning process is just to find good parameters of these op. @@ -106,7 +106,7 @@ class VirtualAxis(TransformSpace): Parameters ---------- - var: int or tvm.schedule.IterVar + var: int or tvm.te.schedule.IterVar If is int, return a virtual axis whose length is the provided argument. If is IterVar, return a virtual axis whose length is extracted from the IterVar's extent domain. 
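
The ConfigSpace docstrings in this file describe tuning knobs ('split', 'reorder', 'annotate') applied to te axes. A minimal sketch of a template that defines and applies split knobs, following the matmul example quoted in task.py below; the task name "matmul_example" is illustrative:

    import tvm
    from tvm import te, autotvm

    @autotvm.register_customized_task("matmul_example")
    def matmul(N, L, M, dtype):
        A = te.placeholder((N, L), name="A", dtype=dtype)
        B = te.placeholder((L, M), name="B", dtype=dtype)
        k = te.reduce_axis((0, L), name="k")
        C = te.compute((N, M),
                       lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
                       name="C")
        s = te.create_schedule(C.op)

        cfg = autotvm.get_config()
        y, x = s[C].op.axis
        cfg.define_split("tile_y", y, num_outputs=2)  # declare a split knob on y
        cfg.define_split("tile_x", x, num_outputs=2)
        yo, yi = cfg["tile_y"].apply(s, C, y)         # apply the sampled factors
        xo, xi = cfg["tile_x"].apply(s, C, x)
        s[C].reorder(yo, xo, yi, xi)
        return s, [A, B, C]
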
@@ -266,11 +266,11 @@ def apply(self, sch, op, axis): Parameters ---------- - sch: tvm.schedule.Schedule + sch: tvm.te.schedule.Schedule The tvm schedule - op: tvm.tensor.Operation + op: tvm.te.Operation The stage to be applied - axis: tvm.schedule.IterVar + axis: tvm.te.schedule.IterVar axis to split Returns @@ -390,11 +390,11 @@ def apply(self, sch, op, axes): Parameters ---------- - sch: tvm.schedule.Schedule + sch: tvm.te.schedule.Schedule The tvm schedule - op: tvm.tensor.Operation + op: tvm.te.Operation The stage to be applied - axis: tvm.schedule.IterVar + axis: tvm.te.schedule.IterVar axis to split Returns @@ -513,11 +513,11 @@ def apply(self, sch, op, axes, axis_lens=None, Parameters ---------- - sch: tvm.schedule.Schedule + sch: tvm.te.schedule.Schedule The tvm schedule - op: tvm.tensor.Operation + op: tvm.te.Operation The stage to be applied - axes: Array of tvm.schedule.IterVar + axes: Array of tvm.te.schedule.IterVar axis to split axis_lens: Array of int, optional the length of axes @@ -532,7 +532,7 @@ def apply(self, sch, op, axes, axis_lens=None, Returns ------- - axes : list of tvm.schedule.IterVar + axes : list of tvm.te.schedule.IterVar The transformed axes """ if source is not None: # special case : attach cache_read/cache_write @@ -624,7 +624,7 @@ def axis(var): Parameters ---------- - var: int or tvm.schedule.IterVar + var: int or tvm.te.schedule.IterVar If is int, return an axis whose length is the provided argument. If is IterVar, return an axis whose length is extracted from the IterVar's extent domain. @@ -640,7 +640,7 @@ def define_split(self, name, axis, policy='factors', **kwargs): ---------- name: str name to index the entity of this space - axis: tvm.schedule.IterVar + axis: tvm.te.schedule.IterVar axis to split policy: str name of policy. @@ -681,7 +681,7 @@ def define_reorder(self, name, axes, policy, **kwargs): ---------- name: str name to index the entity of this space - axes: Array of tvm.schedule.IterVar + axes: Array of tvm.te.schedule.IterVar axes to reorder policy: str name of policy @@ -702,7 +702,7 @@ def define_annotate(self, name, axes, policy, **kwargs): ---------- name: str name to index the entity of this space - axes: Array of tvm.schedule.IterVar + axes: Array of tvm.te.schedule.IterVar axes to annotate policy: str name of policy diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index ca1ae0eefefd..c75105b413b7 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -21,10 +21,13 @@ func is a state-less function, or a string that registers the standard task. """ - import numpy as np -from ... import tensor, expr, container, placeholder, target as _target +from tvm import target as _target +from tvm.ir import container +from tvm.tir import expr +from tvm.te import tensor, placeholder + from ..util import get_const_int, get_const_tuple from .dispatcher import DispatchContext, ApplyConfig @@ -81,7 +84,7 @@ def deserialize_args(args): def args_to_workload(args, task_name=None): """Convert argument list to hashable workload tuple. 
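
A minimal sketch of the workload convention this helper implements, assuming args_to_workload is imported from its module as laid out above; the exact tuple layout is an internal detail:

    from tvm import te
    from tvm.autotvm.task.task import args_to_workload

    A = te.placeholder((8, 8), name="A", dtype="float32")
    # te.Tensor arguments are flattened into hashable ('TENSOR', shape, dtype)
    # entries so the whole workload can key into tuning records
    wkl = args_to_workload([A, 4, "float32"], task_name="example")
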
This function will convert list to tuple, tvm node to python value and - flatten tvm.tensor.Tensor to a tuple + flatten te.tensor.Tensor to a tuple Parameters ---------- @@ -138,9 +141,9 @@ def instantiate(self, config): Returns ------- - sch: tvm.schedule.Schedule + sch: tvm.te.schedule.Schedule The tvm schedule - arg_bufs: Array of tvm.tensor.Tensor + arg_bufs: Array of te.tensor.Tensor The input/output buffers """ config.flop = 0 @@ -303,12 +306,12 @@ def register_customized_task(name, func=None): @autotvm.register_customized_task("matmul") def matmul(N, L, M, dtype): - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) - k = tvm.reduce_axis((0, L), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) + k = te.reduce_axis((0, L), name='k') + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') + s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis @@ -400,7 +403,7 @@ def compute_flop(sch): Parameters ---------- - sch: tvm.schedule.Schedule + sch: tvm.te.schedule.Schedule schedule Returns @@ -475,8 +478,8 @@ def traverse(ops): elif isinstance(op, tensor.PlaceholderOp): pass else: - raise FlopCalculationError("Only support tvm.compute currently. " - "Other ops like tvm.scan/tvm.extern is not supported") + raise FlopCalculationError("Only support te.compute currently. " + "Other ops like tvm.te.scan/te.extern are not supported") return ret try: diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 45385fbe8f7e..e1c09133eb23 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -21,15 +21,15 @@ These decorators can make your simple implementation be able to use different configurations for different workloads. Here we directly use all arguments to the TOPI call as "workload", so make sure all the arguments -(except tvm.Tensor) in you calls are hashable. For tvm.Tensor, we will serialize it to a hashable -tuple. +(except tvm.te.Tensor) in your calls are hashable. For tvm.te.Tensor, +we will serialize it to a hashable tuple. See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage. """ import tvm.te._ffi_api from tvm import target as _target +from tvm.te import tensor -from ... import tensor from .task import args_to_workload, DispatchContext, \ register_task_compute, register_task_schedule, serialize_args diff --git a/python/tvm/autotvm/util.py b/python/tvm/autotvm/util.py index 54001d3338ad..01d50e86a88a 100644 --- a/python/tvm/autotvm/util.py +++ b/python/tvm/autotvm/util.py @@ -24,7 +24,7 @@ import numpy as np -from .. import expr, ir_pass +from tvm.tir import expr, ir_pass logger = logging.getLogger('autotvm') diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 1f322acdf8b9..521e0885548c 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -18,8 +18,9 @@ """Utilities for binary file manipulation""" import os import subprocess +import tvm._ffi from . 
import util -from ..api import register_func + RELOCATION_LD_SCRIPT_TEMPLATE = """ /* linker symbol for use in UTVMInit */ @@ -95,7 +96,7 @@ def run_cmd(cmd): return output -@register_func("tvm_callback_get_section_size") +@tvm._ffi.register_func("tvm_callback_get_section_size") def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): """Finds size of the section in the binary. Assumes `size` shell command exists (typically works only on Linux machines) @@ -162,7 +163,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): return section_size -@register_func("tvm_callback_relocate_binary") +@tvm._ffi.register_func("tvm_callback_relocate_binary") def tvm_callback_relocate_binary( binary_path, word_size, @@ -233,7 +234,7 @@ def tvm_callback_relocate_binary( return rel_bin -@register_func("tvm_callback_read_binary_section") +@tvm._ffi.register_func("tvm_callback_read_binary_section") def tvm_callback_read_binary_section(binary, section, toolchain_prefix): """Returns the contents of the specified section in the binary byte array @@ -273,7 +274,7 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix): return section_bin -@register_func("tvm_callback_get_symbol_map") +@tvm._ffi.register_func("tvm_callback_get_symbol_map") def tvm_callback_get_symbol_map(binary, toolchain_prefix): """Obtains a map of symbols to addresses in the passed binary diff --git a/python/tvm/contrib/cblas.py b/python/tvm/contrib/cblas.py index 2337f846be51..e1a4a8a7849b 100644 --- a/python/tvm/contrib/cblas.py +++ b/python/tvm/contrib/cblas.py @@ -16,7 +16,7 @@ # under the License. """External function interface to BLAS libraries.""" import tvm -from .. import api as _api +from tvm import te def matmul(lhs, rhs, transa=False, transb=False, **kwargs): @@ -41,7 +41,7 @@ def matmul(lhs, rhs, transa=False, transb=False, **kwargs): """ n = lhs.shape[1] if transa else lhs.shape[0] m = rhs.shape[0] if transb else rhs.shape[1] - return _api.extern( + return te.extern( (n, m), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( @@ -75,7 +75,7 @@ def batch_matmul(lhs, rhs, transa=False, transb=False, iterative=False, **kwargs b = lhs.shape[0] n = lhs.shape[2] if transa else lhs.shape[1] m = rhs.shape[1] if transb else rhs.shape[2] - return _api.extern( + return te.extern( (b, n, m), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( diff --git a/python/tvm/contrib/cublas.py b/python/tvm/contrib/cublas.py index 75290a8f6402..7b42becec2be 100644 --- a/python/tvm/contrib/cublas.py +++ b/python/tvm/contrib/cublas.py @@ -16,7 +16,8 @@ # under the License. """External function interface to cuBLAS libraries.""" import tvm -from .. 
import api as _api +from tvm import te + def matmul(lhs, rhs, transa=False, transb=False, dtype=None): """Create an extern op that compute matrix mult of A and rhs with cuBLAS @@ -40,7 +41,7 @@ def matmul(lhs, rhs, transa=False, transb=False, dtype=None): n = lhs.shape[1] if transa else lhs.shape[0] m = rhs.shape[0] if transb else rhs.shape[1] dtype = dtype if dtype is not None else lhs.dtype - return _api.extern( + return te.extern( (n, m), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cublas.matmul", @@ -69,7 +70,7 @@ def batch_matmul(lhs, rhs, transa=False, transb=False, dtype=None): n = lhs.shape[2] if transa else lhs.shape[1] m = rhs.shape[1] if transb else rhs.shape[2] dtype = dtype if dtype is not None else lhs.dtype - return _api.extern( + return te.extern( (b, n, m), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cublas.batch_matmul", diff --git a/python/tvm/contrib/cublaslt.py b/python/tvm/contrib/cublaslt.py index 1000ede1379d..3b36f4720fec 100644 --- a/python/tvm/contrib/cublaslt.py +++ b/python/tvm/contrib/cublaslt.py @@ -16,7 +16,7 @@ # under the License. """External function interface to cuBLASlt libraries.""" import tvm -from .. import api as _api +from tvm import te def matmul(lhs, rhs, transa=False, transb=False, n=0, m=0, dtype=None): @@ -43,7 +43,7 @@ def matmul(lhs, rhs, transa=False, transb=False, n=0, m=0, dtype=None): if m == 0: m = rhs.shape[0] if transb else rhs.shape[1] dtype = dtype if dtype is not None else lhs.dtype - return _api.extern( + return te.extern( (n, m), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cublaslt.matmul", diff --git a/python/tvm/contrib/cudnn.py b/python/tvm/contrib/cudnn.py index 20b42d79d27e..e62724512d49 100644 --- a/python/tvm/contrib/cudnn.py +++ b/python/tvm/contrib/cudnn.py @@ -19,8 +19,9 @@ import ctypes import numpy as np import tvm -from .. import api as _api -from .. 
import get_global_func as _get_global_func + +import tvm._ffi +from tvm import te # algos can be read from cudnn.h _FWD_ALGOS = [ @@ -217,7 +218,7 @@ def conv_output_shape(tensor_format, _prepare_global_func_params(dims - 2, pad, stride, dilation, x_shape, w_shape) oshape = np.zeros((dims), dtype=np.int32) - func = _get_global_func("tvm.contrib.cudnn.conv.output_shape") + func = tvm._ffi.get_global_func("tvm.contrib.cudnn.conv.output_shape") func(tensor_format, dims - 2, _get_np_int32_array_handle(pad), @@ -276,7 +277,7 @@ def conv_find_algo(tensor_format, pad, stride, dilation, xshape, wshape = \ _prepare_global_func_params(dims - 2, pad, stride, dilation, x_shape, w_shape) yshape = np.array(y_shape, dtype=np.int32) - func = _get_global_func("tvm.contrib.cudnn.conv.find_algo") + func = tvm._ffi.get_global_func("tvm.contrib.cudnn.conv.find_algo") return func(tensor_format, dims - 2, _get_np_int32_array_handle(pad), @@ -363,7 +364,7 @@ def conv_forward(x, conv_dtype) if dims == 4: - return _api.extern( + return te.extern( oshape, [x, w], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cudnn.conv2d.forward", @@ -381,7 +382,7 @@ def conv_forward(x, outs[0], conv_dtype), name="y") - return _api.extern( + return te.extern( oshape, [x, w], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cudnn.conv3d.forward", diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 26c16e3135e8..18920c60719e 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -21,6 +21,7 @@ import numpy as np import tvm + GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json' CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json" diff --git a/python/tvm/contrib/miopen.py b/python/tvm/contrib/miopen.py index 7f024f70b21a..04e35de92080 100644 --- a/python/tvm/contrib/miopen.py +++ b/python/tvm/contrib/miopen.py @@ -19,8 +19,9 @@ import ctypes import numpy as np import tvm -from .. import api as _api -from .. import get_global_func as _get_global_func +import tvm._ffi + +from tvm import te def _get_np_int32_array_handle(arr): @@ -91,7 +92,7 @@ def conv2d_forward(x, oshape = np.zeros((len(x.shape)), dtype=np.int32) xshape = x.shape wshape = w.shape - setup_func = _get_global_func("tvm.contrib.miopen.conv2d.setup") + setup_func = tvm._ffi.get_global_func("tvm.contrib.miopen.conv2d.setup") algo = setup_func(conv_mode, data_type, pad_h, @@ -111,7 +112,7 @@ def conv2d_forward(x, group_count, _get_np_int32_array_handle(oshape)) - return _api.extern( + return te.extern( list(oshape), [x, w], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.miopen.conv2d.forward", diff --git a/python/tvm/contrib/mps.py b/python/tvm/contrib/mps.py index 5d84e892ec74..8f310b0915b6 100644 --- a/python/tvm/contrib/mps.py +++ b/python/tvm/contrib/mps.py @@ -16,7 +16,8 @@ # under the License. """External function interface to MPS libraries.""" import tvm -from .. 
import api as _api +from tvm import te + # pylint: disable=C0103,W0612 @@ -47,7 +48,7 @@ def matmul(lhs, rhs, transa=False, transb=False): m = b if transb: n = c - return _api.extern( + return te.extern( (m, n), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.mps.matmul", ins[0], ins[1], outs[0], transa, transb), @@ -79,7 +80,7 @@ def conv2d(data, weight, pad='SAME', stride=1): ho = hi // stride wo = wi // stride - return _api.extern( + return te.extern( (n, ho, wo, co), [data, weight], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.mps.conv2d", ins[0], ins[1], outs[0], padding, stride), diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py index a55a344b6410..1ce1dcc40f40 100644 --- a/python/tvm/contrib/nnpack.py +++ b/python/tvm/contrib/nnpack.py @@ -16,8 +16,8 @@ # under the License. """External function interface to NNPACK libraries.""" import tvm +from tvm import te import tvm._ffi -from .. import api as _api def is_available(): @@ -43,7 +43,7 @@ def fully_connected_inference(lhs, rhs, nthreads=1): lhs 1D array out[output_channels] of FP32 elements. """ m = rhs.shape[0] - return _api.extern( + return te.extern( (m, ), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.nnpack.fully_connected_inference", @@ -100,13 +100,13 @@ def convolution_inference( assert isinstance(stride, list) and len(stride) == 2 batch, _, input_height, input_width = data.shape output_channels, _, kernel_height, kernel_width = kernel.shape - idxdiv = _api.indexdiv + idxdiv = te.indexdiv output_height = idxdiv( input_height + padding[0] + padding[1] - kernel_height, stride[0]) + 1 output_width = idxdiv( input_width + padding[0] + padding[1] - kernel_width, stride[1]) + 1 - return _api.extern( + return te.extern( (batch, output_channels, output_height, output_width), [data, kernel, bias] if bias is not None else [data, kernel], lambda ins, outs: tvm.tir.call_packed( @@ -155,11 +155,11 @@ def convolution_inference_without_weight_transform( batch, _, input_height, input_width = data.shape output_channels, _, _, _ = transformed_kernel.shape kernel_height, kernel_width = (3, 3) - idxdiv = _api.indexdiv + idxdiv = te.indexdiv output_height = idxdiv(input_height + padding[0] + padding[1] - kernel_height, stride[0]) + 1 output_width = idxdiv(input_width + padding[0] + padding[1] - kernel_width, stride[1]) + 1 - return _api.extern( + return te.extern( (batch, output_channels, output_height, output_width), [data, transformed_kernel, bias] if bias is not None else [data, transformed_kernel], lambda ins, outs: tvm.tir.call_packed( @@ -194,7 +194,7 @@ def convolution_inference_weight_transform( transform_tile_size = 8 if not isinstance(dtype, str): dtype = dtype.dtype - return _api.extern( + return te.extern( (output_channels, input_channels, transform_tile_size, transform_tile_size), [kernel], lambda ins, outs: tvm.tir.call_packed( diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index 8712f73c2343..fc8232053b5f 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -21,10 +21,11 @@ import subprocess import os import warnings + +import tvm._ffi from tvm.runtime import ndarray as nd from . 
import util -from ..api import register_func from .._ffi.base import py_str def compile_cuda(code, @@ -152,7 +153,7 @@ def get_cuda_version(cuda_path): raise RuntimeError("Cannot read cuda version file") -@register_func("tvm_callback_libdevice_path") +@tvm._ffi.register_func("tvm_callback_libdevice_path") def find_libdevice_path(arch): """Utility function to find libdevice diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py index bc93afbf165e..2906410efc40 100644 --- a/python/tvm/contrib/peak.py +++ b/python/tvm/contrib/peak.py @@ -19,6 +19,7 @@ import logging import tvm +from tvm import te from . import util from .. import rpc @@ -79,17 +80,17 @@ def measure_bandwidth_sum(total_item, item_per_thread, stride, base_type = str(base_type) + str(bits) dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) - k = tvm.reduce_axis((0, m), name="k") + k = te.reduce_axis((0, m), name="k") - x = tvm.placeholder((n,), dtype=dtype, name="x") - op = tvm.comm_reducer(lambda x, y: x*y, lambda t: tvm.const(1, dtype=t), name="sum") - y = tvm.compute((n // m,), - lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)) - s = tvm.create_schedule(y.op) + x = te.placeholder((n,), dtype=dtype, name="x") + op = te.comm_reducer(lambda x, y: x*y, lambda t: tvm.tir.const(1, dtype=t), name="sum") + y = te.compute((n // m,), + lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k)) + s = te.create_schedule(y.op) yo, yi = s[y].split(y.op.axis[0], target.max_num_threads) - s[y].bind(yo, tvm.thread_axis("blockIdx.x")) - s[y].bind(yi, tvm.thread_axis("threadIdx.x")) + s[y].bind(yo, te.thread_axis("blockIdx.x")) + s[y].bind(yi, te.thread_axis("threadIdx.x")) s[y].unroll(k) try: @@ -207,10 +208,10 @@ def measure_compute_mad(total_item, item_per_thread, base_type, bits, lanes, def extern(ins, outs): # pylint: disable=unused-argument """construct measurement function by building IR directly""" - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() - bx = tvm.thread_axis("blockIdx.x") - tx = tvm.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") ib.scope_attr(bx, "thread_extent", n // max_threads) ib.scope_attr(tx, "thread_extent", max_threads) @@ -235,8 +236,8 @@ def extern(ins, outs): ib.emit(outs[0].vstore(idx, b[0])) return ib.get() - y = tvm.extern((n,), [], extern, name="y", dtype=dtype) - s = tvm.create_schedule(y.op) + y = te.extern((n,), [], extern, name="y", dtype=dtype) + s = te.create_schedule(y.op) try: func = tvm.build(s, [y], target, target_host=target_host) diff --git a/python/tvm/contrib/random.py b/python/tvm/contrib/random.py index bcc9b1703386..727b68bbbd19 100644 --- a/python/tvm/contrib/random.py +++ b/python/tvm/contrib/random.py @@ -16,8 +16,8 @@ # under the License. """External function interface to random library.""" import tvm +from tvm import te import tvm._ffi -from .. import api as _api def randint(low, high, size, dtype='int32'): @@ -38,7 +38,7 @@ def randint(low, high, size, dtype='int32'): A tensor with specified size and dtype """ assert 'int' in dtype, "the type of randint output must be int or uint" - return _api.extern(size, [], lambda ins, outs: tvm.tir.call_packed( + return te.extern(size, [], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.random.randint", int(low), int(high), outs[0]), dtype=dtype) @@ -66,7 +66,7 @@ def uniform(low, high, size): out : Tensor A tensor with specified size and dtype. 
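
All of these contrib wrappers follow one pattern: te.extern declares the output buffer and a packed-function call fills it at run time. A minimal sketch, assuming the tvm.contrib.random packed functions are compiled into the build:

    import tvm
    from tvm import te

    # declare a 1024-element output produced by the registered packed function
    out = te.extern((1024,), [],
                    lambda ins, outs: tvm.tir.call_packed(
                        "tvm.contrib.random.uniform", 0.0, 1.0, outs[0]),
                    dtype="float32", name="uniform")
    s = te.create_schedule(out.op)
    f = tvm.build(s, [out], "llvm")
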
""" - return _api.extern(size, [], lambda ins, outs: tvm.tir.call_packed( + return te.extern(size, [], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.random.uniform", float(low), float(high), outs[0]), dtype='float32') @@ -90,7 +90,7 @@ def normal(loc, scale, size): out : Tensor A tensor with specified size and dtype """ - return _api.extern(size, [], lambda ins, outs: tvm.tir.call_packed( + return te.extern(size, [], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.random.normal", float(loc), float(scale), outs[0]), dtype='float32') diff --git a/python/tvm/contrib/rocblas.py b/python/tvm/contrib/rocblas.py index e11be5a1d973..86ffaea4e040 100644 --- a/python/tvm/contrib/rocblas.py +++ b/python/tvm/contrib/rocblas.py @@ -16,7 +16,8 @@ # under the License. """External function interface to rocBLAS libraries.""" import tvm -from .. import api as _api +from tvm import te + def matmul(lhs, rhs, transa=False, transb=False): """Create an extern op that compute matrix mult of A and rhs with rocBLAS @@ -39,7 +40,7 @@ def matmul(lhs, rhs, transa=False, transb=False): """ n = lhs.shape[1] if transa else lhs.shape[0] m = rhs.shape[0] if transb else rhs.shape[1] - return _api.extern( + return te.extern( (n, m), [lhs, rhs], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.rocblas.matmul", diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py index e5cebdd3f5dc..7d4b4a2ebefd 100644 --- a/python/tvm/contrib/rocm.py +++ b/python/tvm/contrib/rocm.py @@ -18,11 +18,13 @@ import subprocess from os.path import join, exists +import tvm._ffi from tvm._ffi.base import py_str +import tvm.runtime import tvm.target from . import util -from ..api import register_func, convert + def find_lld(required=True): """Find ld.lld in system. @@ -85,7 +87,7 @@ def rocm_link(in_file, out_file, lld=None): raise RuntimeError(msg) -@register_func("tvm_callback_rocm_link") +@tvm._ffi.register_func("tvm_callback_rocm_link") def callback_rocm_link(obj_bin): """Links object file generated from LLVM to HSA Code Object @@ -108,7 +110,7 @@ def callback_rocm_link(obj_bin): cobj_bin = bytearray(open(tmp_cobj, "rb").read()) return cobj_bin -@register_func("tvm_callback_rocm_bitcode_path") +@tvm._ffi.register_func("tvm_callback_rocm_bitcode_path") def callback_rocm_bitcode_path(rocdl_dir="/opt/rocm/lib/"): """Utility function to find ROCm device library bitcodes @@ -138,4 +140,4 @@ def callback_rocm_bitcode_path(rocdl_dir="/opt/rocm/lib/"): "oclc_wavefrontsize64_on.amdgcn.bc" ] paths = [join(rocdl_dir, bitcode) for bitcode in bitcode_files] - return convert([path for path in paths if exists(path)]) + return tvm.runtime.convert([path for path in paths if exists(path)]) diff --git a/python/tvm/contrib/sdaccel.py b/python/tvm/contrib/sdaccel.py index 1234d546ebae..3f9bf43a85d3 100644 --- a/python/tvm/contrib/sdaccel.py +++ b/python/tvm/contrib/sdaccel.py @@ -17,11 +17,12 @@ """Utility for Interacting with SDAccel Tools""" import subprocess import os + +import tvm._ffi from . import util -from ..api import register_func -@register_func("tvm_callback_sdaccel_compile") +@tvm._ffi.register_func("tvm_callback_sdaccel_compile") def compile_vhls(kernel_info, device_name): """Compile Vivado HLS code for SDAccel. diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py index 966e180ec2b8..77f84b1eb4ed 100644 --- a/python/tvm/contrib/sparse.py +++ b/python/tvm/contrib/sparse.py @@ -18,10 +18,9 @@ # pylint: disable=invalid-name import numpy as _np from tvm.runtime import ndarray as _nd - -from .. 
import expr as _expr -from .. import api as _api -from .. import tensor as _tensor +from tvm import te +from tvm.tir import expr as _expr +from tvm.te import tensor as _tensor float32 = "float32" @@ -136,9 +135,9 @@ def __init__(self, shape, nonzeros, dtype, name): """ SparsePlaceholderOp.__init__(self, shape, nonzeros, dtype, name) self.stype = 'csr' - self.data = _api.placeholder((nonzeros,), dtype=dtype, name=self.name+'_data') - self.indices = _api.placeholder((nonzeros,), dtype=itype, name=self.name+'_indices') - self.indptr = _api.placeholder((self.shape[0]+1,), dtype=itype, name=self.name+'_indptr') + self.data = te.placeholder((nonzeros,), dtype=dtype, name=self.name+'_data') + self.indices = te.placeholder((nonzeros,), dtype=itype, name=self.name+'_indices') + self.indptr = te.placeholder((self.shape[0]+1,), dtype=itype, name=self.name+'_indptr') assert isinstance(self.data, _tensor.Tensor) assert isinstance(self.indices, _tensor.Tensor) assert isinstance(self.indptr, _tensor.Tensor) diff --git a/python/tvm/contrib/tedd.py b/python/tvm/contrib/tedd.py index f15b7d489eee..68e15f2b1ddd 100644 --- a/python/tvm/contrib/tedd.py +++ b/python/tvm/contrib/tedd.py @@ -282,7 +282,7 @@ def get_leaf_itervar_index(itervar, leaf_iv): def encode_itervar_relation(obj_manager, rel): """Extract and encode IterVar Relationship visualization data to a dictionary""" rel_type = type(rel) - if rel_type is tvm.schedule.Split: + if rel_type is tvm.te.schedule.Split: node_type = 'Split_Relation' rel_dict = { "type": node_type, @@ -290,7 +290,7 @@ def encode_itervar_relation(obj_manager, rel): "outer": obj_manager.get_dom_path(rel.outer), "inner": obj_manager.get_dom_path(rel.inner), } - elif rel_type is tvm.schedule.Fuse: + elif rel_type is tvm.te.schedule.Fuse: node_type = 'Fuse_Relation' rel_dict = { "type": node_type, @@ -298,7 +298,7 @@ def encode_itervar_relation(obj_manager, rel): "outer": obj_manager.get_dom_path(rel.outer), "inner": obj_manager.get_dom_path(rel.inner), } - elif rel_type is tvm.schedule.Singleton: + elif rel_type is tvm.te.schedule.Singleton: node_type = 'Singleton_Relation' rel_dict = { "type": node_type, @@ -377,12 +377,12 @@ def encode_schedule(sch, need_range): dict : dictionary A nested dictionary """ - assert isinstance(sch, tvm.schedule.Schedule - ), 'Input is not a tvm.schedule.Schedule object.' + assert isinstance(sch, tvm.te.schedule.Schedule + ), 'Input is not a tvm.te.schedule.Schedule object.' 
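
For context, the bound-inference call made just below now lives under tvm.te.schedule. A minimal sketch of invoking it directly on a trivial schedule:

    import tvm
    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    # InferBound maps every IterVar of the normalized schedule to a Range
    bounds = tvm.te.schedule.InferBound(s.normalize())
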
range_map = None if need_range: try: - range_map = tvm.schedule.InferBound(sch) + range_map = tvm.te.schedule.InferBound(sch) except tvm._ffi.base.TVMError as expt: warnings.warn( 'Ranges are not available, because InferBound fails with the following error:\n' diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index f529ee26b58f..67eb22414abd 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -89,7 +89,7 @@ def form_body(sch): """According to the given schedule, form the raw body Parameters ---------- - sch : tvm.schedule.Schedule + sch : tvm.te.schedule.Schedule The given scheduler to form the raw body Returns @@ -113,7 +113,7 @@ def lower(sch, Parameters ---------- - sch : tvm.schedule.Schedule + sch : tvm.te.schedule.Schedule The schedule to be built args : list of Buffer or Tensor or Var @@ -286,7 +286,7 @@ def build(inputs, Parameters ---------- - inputs : tvm.Schedule, LoweredFunc, or dict of target to LoweredFunc list + inputs : tvm.te.Schedule, LoweredFunc, or dict of target to LoweredFunc list The schedule to be built args : list of Buffer or Tensor or Var, optional @@ -325,10 +325,10 @@ def build(inputs, .. code-block:: python n = 2 - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = tvm.te.create_schedule(C.op) f = tvm.lower(s, [A, B, C], name="test_add") m = tvm.build(f, target="llvm") @@ -337,10 +337,10 @@ def build(inputs, .. code-block:: python n = 2 - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s1 = tvm.create_schedule(C.op) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s1 = tvm.te.create_schedule(C.op) with tvm.target.cuda() as cuda_tgt: s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) f1 = tvm.lower(s1, [A, B, C], name="test_add1") diff --git a/python/tvm/hybrid/calls.py b/python/tvm/hybrid/calls.py index 0933628a9943..5b5c34d5cb0f 100644 --- a/python/tvm/hybrid/calls.py +++ b/python/tvm/hybrid/calls.py @@ -16,6 +16,9 @@ # under the License. """Intrinsics of TVM-Python Hybrid Script for Python compilation time semantic support.""" + +from tvm.runtime import const, convert +import tvm.te from tvm.ir.container import Array from tvm import target as _tgt from tvm.tir import expr as _expr @@ -23,8 +26,6 @@ from tvm.tir import call_pure_intrin from tvm.tir.stmt import For -from .. 
import api as _api - from .util import _internal_assert # pylint: disable=redefined-builtin @@ -42,11 +43,11 @@ def _range(annotation, args): """Handling TVM loop types""" n = args.__len__() if n == 1: - low, ext = _api.const(0, dtype='int32'), args[0] + low, ext = const(0, dtype='int32'), args[0] else: _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!") low, ext = args[0], args[1] - if not ir_pass.Equal(low, _api.const(0, dtype='int32')): + if not ir_pass.Equal(low, const(0, dtype='int32')): ext = ext - low for_type = LOOP_INTRIN[annotation] iter_var = None @@ -62,16 +63,16 @@ def bind(func_id, args): _internal_assert(args.__len__() == 2, "A loop bind should only have 2 arguments!") _internal_assert(isinstance(args[0], str), \ "A loop bind's first argument should be a string!") - low, ext = _api.const(0, "int32"), args[1] - iter_var = _api.thread_axis((low, ext), args[0]) + low, ext = const(0, "int32"), args[1] + iter_var = tvm.te.thread_axis((low, ext), args[0]) for_type = None return iter_var, low, ext, for_type def _math_intrin(func_id, args): # pylint: disable=import-outside-toplevel - import tvm.tir.op - return getattr(tvm.tir.op, func_id)(*args) + from tvm.tir import op + return getattr(op, func_id)(*args) sqrt = log = exp = tanh = sigmoid = power = popcount = _math_intrin #pylint: disable=invalid-name @@ -88,7 +89,7 @@ def _allocate_tensor(func_id, args): """Handling TVM tensor allocation. You may refer hybrid.intrin.allocate for more details.""" n = args.__len__() - _internal_assert(isinstance(_api.convert(args[0]), Array), \ + _internal_assert(isinstance(convert(args[0]), Array), \ "allocate's first argument should be a tuple of shape!") shape = args[0] for i in shape: @@ -119,10 +120,10 @@ def len(func_id, args): _internal_assert(args.__len__() == 1, "Only 1 argument is expected!") _internal_assert(func_id == "len", "This function cannot be directly invoked!") try: - return _api.convert(args[0].__len__()) + return convert(args[0].__len__()) except: #pylint: disable=bare-except _internal_assert(args[0].shape.__len__() == 1, "Only one-dimension array can get len") - return _api.convert(args[0].shape[0]) + return convert(args[0].shape[0]) def _cast(func_id, args): @@ -159,4 +160,4 @@ def max_num_threads(func_id, args): else: _internal_assert(isinstance(args[0], _expr.IntImm), "In tvm bool should be uint") res = _tgt.Target.current(args[0].value).max_num_threads - return _api.convert(res) + return convert(res) diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index cf8584a1e999..0f8f3dd2ad01 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -25,7 +25,9 @@ from enum import Enum from tvm.ir import Array, Range +import tvm.runtime import tvm.tir +import tvm.te import tvm.te._ffi_api from tvm.tir import expr as _expr @@ -40,8 +42,6 @@ from . import util from .preprocessor import determine_variable_usage -from .. import api as _api - def concat_list_to_block(lst): """Concatenate a list of Python IR nodes to HalideIR Block""" @@ -125,7 +125,7 @@ def __init__(self, args, usage, symbols, closure_vars, func_name=None): """ Parameters ---------- - args: A list of tvm.placeholder or tvm.var + args: A list of tvm.te.placeholder or te.var Provided by the user, the argument list of the function to be lowered. 
usage: A dict of variables used in last in this function @@ -210,9 +210,9 @@ def wrap_up_realize(self, node, body): _domain = [Range.make_by_min_extent(0, i) for i in _buf.shape] _dtype = _buf.dtype - _true = _api.convert(True) + _true = tvm.runtime.convert(True) body = tvm.tir.Realize(_buf.op, 0, _dtype, _domain, _true, body) - body = tvm.tir.AttrStmt(_buf.op, 'realize_scope', _api.convert(_scope), body) + body = tvm.tir.AttrStmt(_buf.op, 'realize_scope', tvm.runtime.convert(_scope), body) for elem in to_pop: self.symbols.pop(elem) @@ -256,10 +256,10 @@ def visit_Expr(self, node): def visit_Name(self, node): name = node.id if sys.version_info[0] == 2 and name in ['True', 'False']: - return _api.convert(ast.literal_eval(name)) + return tvm.runtime.convert(ast.literal_eval(name)) if name in self.closure_vars: - return _api.convert(self.closure_vars[name]) + return tvm.runtime.convert(self.closure_vars[name]) ty, entry = self.symbols[name] _internal_assert(name in self.symbols, "Unknown symbol %s!" % name) @@ -271,9 +271,9 @@ def visit_Name(self, node): return entry if isinstance(node.ctx, ast.Load) else None if ty is Symbol.BufferVar: if isinstance(node.ctx, ast.Load): - return tvm.tir.Call(entry.dtype, entry.name, [_api.const(0, 'int32')], \ + return tvm.tir.Call(entry.dtype, entry.name, [tvm.runtime.const(0, 'int32')], \ _expr.Call.Halide, entry.op, entry.value_index) - return entry, [_api.const(0, 'int32')] + return entry, [tvm.runtime.const(0, 'int32')] # Do I need any assertion here? return entry @@ -287,11 +287,11 @@ def visit_Num(self, node): _internal_assert(isinstance(node.n, bool), "The data type should be one of (int, float, bool)") dtype = "bool" - return _api.const(node.n, dtype) + return tvm.runtime.const(node.n, dtype) def visit_NameConstant(self, node): - return _api.convert(node.value) + return tvm.runtime.convert(node.value) def visit_AugAssign(self, node): @@ -301,7 +301,7 @@ def visit_AugAssign(self, node): _internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!") buf, args = buf else: - args = [_api.const(0, 'int32')] + args = [tvm.runtime.const(0, 'int32')] _internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!") read = tvm.tir.Call(buf.dtype, buf.name, args, _expr.Call.Halide, buf.op, buf.value_index) @@ -341,7 +341,7 @@ def visit_Assign(self, node): "This value should not be defined before this point!") if isinstance(rhs, tuple): shape, dtype, scope = rhs - ph = _api.placeholder(shape, dtype=dtype, name=lhs) + ph = tvm.te.placeholder(shape, dtype=dtype, name=lhs) self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph) if scope == 'output': self.outputs.append(lhs) @@ -353,7 +353,7 @@ def visit_Assign(self, node): "Single variable not supported in devices' side!\n" + \ "If you are using GPU, please allocate a 'local' spad " + \ "outside the bind body") - ph = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs) + ph = tvm.te.placeholder((1, ), dtype=rhs.dtype, name=lhs) self.add_symbol(lhs, Symbol.BufferVar, ph) lhs = self.visit(lhs_) if lhs is not None: @@ -524,8 +524,8 @@ def visit_For(self, node): if iter_var is None: _internal_assert(for_type is not None, "The loop iterating function parse error!") - offset = iter_var = _api.var(_name) - if not _ir_pass.Equal(low, _api.const(0, 'int32')): + offset = iter_var = tvm.te.var(_name) + if not _ir_pass.Equal(low, tvm.runtime.const(0, 'int32')): offset = iter_var + low self.add_symbol(_name, Symbol.LoopVar, offset) _body = visit_list_to_block(self.visit, node.body) @@ -543,7 +543,7 
@@ def visit_For(self, node): else: _internal_assert(not isinstance(for_type, tuple), \ "Micro expansion should be handled before!") - res = tvm.tir.For(iter_var, _api.const(0, 'int32'), ext, for_type, 0, _body) + res = tvm.tir.For(iter_var, tvm.runtime.const(0, 'int32'), ext, for_type, 0, _body) self.symbols.pop(_name) return res @@ -579,7 +579,7 @@ def visit_Str(self, node): def visit_Assert(self, node): test = self.visit(node.test) - mesg = _api.convert(self.visit(node.msg)) + mesg = tvm.runtime.convert(self.visit(node.msg)) return tvm.tir.AssertStmt(test, mesg, util.make_nop()) diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py index 2b6795652878..6c019893bf20 100644 --- a/python/tvm/hybrid/util.py +++ b/python/tvm/hybrid/util.py @@ -22,6 +22,7 @@ import sys import numpy +import tvm.runtime from tvm._ffi.base import numeric_types from tvm.ir.container import Array @@ -29,8 +30,6 @@ from tvm.tir import stmt as _stmt from tvm.te.tensor import Tensor -from .. import api as _api - #pylint: disable=invalid-name np_arg_types = tuple(list(numeric_types) + [numpy.ndarray]) @@ -47,7 +46,7 @@ def _internal_assert(cond, err): # Useful constants. In avoid of runtime dependences, we use function calls to return them. def make_nop(): """Returns a 'no operation' node in HalideIR.""" - return _stmt.Evaluate(_api.const(0, dtype='int32')) + return _stmt.Evaluate(tvm.runtime.const(0, dtype='int32')) def is_docstring(node): @@ -73,7 +72,7 @@ def _pruned_source(func): def replace_io(body, rmap): """Replacing tensors usage according to the dict given""" # pylint: disable=import-outside-toplevel - from .. import ir_pass + from tvm.tir import ir_pass def replace(op): if isinstance(op, _stmt.Provide) and op.func in rmap.keys(): diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py deleted file mode 100644 index 93e8fcb3f140..000000000000 --- a/python/tvm/intrin.py +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint:disable=unused-wildcard-import, wildcard-import, redefined-builtin -"""Backwared compatible layer for intrin.""" -from .tir.op import * diff --git a/python/tvm/make.py b/python/tvm/make.py deleted file mode 100644 index 089c3938723b..000000000000 --- a/python/tvm/make.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
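
The hybrid-frontend hunks above only rewire imports; the user-facing entry point is unchanged. A minimal sketch of a kernel this parser lowers (output_tensor is a hybrid intrinsic resolved during parsing, not a Python name):

    from tvm import te
    from tvm.hybrid import script

    @script
    def outer_product(a, b):
        c = output_tensor((a.shape[0], b.shape[0]), a.dtype)
        for i in range(a.shape[0]):
            for j in range(b.shape[0]):
                c[i, j] = a[i] * b[j]
        return c

    A = te.placeholder((4,), name="a")
    B = te.placeholder((4,), name="b")
    C = outer_product(A, B)        # parsed and lowered by the machinery above
    s = te.create_schedule(C.op)
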
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-import -"""namespace of IR node builder make function - -This namespace is used for developers. While you do not see any declarations. -The functions are automatically exported from C++ side via PackedFunc. - -Each api is a PackedFunc that can be called in a positional argument manner. -You can use make function to build the IR node. -""" -import tvm._ffi -import tvm.ir -from tvm.ir import make_node as node -from tvm.tir import Call - - -def make_by_min_extent(min_value, extent): - """Construct a Range by min and extent. - - This constructs a range in [min_value, min_value + extent) - - Parameters - ---------- - min_value : PrimExpr - The minimum value of the range. - - extent : PrimExpr - The extent of the range. - - Returns - ------- - rng : Range - The constructed range. - """ - return tvm.ir.Range.make_by_min_extent(min_value, extent) - -tvm._ffi._init_api("tvm.make") diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index 2ad210e7d109..f4a7c75864d5 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -18,7 +18,7 @@ """The Relay IR namespace containing the IR definition and compiler.""" import os from sys import setrecursionlimit -from ..api import register_func + from . import call_graph from . import base from . import ty diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py index 9169ef49210d..df0347bd2bae 100644 --- a/python/tvm/relay/backend/_backend.py +++ b/python/tvm/relay/backend/_backend.py @@ -26,10 +26,10 @@ def lower(sch, inputs, func_name, source_func): Parameters ---------- - sch : tvm.Schedule + sch : tvm.te.Schedule The schedule. - inputs : List[tvm.Tensor] + inputs : List[tvm.te.Tensor] The inputs to the function. func_name : str diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 6466dff6c5df..a51e4f7bad11 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -21,6 +21,7 @@ import logging import numpy as np import tvm +from tvm import te from ..base import register_relay_node, Object from ... import target as _target from ... import autotvm @@ -79,12 +80,12 @@ def get_shape(shape): """Convert the shape to correct dtype and vars.""" ret = [] for dim in shape: - if isinstance(dim, tvm.expr.IntImm): + if isinstance(dim, tvm.tir.IntImm): val = int(dim) assert val <= np.iinfo(np.int32).max - ret.append(tvm.expr.IntImm("int32", val)) - elif isinstance(dim, tvm.expr.Any): - ret.append(tvm.var("any_dim", "int32")) + ret.append(tvm.tir.IntImm("int32", val)) + elif isinstance(dim, tvm.tir.Any): + ret.append(te.var("any_dim", "int32")) else: ret.append(dim) return ret @@ -103,7 +104,7 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): attrs : object The op attribute. - inputs : List[tvm.Tensor] + inputs : List[tvm.te.Tensor] Input tensors to the op. 
out_type : relay.Type @@ -129,7 +130,7 @@ def get_valid_implementations(op, attrs, inputs, out_type, target): flag = True for clause in spec.condition.clauses: clause = analyzer.canonical_simplify(clause) - if isinstance(clause, tvm.expr.IntImm) and clause.value: + if isinstance(clause, tvm.tir.IntImm) and clause.value: continue flag = False break @@ -162,7 +163,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) attrs : object The op attribute. - inputs : List[tvm.Tensor] + inputs : List[tvm.te.Tensor] Input tensors to the op. out_type : relay.Type @@ -176,7 +177,7 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True) Returns ------- - ret : tuple(relay.op.OpImplementation, List[tvm.Tensor]) + ret : tuple(relay.op.OpImplementation, List[tvm.te.Tensor]) The best op implementation and the corresponding output tensors. """ all_impls = get_valid_implementations(op, attrs, inputs, out_type, target) diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py index f58a9b0d5ccd..762210dbe428 100644 --- a/python/tvm/relay/backend/graph_runtime_codegen.py +++ b/python/tvm/relay/backend/graph_runtime_codegen.py @@ -36,7 +36,7 @@ from tvm.runtime.ndarray import empty from tvm.relay import _build_module from tvm import target as _target -from tvm import expr as _expr +from tvm.tir import expr as _expr class GraphRuntimeCodegen(object): """The compiler from Relay to the TVM runtime system.""" diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 6d9c850cb7ff..22e0b916e69a 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -23,7 +23,7 @@ from tvm.ir import IRModule -from tvm import expr as tvm_expr +from tvm.tir import expr as tvm_expr from .. import nd as _nd, target as _target, autotvm from ..contrib import graph_runtime as _graph_rt from . import _build_module diff --git a/python/tvm/relay/debug.py b/python/tvm/relay/debug.py index a2f3533a3564..838eab57c0f2 100644 --- a/python/tvm/relay/debug.py +++ b/python/tvm/relay/debug.py @@ -16,22 +16,20 @@ # under the License. 
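
As the compile_engine hunk above shows, static dimensions are normalized to tvm.tir.IntImm while relay Any dimensions become fresh symbolic vars. A minimal sketch of the two cases:

    import tvm
    from tvm import te

    dim = tvm.tir.IntImm("int32", 32)      # static extent as a 32-bit IR constant
    any_dim = te.var("any_dim", "int32")   # stand-in for a relay Any dimension
    data = te.placeholder((dim, any_dim), name="data")
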
# pylint: disable=wildcard-import, redefined-builtin, invalid-name """The Relay IR namespace containing the IR definition and compiler.""" -from __future__ import absolute_import -from ..api import register_func - +import tvm._ffi # pylint: disable=unused-argument, import-outside-toplevel def _debugger_init(expr, stack): import pdb pdb.set_trace() -@register_func("relay.debug") +@tvm._ffi.register_func("relay.debug") def _debug(*args): import pdb pdb.set_trace() # pylint: disable=unused-argument -@register_func("relay.debug_interp") +@tvm._ffi.register_func("relay.debug_interp") def _debug_interp(*args): _, _, _, ist = args print("Relay Debugger") diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index 99a3930a4ea1..0e5b64cbbacc 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name, import-self, unused-argument, unused-variable # pylint: disable=inconsistent-return-statements, import-outside-toplevel """CoreML frontend.""" -from __future__ import absolute_import as _abs import math import numpy as np import tvm diff --git a/python/tvm/relay/frontend/darknet.py b/python/tvm/relay/frontend/darknet.py index 7623df293cb9..0dae645cd9a4 100644 --- a/python/tvm/relay/frontend/darknet.py +++ b/python/tvm/relay/frontend/darknet.py @@ -19,7 +19,6 @@ DarkNet symbol frontend for Relay. """ -from __future__ import absolute_import as _abs from enum import Enum import numpy as np import tvm diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index d74277bbe402..2787cd6d4647 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -16,8 +16,6 @@ # under the License. # pylint: disable=invalid-name, import-self, len-as-condition, no-else-return, too-many-lines """MXNet symbol frontend.""" -from __future__ import absolute_import as _abs - import json import numpy as np import tvm diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index af8715abaed3..0b766a17aa1b 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -406,7 +406,7 @@ def _impl(inputs, input_types): val = inputs[0] dtype = type(val) - if isinstance(val, tvm.expr.IntImm): + if isinstance(val, tvm.tir.IntImm): val = val.__int__() dtype = int diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 5532e3a5c1a4..6f27d73315a1 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -18,9 +18,6 @@ # pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition, broad-except # pylint: disable=import-outside-toplevel """TF: Tensorflow frontend.""" -from __future__ import absolute_import as _abs -from __future__ import print_function - import warnings from collections import defaultdict @@ -1012,7 +1009,7 @@ def _impl(inputs, attr, params): 'Attribute batch_dims is not supported') new_input = inputs[0:2] return AttrCvt(op_name="take", - extras={'axis': tvm.const(axis, 'int32')}, + extras={'axis': tvm.tir.const(axis, 'int32')}, ignores=['Tindices', 'Tparams', 'validate_indices', 'Taxis', '_class', 'batch_dims'])(new_input, attr) return _impl diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index e132d4ca3585..3a17083d60ac 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -15,7 +15,6 @@ # 
specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, unused-argument, too-many-lines, import-outside-toplevel - """Tensorflow lite frontend.""" import math import numpy as np diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index 9d52ed3af777..ab8b7c2ac1a8 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -17,9 +17,9 @@ """Backend compiler related feature registration""" from __future__ import absolute_import +from tvm.runtime import convert from topi.util import get_const_int, get_const_tuple from . import op as _reg -from ...api import convert from ...hybrid import script _reg.register_reduce_schedule("argmax") diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 7c8ccb7dd827..0fbbaef374df 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -16,14 +16,14 @@ # under the License. #pylint: disable=invalid-name, unused-argument, len-as-condition """Backend compiler related feature registration""" -from __future__ import absolute_import import topi + +from tvm.runtime import convert from topi.util import get_const_tuple from .op import register_compute, register_shape_func from .op import register_broadcast_schedule, register_injective_schedule from .op import register_pattern, OpPattern from ...hybrid import script -from ...api import convert register_broadcast_schedule("log") diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 42c94349da8c..4b350093408e 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -18,13 +18,14 @@ # pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks, too-many-local-variables, too-many-arguments from __future__ import absolute_import import tvm +from tvm import te +from tvm.runtime import convert import topi from topi.util import get_const_int, get_const_tuple from . import op as _reg from . import strategy from .op import OpPattern from ...hybrid import script -from ...api import convert _reg.register_broadcast_schedule("broadcast_to") _reg.register_broadcast_schedule("broadcast_to_like") @@ -79,7 +80,7 @@ def compute_argwhere(attrs, inputs, output_type): output_shape.append(s) else: # see Any, replace it with a var - output_shape.append(tvm.var("any_dim", "int32")) + output_shape.append(te.var("any_dim", "int32")) new_output_type = tvm.relay.ty.TensorType(output_shape, "int32") return [topi.argwhere(new_output_type, inputs[0])] @@ -473,7 +474,7 @@ def squeeze_shape_func(attrs, inputs, _): if keep_axes: out = _squeeze_shape_func(inputs[0], convert(keep_axes)) else: - out = tvm.compute((), lambda *indices: 0) + out = te.compute((), lambda *indices: 0) return [out] @script diff --git a/python/tvm/relay/op/algorithm.py b/python/tvm/relay/op/algorithm.py index 6f875919df4c..17fab80118af 100644 --- a/python/tvm/relay/op/algorithm.py +++ b/python/tvm/relay/op/algorithm.py @@ -28,7 +28,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="int32"): data : relay.Expr The input data tensor. - valid_count : tvm.Tensor + valid_count : tvm.te.Tensor The number of valid elements to be sorted. 
axis : int, optional diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 97a5fa6ec00b..a4fde283daad 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -20,11 +20,12 @@ import topi from topi.util import get_const_tuple + +from tvm.runtime import convert from .. import op as reg from .. import strategy from ..op import OpPattern from .._tensor import elemwise_shape_func -from ....api import convert from ....hybrid import script # relu diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 4fd88f4383df..6be7d4d4f870 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -21,7 +21,6 @@ from ..base import register_relay_node from ..expr import RelayExpr -from ...api import register_func from ...target import get_native_generic_func, GenericFunc from ...runtime import Object from . import _make @@ -155,7 +154,7 @@ def compute(self, attrs, inputs, out_type): attrs : Attrs Op attributes. - inputs : list[tvm.tensor.Tensor] + inputs : list[te.tensor.Tensor] The input tensors. out_type : relay.Type @@ -163,7 +162,7 @@ def compute(self, attrs, inputs, out_type): Returns ------- - outs : list[tvm.tensor.Tensor] + outs : list[te.tensor.Tensor] The output tensors. """ return _OpImplementationCompute(self, attrs, inputs, out_type) @@ -176,7 +175,7 @@ def schedule(self, attrs, outs, target): attrs : Attrs Op attributes. - outs : list[tvm.tensor.Tensor] + outs : list[te.tensor.Tensor] The output tensors. target : tvm.target.Target @@ -184,7 +183,7 @@ def schedule(self, attrs, outs, target): Returns ------- - schedule : tvm.Schedule + schedule : tvm.te.Schedule The schedule. """ return _OpImplementationSchedule(self, attrs, outs, target) @@ -454,11 +453,11 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10): get(op_name).set_attr("TShapeDataDependant", data_dependant, level) return register(op_name, "FShapeFunc", shape_func, level) -@register_func("relay.op.compiler._lower") +@tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) -@register_func("relay.op.compiler._build") +@tvm._ffi.register_func("relay.op.compiler._build") def _build(lowered_funcs): return build(lowered_funcs, target="llvm") @@ -473,7 +472,7 @@ def debug(expr, debug_func=None): if debug_func: name = "debugger_func{}".format(__DEBUG_COUNTER__) - register_func(name, debug_func) + tvm._ffi.register_func(name, debug_func) __DEBUG_COUNTER__ += 1 else: name = '' diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py index 4c3f6d142369..b7fee8c12128 100644 --- a/python/tvm/relay/param_dict.py +++ b/python/tvm/relay/param_dict.py @@ -17,9 +17,11 @@ # pylint: disable=invalid-name """Helper utility to save parameter dicts.""" import tvm +import tvm._ffi -_save_param_dict = tvm.get_global_func("tvm.relay._save_param_dict") -_load_param_dict = tvm.get_global_func("tvm.relay._load_param_dict") + +_save_param_dict = tvm._ffi.get_global_func("tvm.relay._save_param_dict") +_load_param_dict = tvm._ffi.get_global_func("tvm.relay._load_param_dict") def save_param_dict(params): """Save parameter dictionary to binary bytes. diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index be8a3a323316..56a4645058e5 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -16,12 +16,12 @@ # under the License. 
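
The param_dict helpers above wrap packed functions fetched through tvm._ffi; the public entry points are re-exported on the relay namespace. A minimal sketch of the round trip:

    import numpy as np
    import tvm
    from tvm import relay

    params = {"w": tvm.nd.array(np.zeros((2, 2), dtype="float32"))}
    blob = relay.save_param_dict(params)    # serialize to a byte blob
    loaded = relay.load_param_dict(blob)    # parse the blob back to NDArrays
    assert np.array_equal(loaded["w"].asnumpy(), params["w"].asnumpy())
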
#pylint: disable=unused-argument, not-context-manager """Automatic quantization toolkit.""" -from __future__ import absolute_import +import tvm.ir + from . import _quantize from ._calibrate import calibrate from .. import expr as _expr from .. import transform as _transform -from ... import make as _make from ..base import Object, register_relay_node @@ -181,7 +181,7 @@ def qconfig(**kwargs): """ node_args = {k: v if k not in kwargs else kwargs[k] for k, v in QConfig._node_defaults.items()} - return _make.node("relay.quantize.QConfig", **node_args) + return tvm.ir.make_node("relay.quantize.QConfig", **node_args) class QuantizeContext(object): diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index bff01e859a50..54c909179e4f 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -20,6 +20,7 @@ import numpy as np import tvm +from tvm import te import tvm.relay as relay import tvm.relay.op as op from tvm.relay import transform diff --git a/python/tvm/relay/testing/config.py b/python/tvm/relay/testing/config.py index 68756e0a270f..93a08db32d2c 100644 --- a/python/tvm/relay/testing/config.py +++ b/python/tvm/relay/testing/config.py @@ -20,6 +20,7 @@ import os import tvm + def ctx_list(): """Get context list for testcases""" device_list = os.environ.get("RELAY_TEST_TARGETS", "") diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index 08b41b28bd35..45535afc486c 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -23,6 +23,7 @@ import functools import tvm +from tvm import te from tvm.runtime import ndarray as _nd from tvm.ir.transform import PassInfo, PassContext, Pass, ModulePass, Sequential, module_pass diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 211bee32ed3c..2643ff131ba0 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -106,6 +106,7 @@ def save(self): import numpy as np import tvm +from tvm import te from tvm import relay # define a simple network. x = relay.var('x', shape=(10, 10)) diff --git a/python/tvm/target/build_config.py b/python/tvm/target/build_config.py index 8782d24d2da9..c105175d3e26 100644 --- a/python/tvm/target/build_config.py +++ b/python/tvm/target/build_config.py @@ -35,7 +35,7 @@ class DumpIR(object): ----------- .. code-block:: python - with tvm.build_config(dump_pass_ir=True) + with tvm.target.build_config(dump_pass_ir=True) run() """ scope_level = 0 diff --git a/python/tvm/target/generic_func.py b/python/tvm/target/generic_func.py index 1936ff1511be..bfcd2dd56b4f 100644 --- a/python/tvm/target/generic_func.py +++ b/python/tvm/target/generic_func.py @@ -116,6 +116,7 @@ def override_native_generic_func(func_name): .. code-block:: python import tvm +from tvm import te # wrap function as target generic @tvm.target.override_native_generic_func("my_func") def my_func(a): @@ -210,6 +211,7 @@ def generic_func(fdefault): .. 
code-block:: python import tvm +from tvm import te # wrap function as target generic @tvm.target.generic_func def my_func(a): diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py index 5970315e854b..065cf4e5dbdd 100644 --- a/python/tvm/te/__init__.py +++ b/python/tvm/te/__init__.py @@ -18,6 +18,7 @@ """Namespace for Tensor Expression Language """ # expose all operators in tvm tir.op +from tvm.tir import any, all, min_value, max_value, trace from tvm.tir import exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil from tvm.tir import trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else from tvm.tir import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod @@ -29,3 +30,5 @@ from .tag import tag_scope from .operation import placeholder, compute, scan, extern, var, size_var from .operation import thread_axis, reduce_axis + +from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp, HybridOp diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index 3c5b610e99be..3ccab5bfd9c3 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -167,13 +167,13 @@ def scan(init, update, state_placeholder, inputs=None, name="scan", tag="", attr .. code-block:: python # The following code is equivalent to numpy.cumsum - m = tvm.var("m") - n = tvm.var("n") - X = tvm.placeholder((m, n), name="X") - s_state = tvm.placeholder((m, n)) - s_init = tvm.compute((1, n), lambda _, i: X[0, i]) - s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) - res = tvm.scan(s_init, s_update, s_state, X) + m = te.var("m") + n = te.var("n") + X = te.placeholder((m, n), name="X") + s_state = te.placeholder((m, n)) + s_init = te.compute((1, n), lambda _, i: X[0, i]) + s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) + res = tvm.te.scan(s_init, s_update, s_state, X) """ if _tag.TagScope.get_current() is not None: if tag != "": @@ -264,10 +264,10 @@ def extern(shape, .. code-block:: python - A = tvm.placeholder((n, l), name="A") - B = tvm.placeholder((l, m), name="B") - C = tvm.extern((n, m), [A, B], - lambda ins, outs: tvm.call_packed( + A = te.placeholder((n, l), name="A") + B = te.placeholder((l, m), name="B") + C = te.extern((n, m), [A, B], + lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], 0, 0), name="C") """ diff --git a/python/tvm/te/tag.py b/python/tvm/te/tag.py index 189076d03cc3..1022875ce3dd 100644 --- a/python/tvm/te/tag.py +++ b/python/tvm/te/tag.py @@ -73,19 +73,19 @@ def tag_scope(tag): ------- .. 
code-block:: python - n = tvm.var('n') - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((n, l), name='A') - B = tvm.placeholder((m, l), name='B') - k = tvm.reduce_axis((0, l), name='k') + n = te.var('n') + m = te.var('m') + l = te.var('l') + A = te.placeholder((n, l), name='A') + B = te.placeholder((m, l), name='B') + k = te.reduce_axis((0, l), name='k') - with tvm.tag_scope(tag='matmul'): - C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k)) + with tvm.te.tag_scope(tag='matmul'): + C = te.compute((n, m), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k)) # or use tag_scope as decorator - @tvm.tag_scope(tag="conv") + @tvm.te.tag_scope(tag="conv") def compute_relu(data): - return tvm.compute(data.shape, lambda *i: tvm.select(data(*i) < 0, 0.0, data(*i))) + return te.compute(data.shape, lambda *i: tvm.select(data(*i) < 0, 0.0, data(*i))) """ return TagScope(tag) diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index ab78ca6d6d63..a5c81ac77627 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -17,20 +17,22 @@ # pylint: disable=unused-import, redefined-builtin """Namespace for Tensor-level IR""" from tvm.ir import PrimExpr +from tvm.runtime import const + from .buffer import Buffer, decl_buffer from .data_layout import Layout, BijectiveLayout, bijective_layout, layout from .expr import Var, SizeVar, Reduce, FloatImm, IntImm, StringImm, Cast from .expr import Add, Sub, Mul, Div, Mod, FloorDiv, FloorMod from .expr import Min, Max, EQ, NE, LT, LE, GT, GE, And, Or, Not from .expr import Select, Load, Ramp, Broadcast, Shuffle, Call, Let -from .expr import IterVar +from .expr import IterVar, Any from .stmt import Stmt, LetStmt, AssertStmt, ProducerConsumer, For from .stmt import Store, Provide, Allocate, AttrStmt, Free, Realize, SeqStmt from .stmt import IfThenElse, Evaluate, Prefetch, LoweredFunc, stmt_seq, stmt_list from .op import call_packed, call_pure_intrin, call_intrin, call_pure_extern, call_extern -from .op import call_llvm_intrin, all, any, min_value, max_value +from .op import call_llvm_intrin, all, any, min_value, max_value, trace from .op import exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil from .op import trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else from .op import div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py index d0d01d7479be..0c7753e4d8ec 100644 --- a/python/tvm/tir/buffer.py +++ b/python/tvm/tir/buffer.py @@ -201,15 +201,15 @@ def decl_buffer(shape, .. 
code-block:: python - m0, m1, m2 = tvm.var("m0"), tvm.var("m1"), tvm.var("m2") - n0, n1, n2 = tvm.var("n0"), tvm.var("n1"), tvm.var("n2") - o0, o1, o2 = tvm.var("o0"), tvm.var("o1"), tvm.var("o2") - A = tvm.placeholder((m0, m1, m2), name='A') - B = tvm.placeholder((n0, n1, n2), name='B') - C = tvm.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C') + m0, m1, m2 = te.var("m0"), te.var("m1"), te.var("m2") + n0, n1, n2 = te.var("n0"), te.var("n1"), te.var("n2") + o0, o1, o2 = te.var("o0"), te.var("o1"), te.var("o2") + A = te.placeholder((m0, m1, m2), name='A') + B = te.placeholder((n0, n1, n2), name='B') + C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C') Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast") Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast") - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) fadd = tvm.build(s, [A, B, C], target='llvm', name='bcast_add', binds={A:Ab, B:Bb}) ctx = tvm.cpu(0) a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), ctx) diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index acf5f51941dc..bcf596787cd4 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -25,7 +25,7 @@ .. code-block:: python - x = tvm.var("n") + x = te.var("n") y = x + 2 assert(isinstance(y, tvm.tir.Add)) assert(y.a == x) @@ -169,7 +169,7 @@ def __ge__(self, other): def __nonzero__(self): raise ValueError("Cannot use and / or / not operator to Expr, hint: " + - "use tvm.all / tvm.any instead") + "use tvm.tir.all / tvm.tir.any instead") def __bool__(self): return self.__nonzero__() @@ -346,8 +346,8 @@ class IterVar(Object, ExprOp): See Also -------- - tvm.thread_axis: Create thread axis IterVar. - tvm.reduce_axis: Create reduce axis IterVar. + te.thread_axis: Create thread axis IterVar. + te.reduce_axis: Create reduce axis IterVar. """ DataPar = 0 ThreadIndex = 1 @@ -812,7 +812,7 @@ class Select(PrimExprWithOp): Note ---- Select may compute both true_value and false_value. - Use :py:class:`tvm.if_then_else` instead if you want to + Use :py:class:`tvm.tir.if_then_else` instead if you want to get a conditional expression that only evaluates the correct branch. diff --git a/python/tvm/tir/generic.py b/python/tvm/tir/generic.py index 8a9cf8eeb50d..88be5b1dfd64 100644 --- a/python/tvm/tir/generic.py +++ b/python/tvm/tir/generic.py @@ -16,7 +16,7 @@ # under the License. """Generic opertors in TVM. We follow the numpy naming convention for this interface -(e.g., tvm.generic.multitply ~ numpy.multiply). +(e.g., tvm.tir.generic.multiply ~ numpy.multiply). The default implementation is used by tvm.ExprOp. """ # pylint: disable=unused-argument diff --git a/python/tvm/tir/ir_builder.py b/python/tvm/tir/ir_builder.py index b56e15377358..885b8475082e 100644 --- a/python/tvm/tir/ir_builder.py +++ b/python/tvm/tir/ir_builder.py @@ -98,8 +98,8 @@ class IRBuilder(object): -------- .. code-block:: python - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") A = ib.allocate("float32", n, name="A") with ib.for_range(0, n, name="i") as i: with ib.if_scope((i % 2) == 0): @@ -158,8 +158,8 @@ def scope_attr(self, node, attr_key, value): -------- ..
code-block:: python - ib = tvm.ir_builder.create() - i = tvm.var("i") + ib = tvm.tir.ir_builder.create() + i = te.var("i") x = ib.pointer("float32") ib.scope_attr(x, "storage_scope", "global") x[i] = x[i - 1] + 1 @@ -200,7 +200,7 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"): -------- .. code-block:: python - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() x = ib.pointer("float32") with ib.for_range(1, 10, name="i") as i: x[i] = x[i - 1] + 1 @@ -243,8 +243,8 @@ def if_scope(self, cond): -------- .. code-block:: python - ib = tvm.ir_builder.create() - i = tvm.var("i") + ib = tvm.tir.ir_builder.create() + i = te.var("i") x = ib.pointer("float32") with ib.if_scope((i % 2) == 0): x[i] = x[i - 1] + 1 @@ -268,8 +268,8 @@ def else_scope(self): -------- .. code-block:: python - ib = tvm.ir_builder.create() - i = tvm.var("i") + ib = tvm.tir.ir_builder.create() + i = te.var("i") x = ib.pointer("float32") with ib.if_scope((i % 2) == 0): x[i] = x[i - 1] + 1 diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index 66e70c508438..4a52787262bb 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -64,7 +64,7 @@ def call_packed(*args): See Also -------- - tvm.extern : Create tensor with extern function call. + te.extern : Create tensor with extern function call. """ call_args = [_pack_buffer(x) if isinstance(x, Buffer) else x for x in args] return Call( @@ -194,7 +194,7 @@ def call_llvm_intrin(dtype, name, *args): from tvm.target import codegen llvm_id = codegen.llvm_lookup_intrinsic_id(name) assert llvm_id != 0, "%s is not an LLVM intrinsic" % name - return call_pure_intrin(dtype, 'llvm_intrin', tvm.const(llvm_id, 'uint32'), *args) + return call_pure_intrin(dtype, 'llvm_intrin', tvm.tir.const(llvm_id, 'uint32'), *args) def any(*args): @@ -274,7 +274,7 @@ def trace(args, trace_action="tvm.default_trace_action"): tvm.tir.call_packed : Creates packed function. """ if not isinstance(args, list): - raise Exception("tvm.trace consumes the args as list type") + raise Exception("tvm.tir.trace consumes the args as list type") call_args = [_pack_buffer(x) if isinstance(x, Buffer) else x for x in args] call_args.insert(0, trace_action) return tvm.tir.Call( @@ -556,9 +556,9 @@ def round(x): def nearbyint(x): """Round elements of the array to the nearest integer. This intrinsic uses llvm.nearbyint instead of llvm.round - which is faster but will results different from tvm.round. + which is faster but can give results different from te.round. Notably nearbyint rounds according to the rounding mode, - whereas tvm.round (llvm.round) ignores that. + whereas te.round (llvm.round) ignores that. For differences between the two see: https://en.cppreference.com/w/cpp/numeric/math/round https://en.cppreference.com/w/cpp/numeric/math/nearbyint @@ -855,13 +855,13 @@ def comm_reducer(fcombine, fidentity, name="reduce"): ------- ..
code-block:: python - n = tvm.var("n") - m = tvm.var("m") - mysum = tvm.comm_reducer(lambda x, y: x+y, - lambda t: tvm.const(0, dtype=t), name="mysum") - A = tvm.placeholder((n, m), name="A") - k = tvm.reduce_axis((0, m), name="k") - B = tvm.compute((n,), lambda i: mysum(A[i, k], axis=k), name="B") + n = te.var("n") + m = te.var("m") + mysum = te.comm_reducer(lambda x, y: x+y, + lambda t: tvm.tir.const(0, dtype=t), name="mysum") + A = te.placeholder((n, m), name="A") + k = te.reduce_axis((0, m), name="k") + B = te.compute((n,), lambda i: mysum(A[i, k], axis=k), name="B") """ def _reduce_directly(*args): num = len(args) @@ -943,14 +943,14 @@ def reducer(expr, axis, where=None, *args): ------- .. code-block:: python - m = tvm.var("m") - n = tvm.var("n") - A = tvm.placeholder((m, n), name="A") - k = tvm.reduce_axis((0, n), name="k") + m = te.var("m") + n = te.var("n") + A = te.placeholder((m, n), name="A") + k = te.reduce_axis((0, n), name="k") # there are two way to use this {0} reducer: # mode 1, accept (expr, axis, where) to produce an Reduce Expr - B = tvm.compute((m,), lambda i: tvm.{0}(A[i, k], axis=k), name="B") + B = te.compute((m,), lambda i: tvm.{0}(A[i, k], axis=k), name="B") # mode 2, simply use it with multiple Exprs: {0}_res = tvm.{0}(m, n) diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index bc02b7d23ead..65c72ddfeb36 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -23,8 +23,8 @@ .. code-block:: python - x = tvm.var("n") - a = tvm.var("array", tvm.handle) + x = te.var("n") + a = te.var("array", "handle") st = tvm.tir.stmt.Store(a, x + 1, 1) assert isinstance(st, tvm.tir.stmt.Store) assert(st.buffer_var == a) diff --git a/rust/frontend/examples/resnet/src/build_resnet.py b/rust/frontend/examples/resnet/src/build_resnet.py index e71381888c1f..49c67bf1c4f3 100644 --- a/rust/frontend/examples/resnet/src/build_resnet.py +++ b/rust/frontend/examples/resnet/src/build_resnet.py @@ -25,6 +25,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import testing from tvm.contrib import graph_runtime, cc diff --git a/rust/frontend/tests/basics/src/tvm_add.py b/rust/frontend/tests/basics/src/tvm_add.py index 287084bcf9aa..3911d4074e45 100755 --- a/rust/frontend/tests/basics/src/tvm_add.py +++ b/rust/frontend/tests/basics/src/tvm_add.py @@ -20,20 +20,21 @@ import sys import tvm +from tvm import te from tvm.contrib import cc def main(target, out_dir): - n = tvm.var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C') - s = tvm.create_schedule(C.op) + n = te.var('n') + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda i: A[i] + B[i], name='C') + s = te.create_schedule(C.op) if target == 'cuda': bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, tvm.thread_axis('blockIdx.x')) - s[C].bind(tx, tvm.thread_axis('threadIdx.x')) + s[C].bind(bx, te.thread_axis('blockIdx.x')) + s[C].bind(tx, te.thread_axis('threadIdx.x')) fadd = tvm.build(s, [A, B, C], target, target_host='llvm', name='myadd') diff --git a/rust/runtime/tests/build_model.py b/rust/runtime/tests/build_model.py index e3da95f24fd8..d1dffad37249 100755 --- a/rust/runtime/tests/build_model.py +++ b/rust/runtime/tests/build_model.py @@ -22,6 +22,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import testing diff --git a/rust/runtime/tests/test_nn/src/build_test_graph.py 
b/rust/runtime/tests/test_nn/src/build_test_graph.py index dd7621b921f7..832dddf12d76 100755 --- a/rust/runtime/tests/test_nn/src/build_test_graph.py +++ b/rust/runtime/tests/test_nn/src/build_test_graph.py @@ -23,6 +23,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import testing diff --git a/rust/runtime/tests/test_tvm_basic/src/build_test_lib.py b/rust/runtime/tests/test_tvm_basic/src/build_test_lib.py index 38c1f3a7a223..bf7e60a1df6e 100755 --- a/rust/runtime/tests/test_tvm_basic/src/build_test_lib.py +++ b/rust/runtime/tests/test_tvm_basic/src/build_test_lib.py @@ -22,13 +22,14 @@ import sys import tvm +from tvm import te def main(): - n = tvm.var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = te.var('n') + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = tvm.te.create_schedule(C.op) s[C].parallel(s[C].op.axis[0]) print(tvm.lower(s, [A, B, C], simple_mode=True)) tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o')) diff --git a/rust/runtime/tests/test_tvm_dso/src/build_test_lib.py b/rust/runtime/tests/test_tvm_dso/src/build_test_lib.py index 63b43a5f9bef..cb7353ff70ab 100755 --- a/rust/runtime/tests/test_tvm_dso/src/build_test_lib.py +++ b/rust/runtime/tests/test_tvm_dso/src/build_test_lib.py @@ -22,14 +22,15 @@ import sys import tvm +from tvm import te from tvm.contrib import cc def main(): - n = tvm.var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = te.var('n') + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = tvm.te.create_schedule(C.op) s[C].parallel(s[C].op.axis[0]) print(tvm.lower(s, [A, B, C], simple_mode=True)) obj_file = osp.join(sys.argv[1], 'test.o') diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 44739bbda3cb..3106e73136fa 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -24,6 +24,7 @@ """ import tvm +from tvm import te import subprocess from tvm.contrib import util from tvm.contrib import cc diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index 99614a8d93ad..18ea57a003f0 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. 
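The Rust test scripts above all migrate to the same declare-schedule-build flow in the te namespace. A condensed, runnable sketch of that flow, assuming an LLVM-enabled TVM build:

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import te

    n = te.var('n')
    A = te.placeholder((n,), name='A')
    B = te.placeholder((n,), name='B')
    C = te.compute(A.shape, lambda i: A[i] + B[i], name='C')
    s = te.create_schedule(C.op)
    fadd = tvm.build(s, [A, B, C], 'llvm', name='myadd')

    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=8).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=8).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros(8, dtype=C.dtype), ctx)
    fadd(a, b, c)
    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())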
import tvm +from tvm import te import numpy as np import topi.testing from tvm.contrib import cblas -def verify_matmul_add(m, l, n, transa=False, transb=False, dtype=tvm.float32): - bias = tvm.var('bias', dtype=dtype) +def verify_matmul_add(m, l, n, transa=False, transb=False, dtype="float32"): + bias = te.var('bias', dtype=dtype) ashape = (l, n) if transa else (n, l) bshape = (m, l) if transb else (l, m) - A = tvm.placeholder(ashape, name='A', dtype=dtype) - B = tvm.placeholder(bshape, name='B', dtype=dtype) + A = te.placeholder(ashape, name='A', dtype=dtype) + B = te.placeholder(bshape, name='B', dtype=dtype) C = cblas.matmul(A, B, transa, transb) - D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") - s = tvm.create_schedule(D.op) + D = te.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") + s = te.create_schedule(D.op) def get_numpy(a, b, bb, transa, transb): if transa: @@ -64,14 +65,14 @@ def test_matmul_add(): verify_matmul_add(1, 16, 3, False, False) verify_matmul_add(1, 16, 3, True, True) -def verify_batch_matmul(batch, m, l, n, transa=False, transb=False, iterative=False, dtype=tvm.float32): +def verify_batch_matmul(batch, m, l, n, transa=False, transb=False, iterative=False, dtype="float32"): ashape = (batch, l, n) if transa else (batch, n, l) bshape = (batch, m, l) if transb else (batch, l, m) - A = tvm.placeholder(ashape, name='A', dtype=dtype) - B = tvm.placeholder(bshape, name='B', dtype=dtype) + A = te.placeholder(ashape, name='A', dtype=dtype) + B = te.placeholder(bshape, name='B', dtype=dtype) C = cblas.batch_matmul(A, B, transa, transb) - D = tvm.compute(C.shape, lambda k, i, j: C[k, i,j], name="D") - s = tvm.create_schedule(D.op) + D = te.compute(C.shape, lambda k, i, j: C[k, i,j], name="D") + s = te.create_schedule(D.op) def get_numpy(a, b, transa, transb): if transa: diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index a3baa8c829e3..517e6e124030 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
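Note how little the cblas test above actually changes: the contrib wrapper keeps its API, and only the schedule and compute construction move to te. A trimmed sketch, assuming TVM was built with BLAS support:

.. code-block:: python

    from tvm import te
    from tvm.contrib import cblas

    A = te.placeholder((64, 32), name='A')
    B = te.placeholder((32, 16), name='B')
    C = cblas.matmul(A, B)          # extern op, API unchanged by this patch
    D = te.compute(C.shape, lambda i, j: C[i, j] + 1.0, name='D')
    s = te.create_schedule(D.op)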
import tvm +from tvm import te import numpy as np from tvm.contrib import cublas from tvm.contrib import cublaslt @@ -23,10 +24,10 @@ def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5): n = 1024 l = 128 m = 236 - A = tvm.placeholder((n, l), name='A', dtype=in_dtype) - B = tvm.placeholder((l, m), name='B', dtype=in_dtype) + A = te.placeholder((n, l), name='A', dtype=in_dtype) + B = te.placeholder((l, m), name='B', dtype=in_dtype) C = cublas.matmul(A, B, dtype=out_dtype) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) def verify(target="cuda"): if not tvm.runtime.enabled(target): @@ -56,11 +57,11 @@ def verify_matmul_add_igemm(in_dtype, out_dtype, rtol=1e-5): N = roundoff(n, 8) N_out = roundoff(n, 32) - A = tvm.placeholder((N, L), name='A', dtype=in_dtype) - B = tvm.placeholder((m, L), name='B', dtype=in_dtype) + A = te.placeholder((N, L), name='A', dtype=in_dtype) + B = te.placeholder((m, L), name='B', dtype=in_dtype) # C has CUBLASLT_ORDER_COL32 layout, thus a different shape C = cublaslt.matmul(A, B, False, True, m, N_out, dtype=out_dtype) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) def verify(target="cuda"): if not tvm.runtime.enabled(target): @@ -108,10 +109,10 @@ def verify_batch_matmul(in_dtype, out_dtype, rtol=1e-5): n = 1024 l = 128 m = 236 - A = tvm.placeholder((j, n, l), name='A', dtype=in_dtype) - B = tvm.placeholder((j, l, m), name='B', dtype=in_dtype) + A = te.placeholder((j, n, l), name='A', dtype=in_dtype) + B = te.placeholder((j, l, m), name='B', dtype=in_dtype) C = cublas.batch_matmul(A, B, dtype=out_dtype) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) def verify(target="cuda"): if not tvm.runtime.enabled(target): diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index 1a22f90eb804..58e7b4905988 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te from tvm.contrib import cudnn import numpy as np import topi.testing @@ -48,8 +49,8 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0): xshape = [batch, height, weight, in_channel] wshape = [out_channel, filter_h, filter_w, in_channel] - X = tvm.placeholder(xshape, name='X', dtype=data_dtype) - W = tvm.placeholder(wshape, name='W', dtype=data_dtype) + X = te.placeholder(xshape, name='X', dtype=data_dtype) + W = te.placeholder(wshape, name='W', dtype=data_dtype) Y = cudnn.conv_forward(X, W, [pad_h, pad_w], @@ -60,7 +61,7 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0): conv_dtype=conv_dtype, algo=-1) yshape = [x.value for x in Y.shape] - s = tvm.create_schedule(Y.op) + s = te.create_schedule(Y.op) def verify(): ctx = tvm.gpu(0) @@ -120,8 +121,8 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0): xshape = [batch, in_channel, depth, height, weight] wshape = [out_channel, in_channel, filter_d, filter_h, filter_w] - X = tvm.placeholder(xshape, name='X', dtype=data_dtype) - W = tvm.placeholder(wshape, name='W', dtype=data_dtype) + X = te.placeholder(xshape, name='X', dtype=data_dtype) + W = te.placeholder(wshape, name='W', dtype=data_dtype) Y = cudnn.conv_forward(X, W, [pad_d, pad_h, pad_w], @@ -132,7 +133,7 @@ def verify_conv3d(data_dtype, conv_dtype, tensor_format=0): algo=-1, conv_dtype=conv_dtype) yshape = [x.value for x in Y.shape] - s = tvm.create_schedule(Y.op) + s = te.create_schedule(Y.op) def verify(): ctx = tvm.gpu(0) diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py index f39595582f03..453556c83e18 100644 --- a/tests/python/contrib/test_dlpack.py +++ b/tests/python/contrib/test_dlpack.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np from tvm.contrib.dlpack import to_pytorch_func @@ -34,17 +35,17 @@ def test(): np.testing.assert_equal(y.asnumpy(), tvm_x.asnumpy()) np.testing.assert_equal(torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), tvm_x.asnumpy()) - n = tvm.convert(137) + n = tvm.runtime.convert(137) xx = torch.rand(137,137) yy = torch.rand(137,137) zz2 = torch.empty(137,137) zz = xx.mm(yy) - XX = tvm.placeholder((n,n), name='X') - YY = tvm.placeholder((n,n), name='Y') + XX = te.placeholder((n,n), name='X') + YY = te.placeholder((n,n), name='Y') - k = tvm.reduce_axis((0, n), name='k') - ZZ = tvm.compute((n,n), lambda i,j : tvm.sum(XX[i,k]*YY[k,j], axis=k)) - s = tvm.create_schedule(ZZ.op) + k = te.reduce_axis((0, n), name='k') + ZZ = te.compute((n,n), lambda i,j : te.sum(XX[i,k]*YY[k,j], axis=k)) + s = te.create_schedule(ZZ.op) f = tvm.build(s, [XX, YY, ZZ], target_host='llvm', name='f') f_pytorch = to_pytorch_func(f) diff --git a/tests/python/contrib/test_edgetpu_runtime.py b/tests/python/contrib/test_edgetpu_runtime.py index a5d9e34e2efb..625dc94b7ec8 100644 --- a/tests/python/contrib/test_edgetpu_runtime.py +++ b/tests/python/contrib/test_edgetpu_runtime.py @@ -16,6 +16,7 @@ # under the License. import os import tvm +from tvm import te import numpy as np from tvm import rpc from tvm.contrib import util, tflite_runtime diff --git a/tests/python/contrib/test_gemm_acc16.py b/tests/python/contrib/test_gemm_acc16.py index d83ecdc35b6e..1fd5974cd2dc 100644 --- a/tests/python/contrib/test_gemm_acc16.py +++ b/tests/python/contrib/test_gemm_acc16.py @@ -16,6 +16,7 @@ # under the License. 
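The dlpack test above drives a TVM kernel directly on torch tensors via to_pytorch_func. A self-contained sketch of the same idea, assuming PyTorch and an LLVM-enabled TVM are installed:

.. code-block:: python

    import torch
    import tvm
    from tvm import te
    from tvm.contrib.dlpack import to_pytorch_func

    n = 8
    A = te.placeholder((n,), name='A')
    B = te.compute((n,), lambda i: A[i] + 1.0, name='B')
    s = te.create_schedule(B.op)
    f = tvm.build(s, [A, B], 'llvm')

    f_torch = to_pytorch_func(f)  # zero-copy bridge via dlpack
    x = torch.rand(n)
    y = torch.empty(n)
    f_torch(x, y)
    assert torch.allclose(y, x + 1.0)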
# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition import tvm +from tvm import te import numpy as np from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16 @@ -25,8 +26,8 @@ def benchmark_fc_int8_acc16(): n = 128 k = 128 - X = tvm.placeholder((m, k), name='X', dtype="uint8") - W = tvm.placeholder((n, k), name='W', dtype="int8") + X = te.placeholder((m, k), name='X', dtype="uint8") + W = te.placeholder((n, k), name='W', dtype="int8") peak = 512/16*2*2*2 gops_per_mm = 2*n*m*k @@ -38,15 +39,15 @@ def verify(target="llvm -mcpu=skylake-avx512"): return ctx = tvm.context(target, 0) - X = tvm.placeholder((m, k), name='X', dtype="uint8") - W = tvm.placeholder((n, k), name='W', dtype="int8") + X = te.placeholder((m, k), name='X', dtype="uint8") + W = te.placeholder((n, k), name='W', dtype="int8") pc = dot_16x1x16_uint8_int8_int16() - ak = tvm.reduce_axis((0, k), name='k') + ak = te.reduce_axis((0, k), name='k') - packedW = tvm.placeholder((n//128, 128*(k//2), 2), name='packedW', dtype="int8") - t_fc = tvm.compute((m, n), lambda i, j: tvm.sum(X[i, ak].astype("int16") * packedW[j//128, (ak//2)*128+j%128, ak%2].astype("int16"), axis=ak), name="F") + packedW = te.placeholder((n//128, 128*(k//2), 2), name='packedW', dtype="int8") + t_fc = te.compute((m, n), lambda i, j: te.sum(X[i, ak].astype("int16") * packedW[j//128, (ak//2)*128+j%128, ak%2].astype("int16"), axis=ak), name="F") - t_sch = tvm.create_schedule(t_fc.op) + t_sch = te.create_schedule(t_fc.op) a_x, a_y = t_fc.op.axis a_k, = t_fc.op.reduce_axis diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py index e810da7d3b07..f723ccb1d235 100644 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ b/tests/python/contrib/test_gemm_acc32_vnni.py @@ -17,6 +17,7 @@ # pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition import tvm +from tvm import te import numpy as np from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32_cascadelake from topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32 @@ -29,8 +30,8 @@ def test_fc_int8_acc32(): n = 1024 k = 1024 - X = tvm.placeholder((m, k), name='X', dtype="uint8") - W = tvm.placeholder((n, k), name='W', dtype="int8") + X = te.placeholder((m, k), name='X', dtype="uint8") + W = te.placeholder((n, k), name='W', dtype="int8") peak = 280 print("Peak {} Gops/s".format(peak)) @@ -47,13 +48,13 @@ def verify(target="llvm -mcpu=cascadelake"): ctx = tvm.context(target, 0) pc = dot_16x1x16_uint8_int8_int32_cascadelake() - ak = tvm.reduce_axis((0, k), name='k') - packedW = tvm.placeholder( + ak = te.reduce_axis((0, k), name='k') + packedW = te.placeholder( (n // 16, 16 * (k // 4), 4), name='packedW', dtype="int8") - t_fc = tvm.compute((m, n), lambda i, j: tvm.sum(X[i, ak].astype( + t_fc = te.compute((m, n), lambda i, j: te.sum(X[i, ak].astype( "int32") * packedW[j / 16, (ak / 4) * 16 + j % 16, ak % 4].astype("int32"), axis=ak), name="F") - t_sch = tvm.create_schedule(t_fc.op) + t_sch = te.create_schedule(t_fc.op) a_x, a_y = t_fc.op.axis a_k, = t_fc.op.reduce_axis diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py index d7a46e5d94ad..b4bedd84e2e1 100644 --- a/tests/python/contrib/test_miopen.py +++ b/tests/python/contrib/test_miopen.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
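Stripped of the weight packing and the tensorize intrinsics, the int8 GEMM benchmarks above boil down to this reduction pattern; a sketch only, not the benchmarked kernel:

.. code-block:: python

    from tvm import te

    m, n, k = 16, 16, 64
    X = te.placeholder((m, k), name='X', dtype='uint8')
    W = te.placeholder((n, k), name='W', dtype='int8')
    ak = te.reduce_axis((0, k), name='k')
    F = te.compute((m, n),
                   lambda i, j: te.sum(X[i, ak].astype('int32') *
                                       W[j, ak].astype('int32'), axis=ak),
                   name='F')
    s = te.create_schedule(F.op)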
import tvm +from tvm import te from tvm.contrib import miopen import numpy as np @@ -40,8 +41,8 @@ def test_conv2d(): return wshape = (out_channel, in_channel, filter_h, filter_w) - X = tvm.placeholder(xshape, name='X') - W = tvm.placeholder(wshape, name='W') + X = te.placeholder(xshape, name='X') + W = te.placeholder(wshape, name='W') Y = miopen.conv2d_forward(X, W, stride_h, diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py index fc85290c56e7..b5243659c1d5 100644 --- a/tests/python/contrib/test_mps.py +++ b/tests/python/contrib/test_mps.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np from tvm.contrib import mps @@ -25,19 +26,19 @@ def test_matmul(): n = 1024 l = 128 m = 256 - A = tvm.placeholder((n, l), name='A') - B = tvm.placeholder((l, m), name='B') + A = te.placeholder((n, l), name='A') + B = te.placeholder((l, m), name='B') C = mps.matmul(A, B) - D = tvm.compute( + D = te.compute( C.shape, lambda *i: C(*i) + 1. ) - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) yo, xo = D.op.axis - block_y = tvm.thread_axis("blockIdx.y") - block_x = tvm.thread_axis("blockIdx.x") - thread_y = tvm.thread_axis("threadIdx.y") - thread_x = tvm.thread_axis("threadIdx.x") + block_y = te.thread_axis("blockIdx.y") + block_x = te.thread_axis("blockIdx.x") + thread_y = te.thread_axis("threadIdx.y") + thread_x = te.thread_axis("threadIdx.x") by, ty = s[D].split(yo, factor=16) bx, tx = s[D].split(xo, factor=16) s[D].bind(by, block_y) @@ -73,10 +74,10 @@ def test_conv2d(): kh = 3 kw = 3 stride = 2 - A = tvm.placeholder((n, h, w, ci), name="x") - B = tvm.placeholder((co, kh, kw, ci), name="w") + A = te.placeholder((n, h, w, ci), name="x") + B = te.placeholder((co, kh, kw, ci), name="w") C = mps.conv2d(A, B, 'SAME', 2) - s1 = tvm.create_schedule(C.op) + s1 = te.create_schedule(C.op) def verify(A, B, C, target="llvm"): if not tvm.get_global_func("tvm.contrib.mps.conv2d", True): diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py index 9f1be7e2b9f1..37c164483e18 100644 --- a/tests/python/contrib/test_mxnet_bridge.py +++ b/tests/python/contrib/test_mxnet_bridge.py @@ -24,17 +24,18 @@ def mxnet_check(): import mxnet as mx import topi import tvm + from tvm import te import numpy as np from tvm.contrib.mxnet import to_mxnet_func # build a TVM function through topi n = 20 shape = (20,) - scale = tvm.var("scale", dtype="float32") - x = tvm.placeholder(shape) - y = tvm.placeholder(shape) + scale = te.var("scale", dtype="float32") + x = te.placeholder(shape) + y = te.placeholder(shape) z = topi.broadcast_add(x, y) - zz = tvm.compute(shape, lambda *i: z(*i) * scale) + zz = te.compute(shape, lambda *i: z(*i) * scale) target = tvm.target.cuda() diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py index af8ae133923d..505199a55724 100644 --- a/tests/python/contrib/test_nnpack.py +++ b/tests/python/contrib/test_nnpack.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
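The split-and-bind idiom from the mps test above, isolated in the new namespace. Building the schedule is host-only; a GPU-enabled build is needed only once tvm.build is called:

.. code-block:: python

    from tvm import te

    n = 1024
    A = te.placeholder((n,), name='A')
    B = te.compute((n,), lambda i: A[i] * 2.0, name='B')
    s = te.create_schedule(B.op)
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, te.thread_axis('blockIdx.x'))
    s[B].bind(tx, te.thread_axis('threadIdx.x'))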
import tvm +from tvm import te import numpy as np import scipy.signal from topi.nn.util import get_pad_tuple @@ -26,12 +27,12 @@ def test_fully_connected_inference(): n = 1024 l = 128 m = 235 - bias = tvm.var('bias', dtype=tvm.float32) - A = tvm.placeholder((l, ), name='A') - B = tvm.placeholder((m, l), name='B') + bias = te.var('bias', dtype="float32") + A = te.placeholder((l, ), name='A') + B = te.placeholder((m, l), name='B') C = nnpack.fully_connected_inference(A, B) - D = tvm.compute(C.shape, lambda i: C[i] + bias, name="D") - s = tvm.create_schedule(D.op) + D = te.compute(C.shape, lambda i: C[i] + bias, name="D") + s = te.create_schedule(D.op) def verify(target="llvm"): if not tvm.runtime.enabled(target): @@ -98,9 +99,9 @@ def test_convolution_inference(): bshape = (OC, ) oshape = (BATCH, OC, OH, OW) - data = tvm.placeholder(dshape, name='data') - kernel = tvm.placeholder(kshape, name='kernel') - bias = tvm.placeholder(bshape, name='bias') + data = te.placeholder(dshape, name='data') + kernel = te.placeholder(kshape, name='kernel') + bias = te.placeholder(bshape, name='bias') def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True): @@ -116,7 +117,7 @@ def verify(target="llvm", data, kernel, bias if with_bias else None, [PAD, PAD, PAD, PAD], [STRIDE, STRIDE], algorithm=algorithm) - s = tvm.create_schedule(output.op) + s = te.create_schedule(output.op) f = tvm.build(s, [data, kernel, bias, output], target) @@ -160,9 +161,9 @@ def test_convolution_inference_without_weight_transform(): bshape = (OC, ) oshape = (BATCH, OC, OH, OW) - data = tvm.placeholder(dshape, name='data') - kernel = tvm.placeholder(kshape, name='kernel') - bias = tvm.placeholder(bshape, name='bias') + data = te.placeholder(dshape, name='data') + kernel = te.placeholder(kshape, name='kernel') + bias = te.placeholder(bshape, name='bias') def verify(target="llvm", algorithm=nnpack.ConvolutionAlgorithm.AUTO, with_bias=True): @@ -181,7 +182,7 @@ def verify(target="llvm", [PAD, PAD, PAD, PAD], [STRIDE, STRIDE], algorithm=algorithm) - s = tvm.create_schedule(output.op) + s = te.create_schedule(output.op) f = tvm.build(s, [data, kernel, bias, output], target) diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py index f86a42447d81..9efdc3e5a763 100644 --- a/tests/python/contrib/test_random.py +++ b/tests/python/contrib/test_random.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te import numpy as np from tvm.contrib import random @@ -22,7 +23,7 @@ def test_randint(): m = 1024 n = 1024 A = random.randint(-127, 128, size=(m, n), dtype='int32') - s = tvm.create_schedule(A.op) + s = te.create_schedule(A.op) def verify(target="llvm"): if not tvm.runtime.enabled(target): @@ -46,7 +47,7 @@ def test_uniform(): m = 1024 n = 1024 A = random.uniform(0, 1, size=(m, n)) - s = tvm.create_schedule(A.op) + s = te.create_schedule(A.op) def verify(target="llvm"): if not tvm.runtime.enabled(target): @@ -70,7 +71,7 @@ def test_normal(): m = 1024 n = 1024 A = random.normal(3, 4, size=(m, n)) - s = tvm.create_schedule(A.op) + s = te.create_schedule(A.op) def verify(target="llvm"): if not tvm.runtime.enabled(target): diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py index 2b6d001d243d..af9d6ddf8dc9 100644 --- a/tests/python/contrib/test_rocblas.py +++ b/tests/python/contrib/test_rocblas.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np from tvm.contrib import rocblas @@ -22,10 +23,10 @@ def test_matmul_add(): n = 1024 l = 128 m = 235 - A = tvm.placeholder((n, l), name='A') - B = tvm.placeholder((l, m), name='B') + A = te.placeholder((n, l), name='A') + B = te.placeholder((l, m), name='B') C = rocblas.matmul(A, B) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) def verify(target="rocm"): if not tvm.runtime.enabled(target): diff --git a/tests/python/contrib/test_rpc_proxy.py b/tests/python/contrib/test_rpc_proxy.py index df0ee2bb8478..6cd865e10ed3 100644 --- a/tests/python/contrib/test_rpc_proxy.py +++ b/tests/python/contrib/test_rpc_proxy.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import logging import numpy as np import time diff --git a/tests/python/contrib/test_rpc_tracker.py b/tests/python/contrib/test_rpc_tracker.py index 11e7766f374b..2443c708c5c0 100644 --- a/tests/python/contrib/test_rpc_tracker.py +++ b/tests/python/contrib/test_rpc_tracker.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import logging import numpy as np import time diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index 87cdac01ce3a..9297a32871fa 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -15,18 +15,19 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te import numpy as np def test_sort(): n = 2 l = 5 m = 3 - data = tvm.placeholder((n, l, m), name='data') - sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32") + data = te.placeholder((n, l, m), name='data') + sort_num = te.placeholder((n, m), name="sort_num", dtype="int32") axis = 1 is_ascend = False - out = tvm.extern(data.shape, [data, sort_num], - lambda ins, outs: tvm.call_packed( + out = te.extern(data.shape, [data, sort_num], + lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.sort.argsort_nms", ins[0], ins[1], outs[0], axis, is_ascend), dtype='int32', name="sort_tensor") @@ -38,7 +39,7 @@ def test_sort(): ctx = tvm.cpu(0) target = "llvm" - s = tvm.create_schedule(out.op) + s = te.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) a = tvm.nd.array(np.array(input).astype(data.dtype), ctx) b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx) @@ -51,17 +52,17 @@ def test_sort_np(): axis = 4 reduced_shape = (1, 2, 3, 4, 6) is_ascend = True - data = tvm.placeholder(dshape, name='data') - sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32") - out = tvm.extern(data.shape, [data, sort_num], - lambda ins, outs: tvm.call_packed( + data = te.placeholder(dshape, name='data') + sort_num = te.placeholder(reduced_shape, name="sort_num", dtype="int32") + out = te.extern(data.shape, [data, sort_num], + lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.sort.argsort_nms", ins[0], ins[1], outs[0], axis, is_ascend), dtype='int32', name="sort_tensor") ctx = tvm.cpu(0) target = "llvm" - s = tvm.create_schedule(out.op) + s = te.create_schedule(out.op) f = tvm.build(s, [data, sort_num, out], target) np_data = np.random.uniform(size=dshape) diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py index 7cc4a00cf16c..5e0ca5cab104 100644 --- a/tests/python/contrib/test_sparse.py +++ b/tests/python/contrib/test_sparse.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import tvm.contrib.sparse as tvmsp import tvm.runtime.ndarray as _nd import numpy as np @@ -25,18 +26,18 @@ def test_static_tensor(): stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) - m = tvm.size_var('m') - n = tvm.size_var('n') + m = te.size_var('m') + n = te.size_var('n') A = tvmsp.placeholder(shape=(m, n), name='A', dtype=dtype) assert(A.stype == 'csr') n = 3 a = np.maximum(np.random.uniform(size=(n,n)).astype(dtype)-.6, 0.) 
a = tvmsp.array(a, ctx) - A.data = tvm.placeholder(a.data.shape, dtype, name='A_data') - Ab = tvm.decl_buffer(a.data.shape, dtype, name='A_data') + A.data = te.placeholder(a.data.shape, dtype, name='A_data') + Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name='A_data') binds = {A.data: Ab} - C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') - s = tvm.create_schedule(C.op) + C = te.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') + s = te.create_schedule(C.op) f = tvm.build(s, [A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((n,n), dtype), ctx) c.data = tvm.nd.empty(a.data.shape, dtype) @@ -50,18 +51,18 @@ def test_dynamic_tensor(): stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) - nr, nc, n = tvm.size_var('nr'), tvm.size_var('nc'), tvm.size_var('n') + nr, nc, n = te.size_var('nr'), te.size_var('nc'), te.size_var('n') A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype) assert(A.stype == 'csr') - C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') - s = tvm.create_schedule(C.op) + C = te.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') + s = te.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.) a = tvmsp.array(a, ctx) assert a.data.dtype == a.dtype Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr']) - Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data') - Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') + Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name='A_data') + Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) @@ -76,11 +77,11 @@ def test_sparse_array_tuple(): stype = 'csr' target = 'llvm' ctx = tvm.context(target, 0) - nr, nc, n = tvm.size_var('nr'), tvm.size_var('nc'), tvm.size_var('n') + nr, nc, n = te.size_var('nr'), te.size_var('nc'), te.size_var('n') A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype) assert(A.stype == 'csr') - C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') - s = tvm.create_schedule(C.op) + C = te.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter') + s = te.create_schedule(C.op) _nr, _nc = 3, 5 a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.) # convert to sparse array tuple @@ -98,8 +99,8 @@ def test_sparse_array_tuple(): a = tvmsp.array(a_init, shape=source_array.shape, ctx=ctx) assert a.data.dtype == a.dtype Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr']) - Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data') - Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') + Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name='A_data') + Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name='A_indices') binds = {A.data: Ab.data, A.indices: Ab.indices} f = tvm.build(s, [nr, A.data, C], target, binds=binds) c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx) diff --git a/tests/python/contrib/test_tedd.py b/tests/python/contrib/test_tedd.py index d4d3ce464d44..6e5f3a40fbcb 100644 --- a/tests/python/contrib/test_tedd.py +++ b/tests/python/contrib/test_tedd.py @@ -14,7 +14,7 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -import tvm +from tvm import te import numpy as np import re import topi @@ -31,10 +31,10 @@ def checkdepdency(): return not {'graphviz', 'ipython'} - {pkg.key for pkg in pkg_resources.working_set} def test_dfg(): - A = tvm.placeholder((1024, 4096), dtype='float32', name='A') + A = te.placeholder((1024, 4096), dtype='float32', name='A') B = topi.nn.softmax(A) # confirm lower works - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) def verify(): from tvm.contrib import tedd @@ -49,7 +49,7 @@ def verify(): findany(r"Stage_2:O_0 -> Tensor_2_0", str) findany(r"Tensor_2_0 -> Stage_3:I_0", str) findany(r"Stage_3:O_0 -> Tensor_3_0", str) - findany(r"Tensor_2_0 -> Stage_4:I_0", str) + findany(r"Tensor_2_0 -> Stage_4:I_0", str) findany(r"Tensor_3_0 -> Stage_4:I_1", str) findany(r"Stage_4:O_0 -> Tensor_4_0", str) if checkdepdency(): @@ -57,13 +57,13 @@ def verify(): def test_itervar_relationship_graph(): - n = tvm.var("n") - m = tvm.var("m") - A = tvm.placeholder((n, m), name='A') - k = tvm.reduce_axis((0, m), "k") - B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k], axis=k), name="B") + n = te.var("n") + m = te.var("m") + A = te.placeholder((n, m), name='A') + k = te.reduce_axis((0, m), "k") + B = te.compute((n, ), lambda i: te.sum(A[i, k], axis=k), name="B") - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s[B].split(B.op.reduce_axis[0], factor=16) def verify(): @@ -89,18 +89,18 @@ def verify(): def test_schedule_tree(): - block_x = tvm.thread_axis('blockIdx.x') - thread_x = tvm.thread_axis('threadIdx.x') - n = tvm.var("n") - m = tvm.var("m") - l = tvm.var("l") - A = tvm.placeholder((n, m, l), name='A') - B = tvm.compute((n, m, l), lambda bi, bj, bk: A[bi, bj, bk] + 1, name='B') - r = tvm.reduce_axis((0, m), "r") - C = tvm.compute((n, m,), - lambda ci, cj: tvm.sum(B[ci, cj, r], axis=r), - name="C") - s = tvm.create_schedule(C.op) + block_x = te.thread_axis('blockIdx.x') + thread_x = te.thread_axis('threadIdx.x') + n = te.var("n") + m = te.var("m") + l = te.var("l") + A = te.placeholder((n, m, l), name='A') + B = te.compute((n, m, l), lambda bi, bj, bk: A[bi, bj, bk] + 1, name='B') + r = te.reduce_axis((0, m), "r") + C = te.compute((n, m,), + lambda ci, cj: te.sum(B[ci, cj, r], axis=r), + name="C") + s = te.create_schedule(C.op) s.cache_read(A, 'shared', [B]) s[B].vectorize(B.op.axis[-1]) s[C].reorder(C.op.reduce_axis[0], C.op.axis[0]) @@ -115,7 +115,7 @@ def verify(): str = tedd.viz_schedule_tree(s, False, '', True) findany(r"digraph \"Schedule Tree\"", str) findany(r"subgraph cluster_legend", str) - # Check the A_shared stage, including memory scope, itervars, + # Check the A_shared stage, including memory scope, itervars, # and compute findany(r"Stage_1.*A\.shared
Scope: shared.+>0.+>" \ r"ax0\(kDataPar\).+>1.+ax1\(kDataPar\).+>2.+>ax2\(kDataPar\).+>" \ @@ -134,4 +134,4 @@ def verify(): if __name__ == "__main__": test_dfg() test_itervar_relationship_graph() - test_schedule_tree() \ No newline at end of file + test_schedule_tree() diff --git a/tests/python/contrib/test_tflite_runtime.py b/tests/python/contrib/test_tflite_runtime.py index 9d396be85822..8c883b031a89 100644 --- a/tests/python/contrib/test_tflite_runtime.py +++ b/tests/python/contrib/test_tflite_runtime.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np from tvm import rpc from tvm.contrib import util, tflite_runtime @@ -28,7 +29,7 @@ def create_tflite_model(): root = tf.Module() root.const = tf.constant([1., 2.], tf.float32) root.f = tf.function(lambda x: root.const * x) - + input_signature = tf.TensorSpec(shape=[2, ], dtype=tf.float32) concrete_func = root.f.get_concrete_function(input_signature) converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) @@ -48,13 +49,13 @@ def check_local(): interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() - + input_shape = input_details[0]['shape'] tflite_input = np.array(np.random.random_sample(input_shape), dtype=np.float32) interpreter.set_tensor(input_details[0]['index'], tflite_input) interpreter.invoke() tflite_output = interpreter.get_tensor(output_details[0]['index']) - + # inference via tvm tflite runtime with open(tflite_model_path, 'rb') as model_fin: runtime = tflite_runtime.create(model_fin.read(), tvm.cpu(0)) @@ -76,7 +77,7 @@ def check_remote(): interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() - + input_shape = input_details[0]['shape'] tflite_input = np.array(np.random.random_sample(input_shape), dtype=np.float32) interpreter.set_tensor(input_details[0]['index'], tflite_input) diff --git a/tests/python/frontend/caffe2/test_forward.py b/tests/python/frontend/caffe2/test_forward.py index 92258bbc284e..f05287216ec9 100644 --- a/tests/python/frontend/caffe2/test_forward.py +++ b/tests/python/frontend/caffe2/test_forward.py @@ -16,6 +16,7 @@ # under the License. 
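For reference, the schedule that the tedd test above visualizes can be reproduced with a few primitives; a host-only sketch:

.. code-block:: python

    from tvm import te

    n = te.var('n')
    A = te.placeholder((n, 16), name='A')
    B = te.compute((n, 16), lambda i, j: A[i, j] + 1.0, name='B')
    s = te.create_schedule(B.op)
    AA = s.cache_read(A, 'shared', [B])  # the A.shared stage in the tree
    s[B].vectorize(B.op.axis[-1])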
import numpy as np import tvm +from tvm import te from tvm.contrib import graph_runtime from tvm.relay.testing.config import ctx_list from tvm import relay diff --git a/tests/python/frontend/coreml/test_forward.py b/tests/python/frontend/coreml/test_forward.py index b4ad300c3403..3a156385d510 100644 --- a/tests/python/frontend/coreml/test_forward.py +++ b/tests/python/frontend/coreml/test_forward.py @@ -20,6 +20,7 @@ from coremltools.models import datatypes import tvm +from tvm import te from tvm.contrib import graph_runtime import topi import topi.testing diff --git a/tests/python/frontend/darknet/test_forward.py b/tests/python/frontend/darknet/test_forward.py index 22dd08ab52ea..fcaeaec79cb6 100644 --- a/tests/python/frontend/darknet/test_forward.py +++ b/tests/python/frontend/darknet/test_forward.py @@ -23,6 +23,7 @@ """ import numpy as np import tvm +from tvm import te from tvm.contrib import graph_runtime from tvm.contrib.download import download_testdata download_testdata.__test__ = False diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index f7dcb29b37aa..db0c2c65e04f 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import relay from tvm.contrib import graph_runtime from tvm.relay.testing.config import ctx_list diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 504f70031e24..f676295b324d 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -18,6 +18,7 @@ import operator import tvm +from tvm import te from tvm.contrib import graph_runtime from tvm.relay.testing.config import ctx_list from tvm import relay diff --git a/tests/python/frontend/mxnet/test_graph.py b/tests/python/frontend/mxnet/test_graph.py index 6e870000a76b..0008799caebb 100644 --- a/tests/python/frontend/mxnet/test_graph.py +++ b/tests/python/frontend/mxnet/test_graph.py @@ -17,6 +17,7 @@ import mxnet as mx import tvm +from tvm import te from tvm import relay from tvm.relay import transform import model_zoo diff --git a/tests/python/frontend/mxnet/test_qnn_ops_utils.py b/tests/python/frontend/mxnet/test_qnn_ops_utils.py index 4ee5f2e3c3c3..32042562b209 100644 --- a/tests/python/frontend/mxnet/test_qnn_ops_utils.py +++ b/tests/python/frontend/mxnet/test_qnn_ops_utils.py @@ -16,6 +16,7 @@ # under the License. 
import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 6243178dcb2b..20d7003e1353 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -23,6 +23,7 @@ import topi import topi.testing import tvm +from tvm import te from tvm import relay from tvm.contrib import graph_runtime from tvm.relay.testing.config import ctx_list diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 715ae7805cc3..ba1d7bbe67bc 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -25,6 +25,7 @@ import torch from torch.nn import Module import tvm +from tvm import te import torchvision from tvm import relay @@ -720,7 +721,7 @@ def test_vgg11(): def test_vgg11_bn(): torch.set_grad_enabled(False) verify_model("vgg11_bn") - + #TODO: Need to update schedule in tophub file after PR #4787 updated workloads def test_mobilenet_v2(): torch.set_grad_enabled(False) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 2340bd4e6318..9cd978e2e147 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -34,6 +34,7 @@ from tensorflow.python.ops import init_ops from distutils.version import LooseVersion import tvm +from tvm import te from tvm import relay import tvm.relay.testing.tf as tf_testing @@ -2717,7 +2718,7 @@ def test_forward_reduce_any(): in_data = tf.placeholder(tf.bool, (5, 7, 11), name="in_data") tf.reduce_any(in_data, name="any") compare_tf_with_tvm([np_data], ['in_data:0'], 'any:0') - + def test_forward_reduce_max(): def check_max(ishape, axis, keepdims, dtype): tf.reset_default_graph() diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index f4b7ee0cd8b1..4a16325e3e40 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -24,6 +24,7 @@ from functools import partial import numpy as np import tvm +from tvm import te from tvm import relay import tensorflow as tf from tensorflow.python.framework import constant_op diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index f95787dd94a4..c66e596ef50c 100644 --- a/tests/python/integration/test_dot.py +++ b/tests/python/integration/test_dot.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te import numpy as np def lower(s, args, name="mydot"): @@ -22,18 +23,18 @@ def lower(s, args, name="mydot"): arg_list = [] for x in args: - assert isinstance(x, tvm.tensor.Tensor) - buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.op.name) + assert isinstance(x, te.tensor.Tensor) + buf = tvm.tir.decl_buffer(x.shape, dtype=x.dtype, name=x.op.name) binds[x] = buf arg_list.append(buf) s = s.normalize() - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 16) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - fapi = tvm.ir_pass.MakeAPI(stmt, name, arg_list, 0, True) - fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, binds, 16) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + fapi = tvm.tir.ir_pass.MakeAPI(stmt, name, arg_list, 0, True) + fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi) return fapi @@ -43,12 +44,12 @@ def mybuild(fapi, target="llvm"): def test_dot(): nn = 12 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - k = tvm.reduce_axis((0, n), 'k') - C = tvm.compute((1,), lambda _: tvm.sum(A[k] * B[k], axis=k), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + k = te.reduce_axis((0, n), 'k') + C = te.compute((1,), lambda _: te.sum(A[k] * B[k], axis=k), name='C') + s = te.create_schedule(C.op) fapi = lower(s, [A, B, C]) def verify(target): diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py index ecfc83ca010d..a8f2db19a9b0 100644 --- a/tests/python/integration/test_ewise.py +++ b/tests/python/integration/test_ewise.py @@ -15,21 +15,22 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm.contrib import nvcc import numpy as np import time def test_exp(): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name='B') + s = te.create_schedule(B.op) # create iter var and assign them tags. num_thread = 8 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(tx, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): @@ -57,11 +58,11 @@ def check_device(device, host="stackvm"): def test_fmod(): # graph def run(dtype): - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A', dtype=dtype) - B = tvm.placeholder((n,), name='B', dtype=dtype) - C = tvm.compute(A.shape, lambda *i: tvm.fmod(A(*i), B(*i)), name='C') - s = tvm.create_schedule(C.op) + n = te.size_var('n') + A = te.placeholder((n,), name='A', dtype=dtype) + B = te.placeholder((n,), name='B', dtype=dtype) + C = te.compute(A.shape, lambda *i: te.fmod(A(*i), B(*i)), name='C') + s = te.create_schedule(C.op) # create iter var and assign them tags. 
num_thread = 8 bx, tx = s[C].split(C.op.axis[0], factor=num_thread) @@ -73,8 +74,8 @@ def check_device(device): return target = tvm.target.create(device) if "cpu" not in target.keys: - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) fmod = tvm.build(s, [A, B, C], device, name="myfmod") # launch the kernel. @@ -96,23 +97,23 @@ def check_device(device): def test_multiple_cache_write(): # graph - n = tvm.convert(1024) - A0 = tvm.placeholder((n,), name='A0', dtype = "float32") - A1 = tvm.placeholder((n,), name='A1', dtype = "float32") - B0, B1 = tvm.compute((n,), + n = tvm.runtime.convert(1024) + A0 = te.placeholder((n,), name='A0', dtype = "float32") + A1 = te.placeholder((n,), name='A1', dtype = "float32") + B0, B1 = te.compute((n,), lambda *i: (A0(*i) + A1(*i), A0(*i) * A1(*i)), name='B') - C = tvm.compute((n,), lambda *i: B0(*i) + B1(*i), + C = te.compute((n,), lambda *i: B0(*i) + B1(*i), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) # create iter var and assign them tags. num_thread = 8 B0_cache, B1_cache = s.cache_write([B0, B1], "local") bx, tx = s[C].split(C.op.axis[0], factor=num_thread) s[B0].compute_at(s[C], bx) s[B0_cache].compute_at(s[C], bx) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) # one line to build the function. def check_device(device, host="stackvm"): if not tvm.runtime.enabled(host): @@ -140,10 +141,10 @@ def check_device(device, host="stackvm"): def test_log_pow_llvm(): # graph - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: tvm.power(tvm.log(A(*i)), 2.0), name='B') - s = tvm.create_schedule(B.op) + n = te.size_var('n') + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: te.power(te.log(A(*i)), 2.0), name='B') + s = te.create_schedule(B.op) # create iter var and assign them tags. bx, tx = s[B].split(B.op.axis[0], factor=32) # one line to build the function. @@ -168,10 +169,10 @@ def test_log_pow_llvm(): def test_popcount(): def run(dtype): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A', dtype=dtype) - B = tvm.compute(A.shape, lambda *i: tvm.popcount(A(*i)), name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A', dtype=dtype) + B = te.compute(A.shape, lambda *i: tvm.tir.popcount(A(*i)), name='B') + s = te.create_schedule(B.op) # simple schedule num_thread = 8 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) @@ -183,8 +184,8 @@ def check_device(device): return target = tvm.target.create(device) if "cpu" not in target.keys: - s[B].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(tx, te.thread_axis("threadIdx.x")) func = tvm.build(s, [A, B], device) # launch the kernel. 
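# The exp/fmod/popcount hunks above all share the same split-then-bind
# pattern; sketched below end to end, guarded so it only runs when a CUDA
# device is present (the factor of 64 is an arbitrary illustrative choice):
import numpy as np
import tvm
from tvm import te

n = tvm.runtime.convert(1024)
A = te.placeholder((n,), name='A')
B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name='B')
s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis("blockIdx.x"))   # grid dimension
s[B].bind(tx, te.thread_axis("threadIdx.x"))  # block dimension
if tvm.context("cuda", 0).exist:
    f = tvm.build(s, [A, B], "cuda")
    ctx = tvm.gpu(0)
    a = tvm.nd.array(np.random.rand(1024).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(1024, dtype=B.dtype), ctx)
    f(a, b)
    np.testing.assert_allclose(b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)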
n = 1024 @@ -207,21 +208,21 @@ def check_device(device): def test_add(): def run(dtype): # graph - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A', dtype=dtype) - B = tvm.placeholder((n,), name='B', dtype=dtype) - bias = tvm.var("bias", dtype=dtype) - scale = tvm.var("scale", dtype=dtype) - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + n = te.size_var('n') + A = te.placeholder((n,), name='A', dtype=dtype) + B = te.placeholder((n,), name='B', dtype=dtype) + bias = te.var("bias", dtype=dtype) + scale = te.var("scale", dtype=dtype) + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') # schedule - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) # create iter var and assign them tags. num_thread = 16 bx, x = s[C].split(C.op.axis[0], factor=num_thread*4) tx, x = s[C].split(x, nparts=num_thread) _, x = s[C].split(x, factor=4) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) s[C].vectorize(x) # one line to build the function. @@ -259,16 +260,16 @@ def check_device(device): def try_warp_memory(): """skip this in default test because it require higher arch""" m = 128 - A = tvm.placeholder((m,), name='A') - B = tvm.compute((m,), lambda i: A[i] + 3, name='B') + A = te.placeholder((m,), name='A') + B = te.compute((m,), lambda i: A[i] + 3, name='B') warp_size = 32 - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) AA = s.cache_read(A, "warp", [B]) xo, xi = s[B].split(B.op.axis[0], warp_size * 2) xi0, xi1 = s[B].split(xi, factor=warp_size) - tx = tvm.thread_axis("threadIdx.x") + tx = te.thread_axis("threadIdx.x") s[B].bind(xi1, tx) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) s[AA].compute_at(s[B], xo) xo, xi = s[AA].split(s[AA].op.axis[0], warp_size) s[AA].bind(xi, tx) diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py index b2c783487074..7883a4cc4dce 100644 --- a/tests/python/integration/test_ewise_fpga.py +++ b/tests/python/integration/test_ewise_fpga.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np import os @@ -29,13 +30,13 @@ def tvm_callback_vhls_postproc(code): def test_exp(): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: tvm.exp(A(*i)), name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: te.exp(A(*i)), name='B') + s = te.create_schedule(B.op) # create iter var and assign them tags. px, x = s[B].split(B.op.axis[0], nparts=1) - s[B].bind(px, tvm.thread_axis("pipeline")) + s[B].bind(px, te.thread_axis("pipeline")) # one line to build the function. 
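# tvm.convert is spelled tvm.runtime.convert after this series; the helper
# still just wraps plain python values into TVM nodes:
import tvm

n = tvm.runtime.convert(1024)        # int  -> tvm.tir.IntImm
xs = tvm.runtime.convert([1, 2, 3])  # list -> TVM Array of expressions
assert n.value == 1024
assert len(xs) == 3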
def check_device(device, host="llvm"): @@ -64,17 +65,17 @@ def check_device(device, host="llvm"): def test_multi_kernel(): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - D = tvm.compute(A.shape, lambda *i: A(*i) + C(*i), name='D') - s = tvm.create_schedule(D.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + D = te.compute(A.shape, lambda *i: A(*i) + C(*i), name='D') + s = te.create_schedule(D.op) # create iter var and assign them tags. px, x = s[C].split(C.op.axis[0], nparts=1) - s[C].bind(px, tvm.thread_axis("pipeline")) + s[C].bind(px, te.thread_axis("pipeline")) px, x = s[D].split(D.op.axis[0], nparts=1) - s[D].bind(px, tvm.thread_axis("pipeline")) + s[D].bind(px, te.thread_axis("pipeline")) # one line to build the function. def check_device(device, host="llvm"): diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py index d61335f68924..12026da61394 100644 --- a/tests/python/integration/test_gemm.py +++ b/tests/python/integration/test_gemm.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np import time @@ -22,26 +23,26 @@ def test_gemm(): # graph nn = 1024 - n = tvm.convert(nn) + n = tvm.runtime.convert(nn) m = n l = n - A = tvm.placeholder((n, l), name='A') - B = tvm.placeholder((m, l), name='B') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute( + A = te.placeholder((n, l), name='A') + B = te.placeholder((m, l), name='B') + k = te.reduce_axis((0, l), name='k') + C = te.compute( (n, m), - lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), + lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") - block_y = tvm.thread_axis("blockIdx.y") - thread_y = tvm.thread_axis("threadIdx.y") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") + block_y = te.thread_axis("blockIdx.y") + thread_y = te.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index 62c029043084..82ade4478bea 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -15,25 +15,26 @@ # specific language governing permissions and limitations # under the License. 
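# The test_gemm hunk above keeps its staged schedule, only under the te
# namespace; a trimmed sketch that stops at the lowered IR rather than
# running on a GPU (the tile factor of 32 is illustrative):
import tvm
from tvm import te

n = tvm.runtime.convert(1024)
A = te.placeholder((n, n), name='A')
B = te.placeholder((n, n), name='B')
k = te.reduce_axis((0, n), name='k')
C = te.compute((n, n), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name='C')
s = te.create_schedule(C.op)
CC = s.cache_write(C, "local")                 # accumulate in a local stage
xo, xi = s[C].split(C.op.axis[0], factor=32)
s[CC].compute_at(s[C], xo)                     # keep the accumulator per tile
print(tvm.lower(s, [A, B, C], simple_mode=True))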
import tvm +from tvm import te import numpy as np def test_reduce_prims(): def test_prim(reducer, np_reducer): # graph - n = tvm.size_var('n') - m = tvm.size_var('m') - A = tvm.placeholder((n, m), name='A') - R = tvm.compute((n, ), lambda i: tvm.tir.Select((i > 1), 1, 0), name='R') - k = tvm.reduce_axis((0, m)) - B = tvm.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(R[i]==1)), name='B') + n = tvm.te.size_var('n') + m = tvm.te.size_var('m') + A = te.placeholder((n, m), name='A') + R = te.compute((n, ), lambda i: tvm.tir.Select((i > 1), 1, 0), name='R') + k = te.reduce_axis((0, m)) + B = te.compute((n,), lambda i: reducer(A[i, k], axis=k, where=(R[i]==1)), name='B') # schedule - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) # create iter var and assign them tags. num_thread = 1 xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) s[R].compute_inline() # one line to build the function. @@ -64,18 +65,18 @@ def check_device(device, host="llvm"): check_device("vulkan") check_device("cuda") check_device("opencl") - test_prim(tvm.sum, np.sum) - test_prim(tvm.min, np.amin) - test_prim(tvm.max, np.amax) + test_prim(te.sum, np.sum) + test_prim(tvm.te.min, np.amin) + test_prim(tvm.te.max, np.amax) def test_rfactor(): - n = tvm.convert(1027) - A = tvm.placeholder((n,), name='A') - k = tvm.reduce_axis((0, n)) - B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B') + n = tvm.runtime.convert(1027) + A = te.placeholder((n,), name='A') + k = te.reduce_axis((0, n)) + B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name='B') # schedule - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) kf, ki = s[B].split(k, nparts=4) BF = s.rfactor(B, kf) s[BF].parallel(BF.op.axis[0]) @@ -100,12 +101,12 @@ def check_target(target="llvm"): check_target() def test_rfactor_factor_axis(): - n = tvm.convert(1027) - A = tvm.placeholder((n,), name='A') - k = tvm.reduce_axis((0, n)) - B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k), name='B') + n = tvm.runtime.convert(1027) + A = te.placeholder((n,), name='A') + k = te.reduce_axis((0, n)) + B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name='B') # schedule - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) kf, ki = s[B].split(k, nparts=4) BF = s.rfactor(B, kf, 1) s[BF].parallel(BF.op.axis[0]) @@ -133,21 +134,21 @@ def check_target(target="llvm"): def test_rfactor_threads(): nn = 1027 mm = 10 - n = tvm.convert(nn) - m = tvm.convert(mm) - A = tvm.placeholder((m, n), name='A') - k = tvm.reduce_axis((0, n)) + n = tvm.runtime.convert(nn) + m = tvm.runtime.convert(mm) + A = te.placeholder((m, n), name='A') + k = te.reduce_axis((0, n)) nthread = 16 - B = tvm.compute((m,), lambda i: tvm.sum(A[i, k], axis=k, where=(i>1)), name='B') + B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k, where=(i>1)), name='B') # schedule - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) ko, kf = s[B].split(k, factor=nthread) BF = s.rfactor(B, kf) bx, ty = s[B].split(s[B].op.axis[0], factor=nthread) - s[B].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B].bind(ty, tvm.thread_axis("threadIdx.y")) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(ty, te.thread_axis("threadIdx.y")) tx = s[B].op.reduce_axis[0] - thread_x = tvm.thread_axis("threadIdx.x") + thread_x = te.thread_axis("threadIdx.x") s[B].bind(tx, thread_x) 
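# s.rfactor as exercised by test_rfactor above, run end to end on the CPU
# (four partial sums, matching the nparts=4 split in the test):
import numpy as np
import tvm
from tvm import te

n = tvm.runtime.convert(1027)
A = te.placeholder((n,), name='A')
k = te.reduce_axis((0, n))
B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name='B')
s = te.create_schedule(B.op)
kf, ki = s[B].split(k, nparts=4)
BF = s.rfactor(B, kf)                 # materialize 4 partial reductions
s[BF].parallel(BF.op.axis[0])         # the partials are independent
f = tvm.build(s, [A, B], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.rand(1027).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), [np.sum(a.asnumpy())], rtol=1e-4)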
s[BF].compute_at(s[B], tx) s[B].set_store_predicate(thread_x.var.equal(0)) @@ -183,23 +184,23 @@ def check_target(device, host="stackvm"): def test_rfactor_elemwise_threads(): n = 1025 m = 10 - A = tvm.placeholder((m, n), name='A') - k = tvm.reduce_axis((0, n)) + A = te.placeholder((m, n), name='A') + k = te.reduce_axis((0, n)) nthread = 16 - B = tvm.compute((m,), lambda i: tvm.sum(A[i, k], axis=k), name='B') - BB = tvm.compute((m,), lambda i: B[i] + 1, name='BB') - C = tvm.compute((m,), lambda i: BB[i] + 1, name='C') + B = te.compute((m,), lambda i: te.sum(A[i, k], axis=k), name='B') + BB = te.compute((m,), lambda i: B[i] + 1, name='BB') + C = te.compute((m,), lambda i: BB[i] + 1, name='C') # schedule - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) s[BB].compute_inline() bx, ty = s[C].split(s[C].op.axis[0], factor=nthread) ko, kf = s[B].split(k, factor=nthread) BF = s.rfactor(B, kf) s[B].compute_at(s[C], ty) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(ty, tvm.thread_axis("threadIdx.y")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) tx = s[B].op.reduce_axis[0] - thread_x = tvm.thread_axis("threadIdx.x") + thread_x = te.thread_axis("threadIdx.x") s[B].bind(tx, thread_x) s[BF].compute_at(s[B], tx) # Since thread_x is shared across reductions @@ -237,18 +238,18 @@ def fcombine(x, y): return lhs, rhs def fidentity(t0, t1): - return tvm.const(-1, t0), tvm.min_value(t1) + return tvm.tir.const(-1, t0), tvm.te.min_value(t1) - argmax = tvm.comm_reducer(fcombine, + argmax = te.comm_reducer(fcombine, fidentity, name='argmax') - m = tvm.size_var('m') - n = tvm.size_var('n') - idx = tvm.placeholder((m, n), name='idx', dtype='int32') - val = tvm.placeholder((m, n), name='val', dtype='float32') - k = tvm.reduce_axis((0, n), 'k') - T0, T1 = tvm.compute((m,), lambda i: argmax((idx[i,k], val[i,k]), axis=k), name='T') - s = tvm.create_schedule(T0.op) + m = te.size_var('m') + n = te.size_var('n') + idx = te.placeholder((m, n), name='idx', dtype='int32') + val = te.placeholder((m, n), name='val', dtype='float32') + k = te.reduce_axis((0, n), 'k') + T0, T1 = te.compute((m,), lambda i: argmax((idx[i,k], val[i,k]), axis=k), name='T') + s = te.create_schedule(T0.op) def check_target(): device = 'cpu' @@ -284,31 +285,31 @@ def fcombine(x, y): return lhs, rhs def fidentity(t0, t1): - return tvm.const(-1, t0), tvm.min_value(t1) + return tvm.tir.const(-1, t0), tvm.te.min_value(t1) - argmax = tvm.comm_reducer(fcombine, + argmax = te.comm_reducer(fcombine, fidentity, name='argmax') nn = 1027 mm = 10 - n = tvm.convert(nn) - m = tvm.convert(mm) - A0 = tvm.placeholder((m, n), name='A0', dtype='int32') - A1 = tvm.placeholder((m, n), name='A1', dtype='float32') - k = tvm.reduce_axis((0, n)) - B0, B1 = tvm.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name='B') + n = tvm.runtime.convert(nn) + m = tvm.runtime.convert(mm) + A0 = te.placeholder((m, n), name='A0', dtype='int32') + A1 = te.placeholder((m, n), name='A1', dtype='float32') + k = te.reduce_axis((0, n)) + B0, B1 = te.compute((m,), lambda i: argmax((A0[i, k], A1[i, k]), axis=k), name='B') # schedule - s = tvm.create_schedule(B0.op) + s = te.create_schedule(B0.op) nthread = 16 ko, kf = s[B0].split(k, factor=nthread) BF0, BF1 = s.rfactor(B0, kf) bx, ty = s[B0].split(s[B0].op.axis[0], factor=nthread) - s[B0].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B0].bind(ty, tvm.thread_axis("threadIdx.y")) + s[B0].bind(bx, te.thread_axis("blockIdx.x")) + s[B0].bind(ty, 
te.thread_axis("threadIdx.y")) tx = s[B0].op.reduce_axis[0] - thread_x = tvm.thread_axis("threadIdx.x") + thread_x = te.thread_axis("threadIdx.x") s[B0].bind(tx, thread_x) s[BF0.op].compute_at(s[B0], tx) s[B0].set_store_predicate(thread_x.var.equal(0)) diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py index 366ed3d4f1a5..99553c3579d5 100644 --- a/tests/python/integration/test_scan.py +++ b/tests/python/integration/test_scan.py @@ -15,24 +15,25 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np def test_scan(): - m = tvm.size_var("m") - n = tvm.size_var("n") - X = tvm.placeholder((m, n), name="X") - s_state = tvm.placeholder((m, n)) - s_init = tvm.compute((1, n), lambda _, i: X[0, i]) - s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) - scan = tvm.scan(s_init, s_update, s_state) + m = te.size_var("m") + n = te.size_var("n") + X = te.placeholder((m, n), name="X") + s_state = te.placeholder((m, n)) + s_init = te.compute((1, n), lambda _, i: X[0, i]) + s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) + scan = tvm.te.scan(s_init, s_update, s_state) # test scan + compute case - res = tvm.compute((m, n), lambda i, j: scan[i, j]) + res = te.compute((m, n), lambda i, j: scan[i, j]) # schedule - s = tvm.create_schedule(res.op) + s = te.create_schedule(res.op) num_thread = 256 - block_x = tvm.thread_axis(None, "blockIdx.x") - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") + block_x = te.thread_axis(None, "blockIdx.x") + thread_x = te.thread_axis((0, num_thread), "threadIdx.x") xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread) s[s_init].bind(xo, block_x) s[s_init].bind(xi, thread_x) diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 99f8b47cce07..60a372c2be39 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -21,6 +21,7 @@ import time import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.tuner import RandomTuner @@ -30,20 +31,20 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW): """An example template for testing""" assert N == 1, "Only consider batch_size = 1 in this template" - data = tvm.placeholder((N, CI, H, W), name='data') - kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel') + data = te.placeholder((N, CI, H, W), name='data') + kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - rc = tvm.reduce_axis((0, CI), name='rc') - ry = tvm.reduce_axis((0, KH), name='ry') - rx = tvm.reduce_axis((0, KW), name='rx') + rc = te.reduce_axis((0, CI), name='rc') + ry = te.reduce_axis((0, KH), name='ry') + rx = te.reduce_axis((0, KW), name='rx') - conv = tvm.compute( + conv = te.compute( (N, CO, H - KH + 1, W - KW + 1), - lambda nn, ff, yy, xx: tvm.sum( + lambda nn, ff, yy, xx: te.sum( data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx]), tag="conv2d_nchw") - s = tvm.create_schedule([conv.op]) + s = te.create_schedule([conv.op]) output = conv OL = s.cache_write(conv, 'local') @@ -65,15 +66,15 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope = n # this is the scope to attach global config inside this kernel - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, 
tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(tf, tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) @@ -100,9 +101,9 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW): tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # tune unroll cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) diff --git a/tests/python/integration/test_winograd_nnpack.py b/tests/python/integration/test_winograd_nnpack.py index 5e45c612707a..7dad2ca586d7 100644 --- a/tests/python/integration/test_winograd_nnpack.py +++ b/tests/python/integration/test_winograd_nnpack.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity from tvm.contrib import nnpack @@ -32,9 +33,9 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - bias = tvm.placeholder((num_filter, 1, 1), name='bias') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/tests/python/nightly/quantization/test_quantization_accuracy.py b/tests/python/nightly/quantization/test_quantization_accuracy.py index f047952f3e6b..4818cc651b94 100644 --- a/tests/python/nightly/quantization/test_quantization_accuracy.py +++ b/tests/python/nightly/quantization/test_quantization_accuracy.py @@ -16,6 +16,7 @@ # under the License. 
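# te.comm_reducer from the argmax hunks above; a CPU-only sketch that stops
# at build, with symbolic m/n shapes as in the test:
import tvm
from tvm import te

def fcombine(x, y):
    lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])   # carry the winning index
    rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])   # carry the winning value
    return lhs, rhs

def fidentity(t0, t1):
    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)

argmax = te.comm_reducer(fcombine, fidentity, name='argmax')

m = te.size_var('m')
n = te.size_var('n')
idx = te.placeholder((m, n), name='idx', dtype='int32')
val = te.placeholder((m, n), name='val', dtype='float32')
k = te.reduce_axis((0, n), 'k')
T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name='T')
s = te.create_schedule(T0.op)
f = tvm.build(s, [idx, val, T0, T1], "llvm")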
from collections import namedtuple import tvm +from tvm import te from tvm import relay from tvm.relay import quantize as qtz import mxnet as mx diff --git a/tests/python/relay/benchmarking/benchmark_vm.py b/tests/python/relay/benchmarking/benchmark_vm.py index 55d788756b5c..1e9030c5d8e6 100644 --- a/tests/python/relay/benchmarking/benchmark_vm.py +++ b/tests/python/relay/benchmarking/benchmark_vm.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm.contrib import graph_runtime from tvm import relay from tvm.runtime import container @@ -73,7 +74,7 @@ def get_vm_output(mod, data, params, target, ctx, dtype='float32', prof_res = np.array(ftimer("main", data).results) * 1000 print("Mean vm inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res))) - + return result.asnumpy().astype(dtype) # random input diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 8f631f8fd047..deeb7330f9da 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.backend.interpreter import ConstructorValue from tvm.relay import create_executor diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index 24176e4c41dd..aa81e3113b7f 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay.loops import while_loop from tvm.relay.testing import run_infer_type as infer_type diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index 4e4122a28cf0..eb018fed96e7 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -16,6 +16,7 @@ # under the License. 
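# tvm.te.scan from the test_scan hunk earlier in this patch, reduced to a
# runnable cumulative sum on the CPU (the 4x5 shape is arbitrary):
import numpy as np
import tvm
from tvm import te

m = te.size_var("m")
n = te.size_var("n")
X = te.placeholder((m, n), name="X")
s_state = te.placeholder((m, n))
s_init = te.compute((1, n), lambda _, i: X[0, i])          # row 0 seeds the scan
s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
res = tvm.te.scan(s_init, s_update, s_state)
s = te.create_schedule(res.op)
f = tvm.build(s, [X, res], "llvm")
ctx = tvm.cpu(0)
x = np.random.rand(4, 5).astype("float32")
out = tvm.nd.array(np.zeros((4, 5), dtype="float32"), ctx)
f(tvm.nd.array(x, ctx), out)
np.testing.assert_allclose(out.asnumpy(), np.cumsum(x, axis=0), rtol=1e-5)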
import numpy as np import tvm +from tvm import te import tvm.testing from tvm import relay from tvm import autotvm @@ -69,7 +70,7 @@ def _tmp_strategy(attrs, inputs, out_type, target): return strategy def _create_record(task_name, dshape, wshape, target, cost): - args = [tvm.placeholder(dshape), tvm.placeholder(wshape), (1, 1), (1, 1, 1, 1), + args = [te.placeholder(dshape), te.placeholder(wshape), (1, 1), (1, 1, 1, 1), (1, 1), 'float32'] task = autotvm.task.create(task_name, args, target) cfg = autotvm.ConfigEntity(0, None, {}, []) @@ -89,7 +90,7 @@ def _get_impls(dshape, wshape): return relay.backend.compile_engine.get_valid_implementations( relay.op.get("nn.conv2d"), out.attrs, - [tvm.placeholder(dshape), tvm.placeholder(wshape)], + [te.placeholder(dshape), te.placeholder(wshape)], out.checked_type, target) @@ -110,7 +111,7 @@ def _select_impl(dshape, wshape, use_autotvm=False): return relay.backend.compile_engine.select_implementation( relay.op.get("nn.conv2d"), out.attrs, - [tvm.placeholder(dshape), tvm.placeholder(wshape)], + [te.placeholder(dshape), te.placeholder(wshape)], out.checked_type, target, use_autotvm) diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py index d5d29b645cfa..71428a6dbefd 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_runtime.py @@ -17,6 +17,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.contrib import graph_runtime from tvm.relay.scope_builder import ScopeBuilder diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index 9b548f12f65b..360b6bd20416 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te import tvm.testing from tvm import nd from tvm import relay diff --git a/tests/python/relay/test_change_batch.py b/tests/python/relay/test_change_batch.py index e822bbb05910..e53887b1c408 100644 --- a/tests/python/relay/test_change_batch.py +++ b/tests/python/relay/test_change_batch.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.testing import resnet from tvm.relay import transform diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py index 674e214df058..171b6b0b77b0 100644 --- a/tests/python/relay/test_cpp_build_module.py +++ b/tests/python/relay/test_cpp_build_module.py @@ -17,6 +17,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.contrib.nvcc import have_fp16 diff --git a/tests/python/relay/test_error_reporting.py b/tests/python/relay/test_error_reporting.py index aef93ad9f4dc..d69744824faa 100644 --- a/tests/python/relay/test_error_reporting.py +++ b/tests/python/relay/test_error_reporting.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay def check_type_err(expr, msg): diff --git a/tests/python/relay/test_expr_functor.py b/tests/python/relay/test_expr_functor.py index 5c923655d7b7..ea7f8f6b411a 100644 --- a/tests/python/relay/test_expr_functor.py +++ b/tests/python/relay/test_expr_functor.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te from tvm import relay from tvm.relay import ExprFunctor, ExprMutator, ExprVisitor diff --git a/tests/python/relay/test_external_codegen.py b/tests/python/relay/test_external_codegen.py index b086df07a835..e3789988d2f3 100644 --- a/tests/python/relay/test_external_codegen.py +++ b/tests/python/relay/test_external_codegen.py @@ -20,6 +20,7 @@ import numpy as np import tvm +from tvm import te import tvm.relay.testing import tvm.relay.transform from tvm import relay diff --git a/tests/python/relay/test_external_runtime.py b/tests/python/relay/test_external_runtime.py index 713aca918883..0942cbb941ea 100644 --- a/tests/python/relay/test_external_runtime.py +++ b/tests/python/relay/test_external_runtime.py @@ -21,6 +21,7 @@ import numpy as np import tvm +from tvm import te import tvm.runtime._ffi_api from tvm import relay from tvm.contrib import util diff --git a/tests/python/relay/test_feature.py b/tests/python/relay/test_feature.py index 9066e85cf6da..3ef53d3b88b1 100644 --- a/tests/python/relay/test_feature.py +++ b/tests/python/relay/test_feature.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import detect_feature from tvm.relay.transform import gradient diff --git a/tests/python/relay/test_ir_bind.py b/tests/python/relay/test_ir_bind.py index df280e2fa248..45474b6cc426 100644 --- a/tests/python/relay/test_ir_bind.py +++ b/tests/python/relay/test_ir_bind.py @@ -16,6 +16,7 @@ # under the License. """ test bind function.""" import tvm +from tvm import te from tvm import relay diff --git a/tests/python/relay/test_ir_module.py b/tests/python/relay/test_ir_module.py index 939672d42152..bab82472263a 100644 --- a/tests/python/relay/test_ir_module.py +++ b/tests/python/relay/test_ir_module.py @@ -16,6 +16,7 @@ # under the License. 
"""Tests for module functionality.""" import tvm +from tvm import te from tvm import relay from tvm.relay.prelude import Prelude from tvm.relay.testing import add_nat_definitions diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py index b7d7eb9f389c..cc663a1614fe 100644 --- a/tests/python/relay/test_ir_nodes.py +++ b/tests/python/relay/test_ir_nodes.py @@ -17,6 +17,7 @@ """ test ir""" import pytest import tvm +from tvm import te from tvm import relay from tvm.tir.expr import * from tvm.relay import op @@ -57,7 +58,7 @@ def test_span(): # Types def test_tensor_type(): - shape = tvm.convert([1, 2, 3]) + shape = tvm.runtime.convert([1, 2, 3]) dtype = 'float32' tt = relay.TensorType(shape, dtype) assert tt.dtype == dtype @@ -76,9 +77,9 @@ def test_type_param(): def test_func_type(): - type_params = tvm.convert([]) - type_constraints = tvm.convert([]) # TODO: fill me in - arg_types = tvm.convert([]) + type_params = tvm.runtime.convert([]) + type_constraints = tvm.runtime.convert([]) # TODO: fill me in + arg_types = tvm.runtime.convert([]) ret_type = relay.TensorType((1, 2, 3), 'float32') tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints) assert tf.type_params == type_params @@ -93,9 +94,9 @@ def test_func_type(): def test_tuple_type(): tp = relay.TypeVar('tp', relay.TypeKind.Type) - tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([])) - tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32') - fields = tvm.convert([tp, tf, tt]) + tf = relay.FuncType(tvm.runtime.convert([]), None, tvm.runtime.convert([]), tvm.runtime.convert([])) + tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') + fields = tvm.runtime.convert([tp, tf, tt]) tup_ty = relay.TupleType(fields) assert tup_ty.fields == fields @@ -105,9 +106,9 @@ def test_tuple_type(): def test_type_relation(): tp = relay.TypeVar('tp', relay.TypeKind.Type) - tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([])) - tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32') - args = tvm.convert([tp, tf, tt]) + tf = relay.FuncType(tvm.runtime.convert([]), None, tvm.runtime.convert([]), tvm.runtime.convert([])) + tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') + args = tvm.runtime.convert([tp, tf, tt]) num_inputs = 2 func = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast") @@ -130,7 +131,7 @@ def test_constant(): def test_tuple(): - fields = tvm.convert([]) + fields = tvm.runtime.convert([]) tup = relay.Tuple(fields) assert tup.fields == fields assert tup.span == None @@ -163,10 +164,10 @@ def test_global_var(): def test_function(): param_names = ['a', 'b', 'c', 'd'] - params = tvm.convert([relay.Var(n) for n in param_names]) - ret_type = relay.TupleType(tvm.convert([])) - body = relay.Tuple(tvm.convert([])) - type_params = tvm.convert([]) + params = tvm.runtime.convert([relay.Var(n) for n in param_names]) + ret_type = relay.TupleType(tvm.runtime.convert([])) + body = relay.Tuple(tvm.runtime.convert([])) + type_params = tvm.runtime.convert([]) fn = relay.Function(params, body, ret_type, type_params) fn = fn.set_attribute("test_attribute", tvm.tir.StringImm("value")) assert fn.params == params @@ -180,10 +181,10 @@ def test_function(): @pytest.mark.skip(reason="AttrsEqualHandler doesn't handle Map so far.") def test_function_attrs(): param_names = ['a', 'b', 'c', 'd'] - params = tvm.convert([relay.var(n, shape=(5, 2)) for n in param_names]) - ret_type = relay.TupleType(tvm.convert([])) - body = 
relay.Tuple(tvm.convert([])) - type_params = tvm.convert([]) + params = tvm.runtime.convert([relay.var(n, shape=(5, 2)) for n in param_names]) + ret_type = relay.TupleType(tvm.runtime.convert([])) + body = relay.Tuple(tvm.runtime.convert([])) + type_params = tvm.runtime.convert([]) fn = relay.Function(params, body, ret_type, type_params) model_params = {} for param in params[:1]: @@ -210,7 +211,7 @@ def test_function_attrs(): def test_call(): op = relay.Var('f') arg_names = ['a', 'b', 'c', 'd'] - args = tvm.convert([relay.Var(n) for n in arg_names]) + args = tvm.runtime.convert([relay.Var(n) for n in arg_names]) call = relay.Call(op, args, None, None) assert call.op == op assert call.args == args diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index bcce9b4ba5dd..ba1f8d884adc 100644 --- a/tests/python/relay/test_ir_parser.py +++ b/tests/python/relay/test_ir_parser.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import graph_equal, assert_graph_equal from tvm.relay.analysis import alpha_equal, assert_alpha_equal diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index e2a0bdc205d6..3bdd803122f7 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay import tvm.relay.testing import numpy as np @@ -70,7 +71,7 @@ def test_env(): def test_meta_data(): - n, c, h, w = tvm.size_var("n"), 10, 224, 224 + n, c, h, w = te.size_var("n"), 10, 224, 224 x = relay.var("x", shape=(n, c, h, w)) w = relay.var("w") z = relay.nn.conv2d(x, w, diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py index fbbfbd23a6c2..db953d5762e3 100644 --- a/tests/python/relay/test_ir_well_formed.py +++ b/tests/python/relay/test_ir_well_formed.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import well_formed from tvm.relay.prelude import Prelude diff --git a/tests/python/relay/test_json_compact.py b/tests/python/relay/test_json_compact.py index 40b686a05c5e..631679140c1b 100644 --- a/tests/python/relay/test_json_compact.py +++ b/tests/python/relay/test_json_compact.py @@ -16,6 +16,7 @@ # under the License. 
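# The relay IR-node hunks above only re-spell tvm.convert; the resulting
# call sites look like this:
import tvm
from tvm import relay

shape = tvm.runtime.convert([1, 2, 3])        # list -> TVM Array
tt = relay.TensorType(shape, 'float32')
assert tt.dtype == 'float32'

empty = tvm.runtime.convert([])
tf = relay.FuncType(empty, tt, empty, empty)  # (args, ret, type params, constraints)
assert tf.ret_type == tt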
import tvm +from tvm import te from tvm import relay import json diff --git a/tests/python/relay/test_memory_alloc.py b/tests/python/relay/test_memory_alloc.py index 18b1500dfc3c..08fc39df9ad0 100644 --- a/tests/python/relay/test_memory_alloc.py +++ b/tests/python/relay/test_memory_alloc.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License import tvm +from tvm import te import numpy as np from tvm import relay from tvm.relay import memory_alloc diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index 3be62a3170fb..0eb1cec916b6 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay.testing import check_grad, ctx_list, run_infer_type from tvm.relay.transform import gradient diff --git a/tests/python/relay/test_op_grad_level2.py b/tests/python/relay/test_op_grad_level2.py index 57b1e2c676ac..2b5a1c29e0de 100644 --- a/tests/python/relay/test_op_grad_level2.py +++ b/tests/python/relay/test_op_grad_level2.py @@ -19,6 +19,7 @@ import topi import topi.testing import tvm +from tvm import te from tvm import relay from tvm.relay.testing import check_grad, ctx_list, run_infer_type from tvm.relay.transform import gradient @@ -92,8 +93,8 @@ def verify_global_avg_pool2d_grad(x_shape): data = np.random.rand(*x_shape).astype("float32") y_shape = topi.util.get_const_tuple(fwd_func.ret_type.shape) out_grad = np.ones(shape=y_shape) - ref_grad = topi.testing.pool_grad_nchw(data, out_grad, pool_size=(x_shape[2], x_shape[3]), - strides=(1, 1), padding=[0, 0, 0, 0], pool_type='avg', + ref_grad = topi.testing.pool_grad_nchw(data, out_grad, pool_size=(x_shape[2], x_shape[3]), + strides=(1, 1), padding=[0, 0, 0, 0], pool_type='avg', ceil_mode=False) for target, ctx in ctx_list(): diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index 430c3dde5505..d13687fbec72 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay.testing import check_grad, ctx_list, run_infer_type from tvm.relay.transform import gradient diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 194b09564288..0fa07499193a 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -17,6 +17,7 @@ import numpy as np import pytest import tvm +from tvm import te import scipy from tvm import relay from tvm.relay import transform @@ -86,7 +87,7 @@ def inst(vars, sh): def check_binary_op(opfunc, ref, dtype): # TODO(@jroesch): this piece of code improperly uses type variables. 
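# te.size_var supplies the symbolic batch/sequence dims used throughout the
# relay type-inference tests below; a minimal sketch (run_infer_type is the
# same tvm.relay.testing helper these tests import):
import tvm
from tvm import te, relay
from tvm.relay.testing import run_infer_type

n, t = te.size_var("n"), te.size_var("t")
x = relay.var("x", shape=(n, t, 100))
y = relay.expand_dims(x, axis=2)
yy = run_infer_type(y)
# symbolic dims flow through inference untouched
assert yy.checked_type == relay.TensorType((n, t, 1, 100), "float32")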
- n = tvm.var("n") + n = te.var("n") s1 = (5, n, 5) s2 = (n, 1) t1 = relay.TensorType(s1) @@ -173,7 +174,7 @@ def test_bias_add(): def test_expand_dims_infer_type(): for dtype in ['float16', 'float32']: - n, t, d = tvm.size_var("n"), tvm.size_var("t"), 100 + n, t, d = te.size_var("n"), te.size_var("t"), 100 x = relay.var("x", shape=(n, t, d), dtype=dtype) y = relay.expand_dims(x, axis=2) assert "axis=2" in y.astext() @@ -223,23 +224,23 @@ def test_log_softmax(): def test_concatenate(): for dtype in ['float16', 'float32']: - n, t, d = tvm.size_var("n"), tvm.size_var("t"), 100 + n, t, d = te.size_var("n"), te.size_var("t"), 100 x = relay.var("x", shape=(n, t, d)) y = relay.var("y", shape=(n, t, d)) z = relay.concatenate((x, y), axis=-1) assert "axis=" in z.astext() zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, t, 200)) - + x = relay.exp(x) z = relay.concatenate((x, y), axis=2) zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, t, 200)) - + z = relay.concatenate((x, y), axis=1) zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, t + t, 100)) - + # check shape mismatches (the following case is expected to raise tvm._ffi.base.TVMError. try: x = relay.var('p1', shape=(2, 5)) @@ -251,7 +252,7 @@ def test_concatenate(): pass else: assert False - + x = relay.var("x", shape=(10, 5), dtype=dtype) y = relay.var("y", shape=(10, 5), dtype=dtype) t = relay.var("z", shape=(), dtype=dtype) @@ -263,7 +264,7 @@ def test_concatenate(): y_data = np.random.rand(10, 5).astype(dtype) t_data = np.random.uniform(size=()).astype(dtype) ref_res = np.concatenate((x_data, y_data), axis=1) + t_data - + for target, ctx in ctx_list(): if dtype == 'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version): continue @@ -276,7 +277,7 @@ def test_concatenate(): def test_dropout(): for dtype in ['float16', 'float32']: - n, t, d = tvm.size_var("n"), tvm.size_var("t"), tvm.size_var("d") + n, t, d = te.size_var("n"), te.size_var("t"), te.size_var("d") input_ty = relay.TensorType((n, t, d), dtype) x = relay.var("x", input_ty) y = relay.nn.dropout(x, rate=0.75) @@ -297,7 +298,7 @@ def test_batch_norm(): center=False, scale=False) yy = run_infer_type(y.astuple()) assert "center=" in yy.astext() - assert yy.checked_type == relay.ty.TupleType(tvm.convert([ + assert yy.checked_type == relay.ty.TupleType(tvm.runtime.convert([ relay.TensorType((3, 2, 1), dtype), relay.TensorType((2,), dtype), relay.TensorType((2,), dtype) @@ -311,7 +312,7 @@ def test_batch_norm(): y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var, axis=0, center=False, scale=False) yy = run_infer_type(y.astuple()) - assert yy.checked_type == relay.ty.TupleType(tvm.convert([ + assert yy.checked_type == relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((3, 2, 1), dtype), relay.ty.TensorType((3,), dtype), relay.ty.TensorType((3,), dtype) @@ -326,7 +327,7 @@ def test_batch_norm(): y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var, axis=-1, center=False, scale=False) yy = run_infer_type(y.astuple()) - assert yy.checked_type == relay.ty.TupleType(tvm.convert([ + assert yy.checked_type == relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((1, 2, 3), dtype), relay.ty.TensorType((3,), dtype), relay.ty.TensorType((3,), dtype) @@ -348,7 +349,7 @@ def test_dense(): # Dense accuracy for float16 is poor if dtype == 'float16': return - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c , h, w = 
te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) w = relay.var("w", relay.TensorType((2, w), dtype)) y = relay.nn.dense(x, w, units=2) @@ -356,15 +357,15 @@ def test_dense(): yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, 2), dtype) - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), 2 + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) - wh, ww = tvm.size_var("wh"), tvm.size_var("ww") + wh, ww = te.size_var("wh"), te.size_var("ww") w = relay.var("w", relay.TensorType((ww, wh), dtype)) y = relay.nn.dense(x, w) yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, h, ww), dtype) - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), 2 + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 2 x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) w = relay.var("w", relay.IncompleteType()) y = relay.nn.dense(x, w, units=2) @@ -394,7 +395,7 @@ def test_dense_dtype(): data_dtype = 'uint8' weight_dtype = 'int8' out_dtype = 'uint8' - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), data_dtype)) w = relay.var("w", relay.TensorType((2, w), weight_dtype)) y = relay.nn.dense(x, w, units=2, out_dtype=out_dtype) @@ -406,7 +407,7 @@ def test_dense_dtype(): def test_bitserial_dense(): - m, k = tvm.size_var("m"), tvm.size_var("k") + m, k = te.size_var("m"), te.size_var("k") x = relay.var("x", relay.TensorType((m, k), "int16")) w = relay.var("w", relay.TensorType((k, 32), "int16")) y = relay.nn.bitserial_dense(x, w, units=32) diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index c3033e9181cb..1e4be742ff25 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -18,6 +18,7 @@ """ import numpy as np import tvm +from tvm import te import topi.testing from tvm import relay from tvm.relay import transform @@ -250,7 +251,7 @@ def verify_slice_like(data, slice_like, axes, output, dtype="float32"): tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) def test_slice_like(): - d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") + d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3)) verify_slice_like(data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3)) verify_slice_like(data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1,2), output=(d2, d2, d3)) @@ -304,7 +305,7 @@ def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) def test_batch_matmul(): - b, m, n, k = tvm.size_var("b"), tvm.size_var("m"), tvm.size_var("n"), tvm.size_var("k") + b, m, n, k = te.size_var("b"), te.size_var("m"), te.size_var("n"), te.size_var("k") x = relay.var("x", relay.TensorType((b, m, k), "float32")) y = relay.var("y", relay.TensorType((b, n, k), "float32")) z = relay.nn.batch_matmul(x, y) diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index d545d0c1635a..7a42fc329e04 100644 --- a/tests/python/relay/test_op_level2.py +++ 
b/tests/python/relay/test_op_level2.py @@ -18,6 +18,7 @@ """ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm import relay from tvm.relay import transform @@ -28,7 +29,7 @@ def test_conv1d_infer_type(): # symbolic in batch dimension - n, c, w = tvm.var("n"), 10, 224 + n, c, w = te.var("n"), 10, 224 x = relay.var("x", relay.ty.TensorType((n, c, w), "float32")) w = relay.var("w") y = relay.nn.conv1d(x, w, @@ -42,7 +43,7 @@ def test_conv1d_infer_type(): (2, 10, 3), "float32") # infer by shape of w, mixed precision - n, c, w = tvm.var("n"), 10, 224 + n, c, w = te.var("n"), 10, 224 x = relay.var("x", relay.TensorType((n, c, w), "int8")) w = relay.var("w", relay.TensorType((2, 10, 3), "int8")) y = relay.nn.conv1d(x, w, out_dtype="int32") @@ -52,7 +53,7 @@ def test_conv1d_infer_type(): (n, 2, 222), "int32") # infer shape in case of different dtypes for input and weight. - n, c, w = tvm.var("n"), 10, 224 + n, c, w = te.var("n"), 10, 224 x = relay.var("x", relay.TensorType((n, c, w), "uint8")) w = relay.var("w", relay.TensorType((2, 10, 3), "int8")) y = relay.nn.conv1d(x, w, out_dtype="int32") @@ -122,7 +123,7 @@ def run_test_conv1d(dtype, out_dtype, scale, dshape, kshape, def test_conv2d_infer_type(): # symbolic in batch dimension - n, c, h, w = tvm.size_var("n"), 10, 224, 224 + n, c, h, w = te.size_var("n"), 10, 224, 224 x = relay.var("x", relay.ty.TensorType((n, c, h, w), "float32")) w = relay.var("w") y = relay.nn.conv2d(x, w, @@ -136,7 +137,7 @@ def test_conv2d_infer_type(): (2, 10, 3, 3), "float32") # infer by shape of w, mixed precision - n, c, h, w = tvm.size_var("n"), 10, 224, 224 + n, c, h, w = te.size_var("n"), 10, 224, 224 x = relay.var("x", relay.TensorType((n, c, h, w), "int8")) w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8")) y = relay.nn.conv2d(x, w, out_dtype="int32") @@ -146,7 +147,7 @@ def test_conv2d_infer_type(): (n, 2, 222, 222), "int32") # infer shape in case of different dtypes for input and weight. - n, c, h, w = tvm.size_var("n"), 10, 224, 224 + n, c, h, w = te.size_var("n"), 10, 224, 224 x = relay.var("x", relay.TensorType((n, c, h, w), "uint8")) w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8")) y = relay.nn.conv2d(x, w, out_dtype="int32") @@ -385,7 +386,7 @@ def run_test_conv2d_cuda(dtype, out_dtype, scale, dshape, kshape, def test_conv3d_infer_type(): # symbolic in batch dimension - n, c, d, h, w = tvm.size_var("n"), 10, 224, 224, 224 + n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224 x = relay.var("x", relay.ty.TensorType((n, c, d, h, w), "float32")) w = relay.var("w") y = relay.nn.conv3d(x, w, @@ -399,7 +400,7 @@ def test_conv3d_infer_type(): (2, 10, 3, 3, 3), "float32") # infer by shape of w, mixed precision - n, c, d, h, w = tvm.size_var("n"), 10, 224, 224, 224 + n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224 x = relay.var("x", relay.TensorType((n, c, d, h, w), "int8")) w = relay.var("w", relay.TensorType((2, 10, 3, 3, 3), "int8")) y = relay.nn.conv3d(x, w, out_dtype="int32") @@ -409,7 +410,7 @@ def test_conv3d_infer_type(): (n, 2, 222, 222, 222), "int32") # infer shape in case of different dtypes for input and weight. 
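# Mixed-precision inference as exercised by the conv2d/conv3d hunks above:
# int8 data and weight with an int32 accumulator, the output shape fixed by
# the weight (sketch reusing the same shapes as the test):
import tvm
from tvm import te, relay
from tvm.relay.testing import run_infer_type

n = te.size_var("n")
x = relay.var("x", relay.TensorType((n, 10, 224, 224), "int8"))
w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8"))
y = relay.nn.conv2d(x, w, out_dtype="int32")
yy = run_infer_type(y)
assert yy.checked_type == relay.TensorType((n, 2, 222, 222), "int32")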
- n, c, d, h, w = tvm.size_var("n"), 10, 224, 224, 224 + n, c, d, h, w = te.size_var("n"), 10, 224, 224, 224 x = relay.var("x", relay.TensorType((n, c, d, h, w), "uint8")) w = relay.var("w", relay.TensorType((2, 10, 3, 3, 3), "int8")) y = relay.nn.conv3d(x, w, out_dtype="int32") @@ -524,7 +525,7 @@ def run_test_conv3d(dtype, out_dtype, scale, dshape, kshape, def test_conv2d_transpose_infer_type(): # symbolic in batch dimension - n, c, h, w = tvm.size_var("n"), 10, 10, 12 + n, c, h, w = te.size_var("n"), 10, 10, 12 x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) w = relay.var("w", relay.IncompleteType()) y = relay.nn.conv2d_transpose(x, w, @@ -539,7 +540,7 @@ def test_conv2d_transpose_infer_type(): (10, 15, 3, 3), "float32") # infer by shape of w, mixed precision - n, h, w, c = tvm.size_var("n"), 10, 10, 12 + n, h, w, c = te.size_var("n"), 10, 10, 12 x = relay.var("x", relay.TensorType((n, h, w, c), "float32")) w = relay.var("w", relay.TensorType((12, 11, 5, 5), "float32")) y = relay.nn.conv2d_transpose(x, w, @@ -624,41 +625,41 @@ def test_conv1d_transpose_ncw_run(): def test_upsampling_infer_type(): - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") - scale = tvm.const(2.0, "float64") + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") + scale = tvm.tir.const(2.0, "float64") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.upsampling(x, scale_h=2, scale_w=2, layout="NCHW", method="bilinear") "method=\"BINLINEAR\"" in y.astext() yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, tvm.tir.Cast("int32", tvm.round(h*scale)), - tvm.tir.Cast("int32", tvm.round(w*scale))), + assert yy.checked_type == relay.TensorType((n, c, tvm.tir.Cast("int32", te.round(h*scale)), + tvm.tir.Cast("int32", te.round(w*scale))), "float32") - n, c = tvm.size_var("n"), tvm.size_var("c") + n, c = te.size_var("n"), te.size_var("c") x = relay.var("x", relay.TensorType((n, c, 100, 200), "float32")) y = relay.nn.upsampling(x, scale_h=2, scale_w=2, layout="NCHW", method="bilinear") yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, 200, 400), "float32") def test_upsampling3d_infer_type(): - n, c, d, h, w = tvm.size_var("n"), tvm.size_var("c"),\ - tvm.size_var("d"), tvm.size_var("h"), tvm.size_var("w") - scale = tvm.const(2.0, "float64") + n, c, d, h, w = te.size_var("n"), te.size_var("c"),\ + te.size_var("d"), te.size_var("h"), te.size_var("w") + scale = tvm.tir.const(2.0, "float64") x = relay.var("x", relay.TensorType((n, c, d, h, w), "float32")) y = relay.nn.upsampling3d(x, scale_d=2, scale_h=2, scale_w=2, layout="NCDHW", method="trilinear") yy = run_infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, tvm.tir.Cast("int32", tvm.round(d*scale)), - tvm.tir.Cast("int32", tvm.round(h*scale)), - tvm.tir.Cast("int32", tvm.round(w*scale))), + assert yy.checked_type == relay.TensorType((n, c, tvm.tir.Cast("int32", te.round(d*scale)), + tvm.tir.Cast("int32", te.round(h*scale)), + tvm.tir.Cast("int32", te.round(w*scale))), "float32") - n, c = tvm.size_var("n"), tvm.size_var("c") + n, c = te.size_var("n"), te.size_var("c") x = relay.var("x", relay.TensorType((n, c, 100, 100, 200), "float32")) y = relay.nn.upsampling3d(x, scale_d=2, scale_h=2, scale_w=2, layout="NCDHW", method="trilinear") yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, c, 200, 200, 400), "float32") def _test_pool2d(opfunc, reffunc): - n, c, h, w = tvm.size_var("n"), 10, 224, 
224 + n, c, h, w = te.size_var("n"), 10, 224, 224 x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = opfunc(x, pool_size=(1, 1)) assert "pool_size=" in y.astext() @@ -678,7 +679,7 @@ def _test_pool2d(opfunc, reffunc): tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) def _test_pool2d_int(opfunc, reffunc, dtype): - n, c, h, w = tvm.size_var("n"), 10, 224, 224 + n, c, h, w = te.size_var("n"), 10, 224, 224 x = relay.var("x", relay.TensorType((n, c, h, w), dtype)) y = opfunc(x, pool_size=(1, 1)) assert "pool_size=" in y.astext() @@ -698,13 +699,13 @@ def _test_pool2d_int(opfunc, reffunc, dtype): tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) def _test_global_pool2d(opfunc, reffunc): - n, c, h, w = tvm.size_var("n"), tvm.size_var("c"), 224, 224 + n, c, h, w = te.size_var("n"), te.size_var("c"), 224, 224 x = relay.var("x", relay.TensorType((n, h, w, c), "float32")) y = opfunc(x, layout="NHWC") yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n, 1, 1, c), "float32") - n, c, h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = opfunc(x) yy = run_infer_type(y) @@ -735,7 +736,7 @@ def test_pool2d(): def test_pool1d(): def _test_pool1d(opfunc): - n, c, w = tvm.var("n"), 10, 224 + n, c, w = te.var("n"), 10, 224 x = relay.var("x", relay.TensorType((n, c, w), "float32")) y = opfunc(x, pool_size=(1,)) assert "pool_size=" in y.astext() @@ -763,7 +764,7 @@ def _test_pool1d(opfunc): def test_pool3d(): def _test_pool3d(opfunc, padding=(0, 0, 0, 0, 0, 0), out_shape=(1, 3, 16, 16, 16)): - n, c, d, h, w = tvm.size_var("n"), 10, 5, 224, 224 + n, c, d, h, w = te.size_var("n"), 10, 5, 224, 224 x = relay.var("x", relay.TensorType((n, c, d, h, w), "float32")) y = opfunc(x, pool_size=(1, 1, 1)) assert "pool_size=" in y.astext() @@ -833,7 +834,7 @@ def test_avg_pool2d_no_count_pad(): tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) def test_flatten_infer_type(): - d1, d2, d3, d4 = tvm.size_var("d1"), tvm.size_var("d2"), tvm.size_var("d3"), tvm.size_var("d4") + d1, d2, d3, d4 = te.size_var("d1"), te.size_var("d2"), te.size_var("d3"), te.size_var("d4") x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32")) y = relay.nn.batch_flatten(x) yy = run_infer_type(y) @@ -878,7 +879,7 @@ def test_pad_infer_type(): assert yy.checked_type == relay.TensorType((3, 6, 9, 12), "float32") # some symbolic values - n, c, h, w = tvm.size_var("n"), 2, 3, tvm.size_var("w") + n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w") t = relay.var("t", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4))) yy = run_infer_type(y) @@ -901,7 +902,7 @@ def _test_run(dtype): _test_run('int32') def test_lrn(): - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", shape=(n, c , h, w)) y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75) "alpha=" in y.astext() @@ -932,7 +933,7 @@ def test_lrn(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) def test_l2_normalize(): - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), 
te.size_var("w") x = relay.var("x", shape=(n, c , h, w)) y = relay.nn.l2_normalize(x, eps=0.001, axis=[1]) "axis=" in y.astext() @@ -982,7 +983,7 @@ def test_batch_flatten(): def _test_upsampling(layout, method, align_corners=False): - n, c, h, w = tvm.size_var("n"), 16, 32, 32 + n, c, h, w = te.size_var("n"), 16, 32, 32 scale_h = 2.0 scale_w = 2.0 dtype = "float32" @@ -1021,7 +1022,7 @@ def test_upsampling(): _test_upsampling("NHWC", "bilinear", True) def _test_upsampling3d(layout, method, coordinate_transformation_mode="half_pixel"): - n, c, d, h, w = tvm.size_var("n"), 8, 16, 16, 16 + n, c, d, h, w = te.size_var("n"), 8, 16, 16, 16 scale_d = 2.0 scale_h = 2.0 scale_w = 2.0 @@ -1220,7 +1221,7 @@ def test_depthwise_conv2d_int8(): def test_bitserial_conv2d_infer_type(): # Basic shape test with ambiguous batch. - n, c, h, w = tvm.size_var("n"), 32, 224, 224 + n, c, h, w = te.size_var("n"), 32, 224, 224 x = relay.var("x", relay.ty.TensorType((n, c, h, w), "int16")) w = relay.var("w", relay.ty.TensorType((32, 32, 3, 3), "int16")) y = relay.nn.bitserial_conv2d( diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index c5f340a843a3..7e5314dd8234 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -19,6 +19,7 @@ import numpy as np import pytest import tvm +from tvm import te from tvm import relay from tvm.relay import create_executor, transform from tvm.relay.testing import ctx_list, check_grad, run_infer_type @@ -166,7 +167,7 @@ def verify_squeeze(shape, dtype, axis): def test_transpose_infer_type(): - n, t, d = tvm.size_var("n"), tvm.size_var("t"), 100 + n, t, d = te.size_var("n"), te.size_var("t"), 100 x = relay.var("x", relay.TensorType((n, t, d), "float32")) y = relay.transpose(x, axes=(1, 0, 2)) assert "axes=" in y.astext() @@ -274,7 +275,7 @@ def test_reshape_like_infer_type(): assert zz.checked_type == relay.TensorType((1, 6), "float32") # symbolic shape - n, c, h, w = tvm.size_var("n"), 2, 3, tvm.size_var("w") + n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = relay.var("y", relay.TensorType((1, 8, 8), "float32")) z = relay.reshape_like(x, y) @@ -313,8 +314,8 @@ def verify_take(dshape, indices_shape, oshape, axis=None): yy = run_infer_type(y) assert yy.checked_type == relay.TensorType(oshape, "float32") - d1, d2, d3 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3") - d4, d5, d6 = tvm.var("d4"), tvm.var("d5"), tvm.var("d6") + d1, d2, d3 = te.var("d1"), te.var("d2"), te.var("d3") + d4, d5, d6 = te.var("d4"), te.var("d5"), te.var("d6") verify_take((d1,), (1,), (1,), 0) verify_take((4,), (d1, d2), (d1, d2)) verify_take((3, 3, 3), (1, d2), (1, d2)) @@ -368,12 +369,12 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None): yy = run_infer_type(y.astuple()) assert yy.checked_type == ret_type - idxd = tvm.indexdiv + idxd = tvm.tir.indexdiv - d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") - axis = tvm.var("axis") + d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") + axis = te.var("axis") verify_split((5, 5, 2, 2), 5, - relay.ty.TupleType(tvm.convert([ + relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((5, 1, 2, 2), "float32"), relay.ty.TensorType((5, 1, 2, 2), "float32"), relay.ty.TensorType((5, 1, 2, 2), "float32"), @@ -381,7 +382,7 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None): relay.ty.TensorType((5, 1, 2, 2), "float32")])), axis=1) 
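# tvm.tir.indexdiv (previously tvm.indexdiv) builds the symbolic quotients
# that appear in the expected split shapes above:
import tvm
from tvm import te

d3 = te.var("d3")
q = tvm.tir.indexdiv(d3, 4)   # floor division, assuming non-negative indices
print(q)                      # prints something like floordiv(d3, 4)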
verify_split((5, 5, 2, 2), 5, - relay.ty.TupleType(tvm.convert([ + relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((1, 5, 2, 2), "float32"), relay.ty.TensorType((1, 5, 2, 2), "float32"), relay.ty.TensorType((1, 5, 2, 2), "float32"), @@ -389,19 +390,19 @@ def verify_split(dshape, indices_or_sections, ret_type, axis=None): relay.ty.TensorType((1, 5, 2, 2), "float32")])), axis=0) verify_split((d1, d2, d3, d4), 4, - relay.ty.TupleType(tvm.convert([ + relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32"), relay.ty.TensorType((d1, d2, idxd(d3, 4), d4), "float32")])), axis=2) verify_split((d1, d2, d3, d4), 2, - relay.ty.TupleType(tvm.convert([ + relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32"), relay.ty.TensorType((idxd(d1, 2), d2, d3, d4), "float32")])), axis=0) verify_split((d1, d2, d3, d4), (2, 4, 7), - relay.ty.TupleType(tvm.convert([ + relay.ty.TupleType(tvm.runtime.convert([ relay.ty.TensorType((d1, 2, d3, d4), "float32"), relay.ty.TensorType((d1, 2, d3, d4), "float32"), relay.ty.TensorType((d1, 3, d3, d4), "float32"), @@ -447,7 +448,7 @@ def test_full_like_infer_type(): assert yy.checked_type == relay.TensorType((1, 2, 3), "float32") # symbolic shape - n, c, h, w = tvm.size_var("n"), 2, 3, tvm.size_var("w") + n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w") base = relay.var("base", relay.TensorType((n, c, h, w), "float32")) fill = relay.var("fill", relay.TensorType((), "float32")) y = relay.full_like(base, fill) @@ -475,7 +476,7 @@ def verify_full_like(base, fill_value, dtype): def test_infer_type_leaky_relu(): - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.leaky_relu(x, alpha=0.1) "alpha=0.1" in y.astext() @@ -539,7 +540,7 @@ def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"): def test_infer_type_prelu(): - n, c , h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c , h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w)) verify_infer_type_prelu((n, h, w, c), (c,), 3, (n, h, w, c)) verify_infer_type_prelu((n, c, h, w), None, 1, (n, c, h, w)) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 44b51f2c2367..473ae59a9dbe 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
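Note: a quick sanity sketch of the symbolic-split pattern above (hypothetical shapes, using the run_infer_type helper these tests already import):

from tvm import te, relay
from tvm.relay.testing import run_infer_type

d1 = te.var("d1")                      # previously tvm.var("d1")
x = relay.var("x", relay.TensorType((d1, 4), "float32"))
y = relay.split(x, indices_or_sections=2, axis=1)
yy = run_infer_type(y.astuple())
# each tuple field is inferred as TensorType((d1, 2), "float32")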
import tvm +from tvm import te import numpy as np from tvm import relay from tvm.relay import transform @@ -24,7 +25,7 @@ def test_binary_op(): def check_binary_op(opfunc, ref): - n = tvm.size_var("n") + n = te.size_var("n") t1 = relay.TensorType((5, n, 5)) t2 = relay.TensorType((n, 1)) x = relay.var("x", t1) @@ -193,7 +194,7 @@ def _wrapper(data, axis=None, keepdims=False): return func(data, axis=axis).reshape(out_shape) return _wrapper - d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") + d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") for func in [[relay.sum, np.sum], [relay.max, np.max], [relay.min, np.min], @@ -282,7 +283,7 @@ def verify(dshape, begin, end, strides, output, test_ref=True): op_res = intrp.evaluate(func)(x_data) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) - d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4") + d1, d2, d3, d4 = te.var("d1"), te.var("d2"), te.var("d3"), te.var("d4") verify((d1, d2, 3), [None, None, 1], [None, None, 2], None, (d1, d2, 1), False) verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2)) verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3)) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index e622a8ae01ab..8fd05daf73c7 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -19,6 +19,7 @@ import math import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import transform from tvm.relay.testing import ctx_list, run_infer_type @@ -26,9 +27,9 @@ def test_resize_infer_type(): - n, c, h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "int8")) - th, tw = tvm.var("th"), tvm.var("tw") + th, tw = te.var("th"), te.var("tw") z = relay.image.resize(x, (th, tw)) zz = run_infer_type(z) assert zz.checked_type == relay.TensorType((n, c, th, tw), "int8") @@ -182,7 +183,7 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), x = relay.var("x", relay.TensorType(dshape, "float32")) verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets, check_size=True) - y = relay.var("y", relay.TensorType((tvm.size_var("n"), 3, 56, 56), "float32")) + y = relay.var("y", relay.TensorType((te.size_var("n"), 3, 56, 56), "float32")) verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets, check_size=True, check_type_only=True) @@ -190,7 +191,7 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), ref_res = get_ref_result(dshape, clip=False) x = relay.var("x", relay.TensorType(dshape, "float32")) verify_multibox_prior(x, dshape, ref_res, clip=False) - y = relay.var("y", relay.TensorType((tvm.size_var("n"), 24, 32, 32), "float32")) + y = relay.var("y", relay.TensorType((te.size_var("n"), 24, 32, 32), "float32")) verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True) @@ -280,7 +281,7 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 - dshape = (tvm.size_var("n"), num_anchors, 6) + dshape = (te.size_var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) @@ -291,7 +292,7 @@ def verify_nms(x0_data, x1_data, dshape, ref_res, 
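Note: the level-4 hunks keep the same symbolic-broadcast setup, only respelled with te.var / te.size_var. A sketch of the distinction, under the same assumptions as the notes above:

from tvm import te, relay

n = te.size_var("n")                   # size_var additionally carries a >= 0 constraint
t1 = relay.TensorType((5, n, 5))
t2 = relay.TensorType((n, 1))
x, y = relay.var("x", t1), relay.var("y", t2)
z = relay.add(x, y)                    # broadcasts (5, n, 5) with (n, 1)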
ref_indices_res, [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) np_indices_result = np.array([[3, 0, 1, -1, -1]]) - dshape = (tvm.size_var("n"), num_anchors, 6) + dshape = (te.size_var("n"), num_anchors, 6) verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) @@ -331,7 +332,7 @@ def test_default_value(): cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors) ret = run_infer_type(mtl.astuple()) ref_type = relay.ty.TupleType( - tvm.convert([ + tvm.runtime.convert([ relay.ty.TensorType((1, num_anchors, 6), "float32"), relay.ty.TensorType((1, ), "int") ])) @@ -354,7 +355,7 @@ def test_default_value(): def test_threshold(): num_anchors = 5 num_classes = 5 - n = tvm.size_var("n") + n = te.size_var("n") cls_prob = relay.var( "cls_prob", relay.ty.TensorType((n, num_anchors, num_classes), "float32")) @@ -373,7 +374,7 @@ def test_threshold(): variances=variances) ret = run_infer_type(ret.astuple()) ref_type = relay.ty.TupleType( - tvm.convert([ + tvm.runtime.convert([ relay.ty.TensorType((n, num_anchors, 6), "float32"), relay.ty.TensorType((n, ), "int") ])) @@ -520,8 +521,8 @@ def verify_yolo_reorg(shape, stride, out_shape): assert "stride=" in z.astext() assert zz.checked_type == relay.ty.TensorType(out_shape, "float32") - n, c, h, w = tvm.size_var("n"), tvm.size_var("c"), tvm.size_var("h"), tvm.size_var("w") - idxd = tvm.indexdiv + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") + idxd = tvm.tir.indexdiv verify_yolo_reorg((n, c, 20, 20), 10, (n, c*10*10, 2, 2)) verify_yolo_reorg((n, c, h, w), 2, (n, c*2*2, idxd(h, 2), idxd(w, 2))) diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 286776e3f7b2..287e80a0fab7 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -18,6 +18,7 @@ """ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay.testing import ctx_list diff --git a/tests/python/relay/test_op_qnn_add.py b/tests/python/relay/test_op_qnn_add.py index e1f54ed4b78c..bd0f6612d80d 100644 --- a/tests/python/relay/test_op_qnn_add.py +++ b/tests/python/relay/test_op_qnn_add.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_op_qnn_concatenate.py b/tests/python/relay/test_op_qnn_concatenate.py index 35c2f971a791..03ab9eeb1321 100644 --- a/tests/python/relay/test_op_qnn_concatenate.py +++ b/tests/python/relay/test_op_qnn_concatenate.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_op_qnn_conv2d.py b/tests/python/relay/test_op_qnn_conv2d.py index e827c722b255..66acda863596 100644 --- a/tests/python/relay/test_op_qnn_conv2d.py +++ b/tests/python/relay/test_op_qnn_conv2d.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.relay import transform diff --git a/tests/python/relay/test_op_qnn_dense.py b/tests/python/relay/test_op_qnn_dense.py index 43600cbf60c5..3cfcfd165b46 100644 --- a/tests/python/relay/test_op_qnn_dense.py +++ b/tests/python/relay/test_op_qnn_dense.py @@ -16,6 +16,7 @@ # under the License. 
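Note: the yolo_reorg expectation above now spells index division as tvm.tir.indexdiv. A minimal sketch of what it computes over symbolic dims (refactor-era API assumed):

import tvm
from tvm import te

h = te.size_var("h")
out_h = tvm.tir.indexdiv(h, 2)   # symbolic h // stride; operands assumed non-negative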
import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index b1965c97ad0d..febf5c5e6ecc 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_op_qnn_mul.py b/tests/python/relay/test_op_qnn_mul.py index 959a02a976ad..6516871d3fb5 100644 --- a/tests/python/relay/test_op_qnn_mul.py +++ b/tests/python/relay/test_op_qnn_mul.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py index bdc7bc04d6da..09b04d8925c6 100644 --- a/tests/python/relay/test_op_qnn_quantize.py +++ b/tests/python/relay/test_op_qnn_quantize.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_op_qnn_requantize.py b/tests/python/relay/test_op_qnn_requantize.py index 8af778160ccb..81233972cb28 100644 --- a/tests/python/relay/test_op_qnn_requantize.py +++ b/tests/python/relay/test_op_qnn_requantize.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py index 4161b908c7a8..497a81881c87 100644 --- a/tests/python/relay/test_param_dict.py +++ b/tests/python/relay/test_param_dict.py @@ -17,6 +17,7 @@ import os import numpy as np import tvm +from tvm import te import json import base64 from tvm._ffi.base import py_str diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py index 0319d0b1a371..7e34f48ec7e1 100644 --- a/tests/python/relay/test_pass_alpha_equal.py +++ b/tests/python/relay/test_pass_alpha_equal.py @@ -16,6 +16,7 @@ # under the License. 
import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import analysis from tvm.relay.testing import run_opt_pass @@ -64,10 +65,10 @@ def test_type_param_alpha_equal(): # function types are the only way to put type params # in eq map - ft1 = relay.FuncType(tvm.convert([]), t1, tvm.convert([t1]), tvm.convert([])) - ft2 = relay.FuncType(tvm.convert([]), t3, tvm.convert([t3]), tvm.convert([])) + ft1 = relay.FuncType(tvm.runtime.convert([]), t1, tvm.runtime.convert([t1]), tvm.runtime.convert([])) + ft2 = relay.FuncType(tvm.runtime.convert([]), t3, tvm.runtime.convert([t3]), tvm.runtime.convert([])) # actually an invalid type because t2 is wrong kind - ft3 = relay.FuncType(tvm.convert([]), t2, tvm.convert([t2]), tvm.convert([])) + ft3 = relay.FuncType(tvm.runtime.convert([]), t2, tvm.runtime.convert([t2]), tvm.runtime.convert([])) assert ft1 == ft2 assert ft1 != ft3 # kinds still do not match @@ -85,51 +86,51 @@ def test_func_type_alpha_equal(): broadcast = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast") identity = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Identity") - tr1 = relay.TypeRelation(broadcast, tvm.convert([tp1, tp3]), 1, None) - tr2 = relay.TypeRelation(broadcast, tvm.convert([tp2, tp4]), 1, None) - tr3 = relay.TypeRelation(identity, tvm.convert([tp1, tp3]), 1, None) + tr1 = relay.TypeRelation(broadcast, tvm.runtime.convert([tp1, tp3]), 1, None) + tr2 = relay.TypeRelation(broadcast, tvm.runtime.convert([tp2, tp4]), 1, None) + tr3 = relay.TypeRelation(identity, tvm.runtime.convert([tp1, tp3]), 1, None) - ft = relay.FuncType(tvm.convert([t1, t2]), tp1, - tvm.convert([tp1, tp3]), - tvm.convert([tr1])) - translate_vars = relay.FuncType(tvm.convert([t1, t2]), tp1, - tvm.convert([tp2, tp4]), - tvm.convert([tr2])) + ft = relay.FuncType(tvm.runtime.convert([t1, t2]), tp1, + tvm.runtime.convert([tp1, tp3]), + tvm.runtime.convert([tr1])) + translate_vars = relay.FuncType(tvm.runtime.convert([t1, t2]), tp1, + tvm.runtime.convert([tp2, tp4]), + tvm.runtime.convert([tr2])) assert ft == translate_vars - different_args = relay.FuncType(tvm.convert([t1]), tp1, - tvm.convert([tp1, tp3]), - tvm.convert([tr1])) + different_args = relay.FuncType(tvm.runtime.convert([t1]), tp1, + tvm.runtime.convert([tp1, tp3]), + tvm.runtime.convert([tr1])) assert ft != different_args - different_order = relay.FuncType(tvm.convert([t2, t1]), tp1, - tvm.convert([tp1, tp3]), - tvm.convert([tr1])) + different_order = relay.FuncType(tvm.runtime.convert([t2, t1]), tp1, + tvm.runtime.convert([tp1, tp3]), + tvm.runtime.convert([tr1])) assert ft != different_order - no_rel = relay.FuncType(tvm.convert([t1, t2]), tp1, - tvm.convert([tp1, tp3]), - tvm.convert([])) + no_rel = relay.FuncType(tvm.runtime.convert([t1, t2]), tp1, + tvm.runtime.convert([tp1, tp3]), + tvm.runtime.convert([])) assert ft != no_rel - more_vars = relay.FuncType(tvm.convert([t1, t2]), tp2, - tvm.convert([tp1, tp2, tp3]), - tvm.convert([tr1])) + more_vars = relay.FuncType(tvm.runtime.convert([t1, t2]), tp2, + tvm.runtime.convert([tp1, tp2, tp3]), + tvm.runtime.convert([tr1])) assert ft != more_vars - all_the_vars = relay.FuncType(tvm.convert([t1, t2]), tp1, - tvm.convert([tp1, tp2, tp3, tp4]), - tvm.convert([tr1, tr2])) + all_the_vars = relay.FuncType(tvm.runtime.convert([t1, t2]), tp1, + tvm.runtime.convert([tp1, tp2, tp3, tp4]), + tvm.runtime.convert([tr1, tr2])) assert ft != all_the_vars - different_rel = relay.FuncType(tvm.convert([t1, t2]), tp1, - tvm.convert([tp1, tp3]), - tvm.convert([tr3])) + different_rel = 
relay.FuncType(tvm.runtime.convert([t1, t2]), tp1, + tvm.runtime.convert([tp1, tp3]), + tvm.runtime.convert([tr3])) assert ft != different_rel - more_rels = relay.FuncType(tvm.convert([t1, t2]), tp1, - tvm.convert([tp1, tp3]), - tvm.convert([tr1, tr3])) + more_rels = relay.FuncType(tvm.runtime.convert([t1, t2]), tp1, + tvm.runtime.convert([tp1, tp3]), + tvm.runtime.convert([tr1, tr3])) assert ft != more_rels @@ -139,10 +140,10 @@ def test_tuple_type_alpha_equal(): tp1 = relay.TypeVar("v1", relay.TypeKind.Type) tp2 = relay.TypeVar("v2", relay.TypeKind.Type) - tup1 = relay.TupleType(tvm.convert([t1, t2, tp1])) - tup2 = relay.TupleType(tvm.convert([t1, t2, tp1])) - tup3 = relay.TupleType(tvm.convert([t2, t1, tp1])) - tup4 = relay.TupleType(tvm.convert([t1, t2, tp2])) + tup1 = relay.TupleType(tvm.runtime.convert([t1, t2, tp1])) + tup2 = relay.TupleType(tvm.runtime.convert([t1, t2, tp1])) + tup3 = relay.TupleType(tvm.runtime.convert([t2, t1, tp1])) + tup4 = relay.TupleType(tvm.runtime.convert([t1, t2, tp2])) # as long as types are alpha-equal and in same order, # tuples should be alpha-equal @@ -165,16 +166,16 @@ def test_type_relation_alpha_equal(): attr1_same = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3,4)) attr2 = tvm.ir.make_node("attrs.TestAttrs", name="attr", padding=(3,4,4)) - tr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1) - same = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1) - diff_func = relay.TypeRelation(identity, tvm.convert([t1, t2]), 1, attr1) - diff_order = relay.TypeRelation(broadcast, tvm.convert([t2, t1]), 1, attr1) - diff_args = relay.TypeRelation(broadcast, tvm.convert([t2, t3]), 1, attr1) - diff_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr2) - same_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1_same) + tr = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr1) + same = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr1) + diff_func = relay.TypeRelation(identity, tvm.runtime.convert([t1, t2]), 1, attr1) + diff_order = relay.TypeRelation(broadcast, tvm.runtime.convert([t2, t1]), 1, attr1) + diff_args = relay.TypeRelation(broadcast, tvm.runtime.convert([t2, t3]), 1, attr1) + diff_attr = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr2) + same_attr = relay.TypeRelation(broadcast, tvm.runtime.convert([t1, t2]), 1, attr1_same) - bigger = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 2, attr1) - diff_num_inputs = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 1, attr2) + bigger = relay.TypeRelation(identity, tvm.runtime.convert([t1, t3, t2]), 2, attr1) + diff_num_inputs = relay.TypeRelation(identity, tvm.runtime.convert([t1, t3, t2]), 1, attr2) # func, number of args, input count, and order should be the same assert tr == same diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index df01310937ed..eabe7584f013 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr diff --git a/tests/python/relay/test_pass_annotation.py b/tests/python/relay/test_pass_annotation.py index 3e7d916c96fa..49e9883d8ee8 100644 --- a/tests/python/relay/test_pass_annotation.py +++ b/tests/python/relay/test_pass_annotation.py @@ -19,6 +19,7 
@@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.contrib import graph_runtime from tvm.relay.expr_functor import ExprMutator diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 02438ef04f2a..35d33b10f186 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay import testing diff --git a/tests/python/relay/test_pass_canonicalize_cast.py b/tests/python/relay/test_pass_canonicalize_cast.py index 672b4b192995..e9ab67ff5166 100644 --- a/tests/python/relay/test_pass_canonicalize_cast.py +++ b/tests/python/relay/test_pass_canonicalize_cast.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import tvm.relay as relay import tvm.relay.transform as _transform diff --git a/tests/python/relay/test_pass_check_kind.py b/tests/python/relay/test_pass_check_kind.py index 62a92040ff16..06fe13a9a99a 100644 --- a/tests/python/relay/test_pass_check_kind.py +++ b/tests/python/relay/test_pass_check_kind.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import check_kind import pytest @@ -33,9 +34,9 @@ def test_typevar_kind(): def test_tuple_kind(): # only contain type kinds tp = relay.TypeVar('tp', relay.TypeKind.Type) - tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32') - tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([])) - fields = tvm.convert([tp, tf, tt]) + tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') + tf = relay.FuncType(tvm.runtime.convert([]), tt, tvm.runtime.convert([]), tvm.runtime.convert([])) + fields = tvm.runtime.convert([tp, tf, tt]) tup_ty = relay.TupleType(fields) assert check_kind(tup_ty) == relay.TypeKind.Type @@ -46,16 +47,16 @@ def test_func_kind(): tp1 = relay.TypeVar('tp1', relay.TypeKind.Type) tp2 = relay.TypeVar('tp2', relay.TypeKind.Type) - shape = tvm.convert([1, 2, 3]) + shape = tvm.runtime.convert([1, 2, 3]) dtype = 'float32' tensor_type = relay.TensorType(shape, dtype) - tr = relay.TypeRelation(None, tvm.convert([tensor_type, tp1]) , 1, None) + tr = relay.TypeRelation(None, tvm.runtime.convert([tensor_type, tp1]) , 1, None) - type_params = tvm.convert([tp1, tp2]) - type_constraints = tvm.convert([tr]) - arg_types = tvm.convert([tp1, tensor_type]) - ret_type = relay.TupleType(tvm.convert([tp2, tensor_type])) + type_params = tvm.runtime.convert([tp1, tp2]) + type_constraints = tvm.runtime.convert([tr]) + arg_types = tvm.runtime.convert([tp1, tensor_type]) + ret_type = relay.TupleType(tvm.runtime.convert([tp2, tensor_type])) tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints) assert check_kind(tf) == relay.TypeKind.Type @@ -63,8 +64,8 @@ def test_func_kind(): def test_ref_kind(): # only contain type kinds - tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32') - ft = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([])) + tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') + ft = relay.FuncType(tvm.runtime.convert([]), tt, tvm.runtime.convert([]), tvm.runtime.convert([])) rt1 = relay.RefType(tt) assert check_kind(rt1) == relay.TypeKind.Type @@ -77,9 +78,9 @@ def test_ref_kind(): def test_relation_kind(): # only have type kinds for arguments tp = relay.TypeVar('tp', relay.TypeKind.Type) - tt = 
relay.TensorType(tvm.convert([1, 2, 3]), 'float32') - tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([])) - args = tvm.convert([tf, tt, tp]) + tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') + tf = relay.FuncType(tvm.runtime.convert([]), tt, tvm.runtime.convert([]), tvm.runtime.convert([])) + args = tvm.runtime.convert([tf, tt, tp]) tr = relay.TypeRelation(None, args, 2, None) assert check_kind(tr) == relay.TypeKind.Constraint @@ -115,7 +116,7 @@ def test_invalid_tuple_kind(): tp1 = relay.TypeVar('tp1', relay.TypeKind.ShapeVar) tp2 = relay.TypeVar('tp2', relay.TypeKind.BaseType) tp3 = relay.TypeVar('tp3', relay.TypeKind.Constraint) - fields = tvm.convert([tp1, tp2, tp3]) + fields = tvm.runtime.convert([tp1, tp2, tp3]) tup_ty = relay.TupleType(fields) check_kind(tup_ty) @@ -127,9 +128,9 @@ def test_invalid_func_kind(): tp2 = relay.TypeVar('tp2', relay.TypeKind.BaseType) tp3 = relay.TypeVar('tp3', relay.TypeKind.Constraint) - type_params = tvm.convert([tp1, tp2, tp3]) - type_constraints = tvm.convert([]) - arg_types = tvm.convert([tp1, tp2]) + type_params = tvm.runtime.convert([tp1, tp2, tp3]) + type_constraints = tvm.runtime.convert([]) + arg_types = tvm.runtime.convert([tp1, tp2]) ret_type = tp3 tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints) @@ -148,7 +149,7 @@ def test_invalid_relation_kind(): tp1 = relay.TypeVar('tp1', relay.TypeKind.ShapeVar) tp2 = relay.TypeVar('tp2', relay.TypeKind.BaseType) tp3 = relay.TypeVar('tp3', relay.TypeKind.Constraint) - args = tvm.convert([tp1, tp2, tp3]) + args = tvm.runtime.convert([tp1, tp2, tp3]) func = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Broadcast") tr = relay.TypeRelation(func, args, 2, None) @@ -187,7 +188,7 @@ def test_typecall_invalid_num_args(): def test_func_with_invalid_ret_type(): tp1 = relay.TypeVar('tp1', relay.TypeKind.Type) tp2 = relay.TypeVar('tp2', relay.TypeKind.ShapeVar) - tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([])) + tf = relay.FuncType(tvm.runtime.convert([tp1]), tp2, tvm.runtime.convert([tp1, tp2]), tvm.runtime.convert([])) check_kind(tf) @@ -196,7 +197,7 @@ def test_func_with_invalid_ret_type(): def test_func_with_invalid_arg_types(): tp1 = relay.TypeVar('tp1', relay.TypeKind.ShapeVar) tp2 = relay.TypeVar('tp2', relay.TypeKind.Type) - tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([])) + tf = relay.FuncType(tvm.runtime.convert([tp1]), tp2, tvm.runtime.convert([tp1, tp2]), tvm.runtime.convert([])) check_kind(tf) @@ -205,9 +206,9 @@ def test_func_with_invalid_arg_types(): def test_func_with_invalid_tuple(): tp1 = relay.TypeVar('tp1', relay.TypeKind.ShapeVar) - ret_type = relay.TupleType(tvm.convert([tp1, tp1, tp1])) + ret_type = relay.TupleType(tvm.runtime.convert([tp1, tp1, tp1])) - tf = relay.FuncType(tvm.convert([]), ret_type, tvm.convert([tp1]), tvm.convert([])) + tf = relay.FuncType(tvm.runtime.convert([]), ret_type, tvm.runtime.convert([tp1]), tvm.runtime.convert([])) check_kind(tf) @@ -218,20 +219,20 @@ def test_func_with_invalid_relation(): tp3 = relay.TypeVar('tp3', relay.TypeKind.Constraint) func = tvm.ir.EnvFunc.get("tvm.relay.type_relation.Identity") - tr = relay.TypeRelation(func, tvm.convert([tp2, tp3]), 1, None) + tr = relay.TypeRelation(func, tvm.runtime.convert([tp2, tp3]), 1, None) - tf = relay.FuncType(tvm.convert([tp1]), tp1, tvm.convert([tp1, tp2, tp3]), tvm.convert([tr])) + tf = relay.FuncType(tvm.runtime.convert([tp1]), tp1, tvm.runtime.convert([tp1, tp2, tp3]), 
tvm.runtime.convert([tr])) check_kind(tf) @pytest.mark.xfail(raises=tvm.error.TVMError) def test_tuple_with_invalid_func(): - tensor_type = relay.TensorType(tvm.convert([1, 2, 3]), 'float32') + tensor_type = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') tp1 = relay.TypeVar('tp1', relay.TypeKind.ShapeVar) - tf = relay.FuncType(tvm.convert([]), tp1, tvm.convert([tp1]), tvm.convert([])) + tf = relay.FuncType(tvm.runtime.convert([]), tp1, tvm.runtime.convert([tp1]), tvm.runtime.convert([])) - tup_ty = relay.TupleType(tvm.convert([tensor_type, tf])) + tup_ty = relay.TupleType(tvm.runtime.convert([tensor_type, tf])) check_kind(tup_ty) diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py index c10a7b8d1b39..ec9bcd9f2bc4 100644 --- a/tests/python/relay/test_pass_combine_parallel_conv2d.py +++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay import transform diff --git a/tests/python/relay/test_pass_combine_parallel_dense.py b/tests/python/relay/test_pass_combine_parallel_dense.py index f693f30060d9..84d8211666d8 100644 --- a/tests/python/relay/test_pass_combine_parallel_dense.py +++ b/tests/python/relay/test_pass_combine_parallel_dense.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay import transform diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 4b80d6ca120d..f9e7ca9a4b58 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -16,6 +16,7 @@ # under the License. """Test alter op layout pass""" import tvm +from tvm import te from tvm import relay from tvm.relay.op import register_alter_op_layout diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py index 3f1ec9efd5e1..604ec8969ef7 100644 --- a/tests/python/relay/test_pass_dead_code_elimination.py +++ b/tests/python/relay/test_pass_dead_code_elimination.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay import Function, transform from tvm.relay.analysis import alpha_equal, graph_equal, free_vars, assert_alpha_equal @@ -25,7 +26,7 @@ class env: def __init__(self): - self.shape = tvm.convert([1, 2, 3]) + self.shape = tvm.runtime.convert([1, 2, 3]) self.tt = relay.TensorType(self.shape, "float32") self.int32 = relay.TensorType([], "int32") self.float32 = relay.TensorType([], "float32") diff --git a/tests/python/relay/test_pass_eliminate_common_subexpr.py b/tests/python/relay/test_pass_eliminate_common_subexpr.py index e2fec6161c87..dddbef73e564 100644 --- a/tests/python/relay/test_pass_eliminate_common_subexpr.py +++ b/tests/python/relay/test_pass_eliminate_common_subexpr.py @@ -16,6 +16,7 @@ # under the License. 
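Note: the kind-checking hunks above all follow the same recipe; a compact sketch of the well-kinded case (shapes hypothetical, mirroring test_tuple_kind):

import tvm
from tvm import relay
from tvm.relay.analysis import check_kind

tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
tf = relay.FuncType(tvm.runtime.convert([]), tt,
                    tvm.runtime.convert([]), tvm.runtime.convert([]))
tup = relay.TupleType(tvm.runtime.convert([tt, tf]))
assert check_kind(tup) == relay.TypeKind.Type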
"""Test eliminate common subexpr pass""" import tvm +from tvm import te from tvm import relay from tvm.relay.op import register_alter_op_layout diff --git a/tests/python/relay/test_pass_eta_expand.py b/tests/python/relay/test_pass_eta_expand.py index b9eb2a1e692d..ad04e413b21b 100644 --- a/tests/python/relay/test_pass_eta_expand.py +++ b/tests/python/relay/test_pass_eta_expand.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay import tvm.relay.transform as _transform diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py index 08834f14e851..cc362a266aa5 100644 --- a/tests/python/relay/test_pass_fold_constant.py +++ b/tests/python/relay/test_pass_fold_constant.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name @@ -54,7 +55,7 @@ def fail(x): raise RuntimeError() # the fold constant should work on any context. - with tvm.build_config(add_lower_pass=[(0, fail)]): + with tvm.target.build_config(add_lower_pass=[(0, fail)]): with tvm.target.create("cuda"): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py index bfc3caba45e3..4c094fb3e6e7 100644 --- a/tests/python/relay/test_pass_fold_scale_axis.py +++ b/tests/python/relay/test_pass_fold_scale_axis.py @@ -17,6 +17,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import transform diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index e11b6aeb0a2c..a66022275c96 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te from tvm import relay from tvm.relay import transform from tvm.relay.testing import run_opt_pass diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py index 6c2ea8ffa3b3..6f2a12589fb5 100644 --- a/tests/python/relay/test_pass_gradient.py +++ b/tests/python/relay/test_pass_gradient.py @@ -17,6 +17,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import free_vars, free_type_vars, assert_alpha_equal from tvm.relay import create_executor, transform diff --git a/tests/python/relay/test_pass_lambda_lift.py b/tests/python/relay/test_pass_lambda_lift.py index a66c4c7d745a..e38887829551 100644 --- a/tests/python/relay/test_pass_lambda_lift.py +++ b/tests/python/relay/test_pass_lambda_lift.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay import transform diff --git a/tests/python/relay/test_pass_legalize.py b/tests/python/relay/test_pass_legalize.py index e4e16c002abf..9976eca28b29 100644 --- a/tests/python/relay/test_pass_legalize.py +++ b/tests/python/relay/test_pass_legalize.py @@ -17,6 +17,7 @@ """Test legalize pass""" import numpy as np import tvm +from tvm import te from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_pass_mac_count.py b/tests/python/relay/test_pass_mac_count.py index 5ce0e41cfbac..697aad8eedb7 100644 --- a/tests/python/relay/test_pass_mac_count.py +++ b/tests/python/relay/test_pass_mac_count.py @@ -17,6 +17,7 @@ """Unit tests for MAC counter.""" import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import analysis, transform @@ -39,7 +40,7 @@ def test_gemm(): data2 = relay.var("data2", shape=dshape2) gemm = relay.nn.dense(data1, data2) func = relay.Function([data1, data2], - relay.Tuple(tvm.convert([gemm]))) + relay.Tuple(tvm.runtime.convert([gemm]))) func = run_opt_pass(func, transform.InferType()) compute_count = analysis.get_total_mac_number(func) expect_count = n * m * k @@ -66,7 +67,7 @@ def test_conv(): channels=output_channel, kernel_size=(kh, kw), padding=(h_padding, w_padding)) - func = relay.Function([data, weight], relay.Tuple(tvm.convert([conv2d]))) + func = relay.Function([data, weight], relay.Tuple(tvm.runtime.convert([conv2d]))) func = run_opt_pass(func, transform.InferType()) compute_count = analysis.get_total_mac_number(func) expect_count = batch_size * input_channel * oh * ow * output_channel * kh * kw @@ -99,7 +100,7 @@ def test_simple_network(): weight_dense) func = relay.Function([data1, data2, weight_conv, weight_dense], - relay.Tuple(tvm.convert([conv2d_1, conv2d_2, + relay.Tuple(tvm.runtime.convert([conv2d_1, conv2d_2, dense_1, add, flattened]))) # alter the CONV 2D data layout to test func = run_opt_pass(func, transform.AlterOpLayout()) @@ -127,7 +128,7 @@ def test_depthwise_conv2d(): groups=64) add = relay.add(depthwise_conv2d_1, depthwise_conv2d_2) func = relay.Function([data1, data2, weight_conv], - relay.Tuple(tvm.convert([depthwise_conv2d_1, + relay.Tuple(tvm.runtime.convert([depthwise_conv2d_1, depthwise_conv2d_2, add]))) func = run_opt_pass(func, transform.InferType()) @@ -156,7 +157,7 @@ def test_conv_2d_transpose(): kernel_size=(kh, kw), padding=(h_padding, w_padding)) func = relay.Function([data, weight], - relay.Tuple(tvm.convert([conv2d_transpose]))) + relay.Tuple(tvm.runtime.convert([conv2d_transpose]))) func = run_opt_pass(func, transform.InferType()) compute_count = 
analysis.get_total_mac_number(func) expect_count = batch_size * input_channel * oh * ow * output_channel * kh * kw diff --git a/tests/python/relay/test_pass_manager.py b/tests/python/relay/test_pass_manager.py index a13e5e93ea9c..aed026996a21 100644 --- a/tests/python/relay/test_pass_manager.py +++ b/tests/python/relay/test_pass_manager.py @@ -19,6 +19,7 @@ import pytest import tvm +from tvm import te from tvm import relay from tvm.relay import ExprFunctor from tvm.relay import Function, Call diff --git a/tests/python/relay/test_pass_partial_eval.py b/tests/python/relay/test_pass_partial_eval.py index 2bec98c173d9..f54dd6bf69c5 100644 --- a/tests/python/relay/test_pass_partial_eval.py +++ b/tests/python/relay/test_pass_partial_eval.py @@ -17,6 +17,7 @@ import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import alpha_equal, assert_alpha_equal from tvm.relay.prelude import Prelude diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index 6f20278133d9..9c3228f4ff48 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -21,6 +21,7 @@ import pytest import tvm +from tvm import te import tvm.relay.testing import tvm.relay.transform as transform from tvm import relay diff --git a/tests/python/relay/test_pass_qnn_legalize.py b/tests/python/relay/test_pass_qnn_legalize.py index dee19f766605..7d3d9cc106c8 100644 --- a/tests/python/relay/test_pass_qnn_legalize.py +++ b/tests/python/relay/test_pass_qnn_legalize.py @@ -17,6 +17,7 @@ """Test legalize pass""" import numpy as np import tvm +from tvm import te from tvm import relay from tvm.contrib import graph_runtime diff --git a/tests/python/relay/test_pass_remove_unused_functions.py b/tests/python/relay/test_pass_remove_unused_functions.py index bacc3126c7c4..33816344f562 100644 --- a/tests/python/relay/test_pass_remove_unused_functions.py +++ b/tests/python/relay/test_pass_remove_unused_functions.py @@ -16,6 +16,7 @@ # under the License. import pytest import tvm +from tvm import te from tvm import relay from tvm.relay import transform from tvm.relay.prelude import Prelude diff --git a/tests/python/relay/test_pass_to_a_normal_form.py b/tests/python/relay/test_pass_to_a_normal_form.py index 46bde4f490b8..f68f64874c78 100644 --- a/tests/python/relay/test_pass_to_a_normal_form.py +++ b/tests/python/relay/test_pass_to_a_normal_form.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import alpha_equal, detect_feature from tvm.relay import op, create_executor, transform diff --git a/tests/python/relay/test_pass_to_cps.py b/tests/python/relay/test_pass_to_cps.py index 4645e20c7468..fe4959ed8ce3 100644 --- a/tests/python/relay/test_pass_to_cps.py +++ b/tests/python/relay/test_pass_to_cps.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import alpha_equal, detect_feature from tvm.relay.transform import to_cps, un_cps diff --git a/tests/python/relay/test_pass_to_graph_normal_form.py b/tests/python/relay/test_pass_to_graph_normal_form.py index 5c5221f65a46..dc47ad350fe5 100644 --- a/tests/python/relay/test_pass_to_graph_normal_form.py +++ b/tests/python/relay/test_pass_to_graph_normal_form.py @@ -16,6 +16,7 @@ # under the License. 
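Note: the MAC-counter hunks above wrap outputs in relay.Tuple(tvm.runtime.convert([...])). A small end-to-end sketch of the counter on a dense op (sizes made up; run_opt_pass is the helper these tests import):

import tvm
from tvm import relay
from tvm.relay import analysis, transform
from tvm.relay.testing import run_opt_pass

data = relay.var("data", shape=(4, 16))
weight = relay.var("weight", shape=(8, 16))
gemm = relay.nn.dense(data, weight)
func = relay.Function([data, weight], relay.Tuple(tvm.runtime.convert([gemm])))
func = run_opt_pass(func, transform.InferType())
assert analysis.get_total_mac_number(func) == 4 * 16 * 8  # n * k * m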
import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay import op, create_executor, transform, Feature from tvm.relay.analysis import detect_feature diff --git a/tests/python/relay/test_pass_unmatched_cases.py b/tests/python/relay/test_pass_unmatched_cases.py index 1ac99a69a249..42344bccabaa 100644 --- a/tests/python/relay/test_pass_unmatched_cases.py +++ b/tests/python/relay/test_pass_unmatched_cases.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.prelude import Prelude from tvm.relay.analysis import unmatched_cases diff --git a/tests/python/relay/test_pass_vars.py b/tests/python/relay/test_pass_vars.py index d8b77ba35612..1aad74b930e8 100644 --- a/tests/python/relay/test_pass_vars.py +++ b/tests/python/relay/test_pass_vars.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay.analysis import (free_vars, free_type_vars, bound_vars, bound_type_vars, diff --git a/tests/python/relay/test_py_converter.py b/tests/python/relay/test_py_converter.py index f489e9fcb04b..f6b1b2432d92 100644 --- a/tests/python/relay/test_py_converter.py +++ b/tests/python/relay/test_py_converter.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import relay from tvm.relay.testing import to_python, run_as_python from tvm.relay.prelude import Prelude diff --git a/tests/python/relay/test_type_functor.py b/tests/python/relay/test_type_functor.py index 854301bf714a..9e023bc6b1e4 100644 --- a/tests/python/relay/test_type_functor.py +++ b/tests/python/relay/test_type_functor.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay from tvm.relay import TypeFunctor, TypeMutator, TypeVisitor from tvm.relay.analysis import assert_graph_equal @@ -53,7 +54,7 @@ def test_tensor_type(): def test_func_type(): tv = TypeVar('tv') - tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32') + tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), 'float32') ft = FuncType([tt], tt, type_params=[tv]) check_visit(ft) diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py index 892c91d9c43a..74507baa1096 100644 --- a/tests/python/relay/test_type_infer.py +++ b/tests/python/relay/test_type_infer.py @@ -18,6 +18,7 @@ for expressions. """ import tvm +from tvm import te from tvm import relay from tvm.relay import op, transform, analysis from tvm.relay.analysis import assert_alpha_equal diff --git a/tests/python/relay/test_type_solver.py b/tests/python/relay/test_type_solver.py index 118066e7cf52..d90fd29a7eb5 100644 --- a/tests/python/relay/test_type_solver.py +++ b/tests/python/relay/test_type_solver.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import relay import pytest diff --git a/tests/python/relay/test_typecall.py b/tests/python/relay/test_typecall.py index fa2601f30af1..491047deb4c6 100644 --- a/tests/python/relay/test_typecall.py +++ b/tests/python/relay/test_typecall.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
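Note: the type-functor hunk above builds its TensorType via tvm.runtime.convert as well. A sketch of the round-trip property those tests check, assuming the default TypeMutator visit is identity-preserving and that == on relay types is alpha-equality (as the alpha-equal tests above rely on):

import tvm
from tvm import relay
from tvm.relay import TypeMutator

tt = relay.TensorType(tvm.runtime.convert([1, 2, 3]), "float32")
same = TypeMutator().visit(tt)   # default mutator rebuilds an equivalent type
assert same == tt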
import tvm +from tvm import te from tvm import relay from tvm.relay import transform diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 8cac656ee5a1..02f1e5b753f8 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -18,6 +18,7 @@ import pytest import tvm +from tvm import te from tvm import runtime from tvm import relay from tvm.relay.scope_builder import ScopeBuilder diff --git a/tests/python/relay/test_vm_serialization.py b/tests/python/relay/test_vm_serialization.py index 9fed4955705f..5d20651a8126 100644 --- a/tests/python/relay/test_vm_serialization.py +++ b/tests/python/relay/test_vm_serialization.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm.runtime import vm as _vm from tvm.relay import vm as rly_vm from tvm import relay diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py index 35822d240b04..b4649a4ba75e 100644 --- a/tests/python/unittest/test_arith_canonical_simplify.py +++ b/tests/python/unittest/test_arith_canonical_simplify.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te class CanonicalChecker: def __init__(self): @@ -22,26 +23,26 @@ def __init__(self): def verify(self, data, expected): res = self.analyzer.canonical_simplify(data) - assert tvm.ir_pass.Equal(res, expected), "\ndata={}\nres={}\nexpected={}".format(data, res, expected) + assert tvm.tir.ir_pass.Equal(res, expected), "\ndata={}\nres={}\nexpected={}".format(data, res, expected) def test_mul_sum_simplify(): ck = CanonicalChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify(2 + (3 * x + z + y + 1) * 4 + x, x * 13 + z * 4 + y * 4 +6) ck.verify(x * 3 - 4 * x + 1, 1 - x) ck.verify(y + x * 3 - 5 * x + 1 + y, y * 2 + 1 - x * 2) - tdiv = tvm.truncdiv - tmod = tvm.truncmod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # trucdiv ck.verify(tdiv(x + y + x + y * 3, 2), y * 2 + x) ck.verify(tmod(x + y + x + y * 3, 2), 0) # floordiv - fld = tvm.floordiv - flm = tvm.floormod + fld = tvm.te.floordiv + flm = tvm.te.floormod ck.verify(flm(x + x + y * 3, 2), flm(y * 3, 2)) ck.verify(fld(x + y + x + y * 3, 2), y * 2 + x) ck.verify(flm(x + y + x + y * 3, 2), 0) @@ -50,11 +51,11 @@ def test_mul_sum_simplify(): def test_split_index_simplify(): ck = CanonicalChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") # trucdiv - tdiv = tvm.truncdiv - tmod = tvm.truncmod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # split div const ck.verify(tdiv(x, 3) *3 + tmod(x, 3), x) @@ -80,8 +81,8 @@ def test_split_index_simplify(): ck.verify(tdiv(x * 4 + y, 2) * 2 + tmod(x * 4 + y, 2), x * 4 + y) # floordiv - fld = tvm.floordiv - flm = tvm.floormod + fld = tvm.te.floordiv + flm = tvm.te.floormod ck.verify(fld(x, 3) * 3 + flm(x, 3), x) ck.verify(fld(x, 6) * 6 + flm(fld(x, 3), 2) * 3 + flm(x, 3), x) ck.verify(fld(fld(flm(x, 16), 2) * 2, 4), fld(flm(x, 16), 4)) @@ -95,8 +96,8 @@ def test_split_index_simplify(): def test_div_simplify(): ck = CanonicalChecker() - x = tvm.var("x") - tdiv = tvm.truncdiv + x = te.var("x") + tdiv = tvm.tir.truncdiv # truc div ck.verify(tdiv(16+48*x,16), x*3 + 1) @@ -110,7 +111,7 @@ def test_div_simplify(): ck.verify(tdiv(17 + 47 * x, 16), tdiv(x * 47 + 17, 16)) # floordiv - fld = tvm.floordiv + fld = tvm.te.floordiv ck.analyzer.update(x, 
tvm.arith.ConstIntBound(-1000, 10000), True) ck.verify(fld(16+48*x, 16), x*3 + 1) ck.verify(fld(17+48*x, 16), x * 3 + 1) @@ -119,8 +120,8 @@ def test_div_simplify(): def test_floormod_simplify(): ck = CanonicalChecker() - flm = tvm.floormod - x, y = tvm.var("x"), tvm.var("y") + flm = tvm.te.floormod + x, y = te.var("x"), te.var("y") ck.verify(flm(flm((x*4) + y - 466036, 24528) - 24512, 16), flm((x*4) + y + 12, 16)) @@ -128,59 +129,59 @@ def test_floormod_simplify(): def test_canonical_mixed(): ck = CanonicalChecker() - x = tvm.var("x") - z = tvm.const(3, "int32") - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x = te.var("x") + z = tvm.tir.const(3, "int32") + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod ck.verify(tdiv(x, (z*z)) - tdiv(x, (z*z)), 0) ck.verify(tdiv(x, (z+z)) - tdiv(x, (z+z)), 0) ck.verify(x - 2 < 3, x < 5) - ck.verify(tvm.max(x, 1) - tvm.max(x, 1), 0) - ck.verify(tvm.min(x, 1) - tvm.min(x, 1), 0) + ck.verify(tvm.te.max(x, 1) - tvm.te.max(x, 1), 0) + ck.verify(tvm.te.min(x, 1) - tvm.te.min(x, 1), 0) ck.verify(x * x - x * x, 0) - fld = tvm.floordiv + fld = tvm.te.floordiv ck.verify(fld(x, (z*z)) - fld(x, (z*z)), 0) ck.verify(fld(x, (z+z)) - fld(x, (z+z)), 0) def test_reduce_combiner_simplify(): ck = CanonicalChecker() - dummy = tvm.var('dummy') - comm_reducer = tvm.comm_reducer - prod = comm_reducer(lambda x, y: x*y, lambda t0: tvm.const(1, t0)) + dummy = te.var('dummy') + comm_reducer = te.comm_reducer + prod = comm_reducer(lambda x, y: x*y, lambda t0: tvm.tir.const(1, t0)) sum_or_prod = comm_reducer( lambda x, y: tvm.tir.Select(dummy < 0, x + y, x*y), lambda t0: tvm.tir.Select(dummy < 0, - tvm.const(0, t0), tvm.const(1, t0))) + tvm.tir.const(0, t0), tvm.tir.const(1, t0))) sum_and_prod = comm_reducer( lambda x, y: (x[0] + y[0], x[1]*y[1]), - lambda t0, t1: (tvm.const(0, t0), - tvm.const(5, t0) - tvm.const(4, t0))) + lambda t0, t1: (tvm.tir.const(0, t0), + tvm.tir.const(5, t0) - tvm.tir.const(4, t0))) some_reducer1 = comm_reducer( lambda x, y: (x[0] + y[0], x[0] + y[0] + x[1] + y[1], x[0]*y[2] + y[0]*x[2], x[1] + y[2], 4.0), - lambda t0, t1, t2, t3, t4: (tvm.const(0, t0), - tvm.const(1, t1), - tvm.const(2, t2), - tvm.const(3, t3), - tvm.const(4, t4))) - - k = tvm.reduce_axis((0, 10), name="k") - A = tvm.placeholder((10,), name='A') + lambda t0, t1, t2, t3, t4: (tvm.tir.const(0, t0), + tvm.tir.const(1, t1), + tvm.tir.const(2, t2), + tvm.tir.const(3, t3), + tvm.tir.const(4, t4))) + + k = te.reduce_axis((0, 10), name="k") + A = te.placeholder((10,), name='A') # Test that SimplifyCombiner makes use of vranges ck.analyzer.update(dummy, tvm.arith.ConstIntBound(-10, -4)) - ck.verify(sum_or_prod(A[k], k), tvm.sum(A[k], k)) + ck.verify(sum_or_prod(A[k], k), te.sum(A[k], k)) ck.analyzer.update(dummy, tvm.arith.ConstIntBound(5, 9), True) ck.verify(sum_or_prod(A[k], k), prod(A[k], k)) ck.analyzer.update(dummy, tvm.arith.ConstIntBound(-10, 100), True) - ck.verify(sum_and_prod((A[k], A[10-k]), k)[0], tvm.sum(A[k], k)) + ck.verify(sum_and_prod((A[k], A[10-k]), k)[0], te.sum(A[k], k)) ck.verify(sum_and_prod((A[k], A[10-k]), k)[1], prod(A[10-k], k)) reference_simplified_sources = [[A[0]], @@ -196,72 +197,72 @@ def test_reduce_combiner_simplify(): # Check that the remaining components are the expected ones. 
for lhs, rhs in zip(simplified.source, reference_simplified_sources[j]): - assert tvm.ir_pass.Equal(lhs, rhs) + assert tvm.tir.ir_pass.Equal(lhs, rhs) # Test that components with side effects are not removed side_effect = lambda *xs: tvm.tir.Call("int32", "dummy", xs, tvm.tir.Call.Intrinsic, None, 0) ck.verify(sum_and_prod((A[k], side_effect(A[10-k])), k)[0], sum_and_prod((A[k], side_effect(A[10-k])), k)[0]) ck.verify(sum_and_prod((side_effect(A[k]), A[10-k]), k)[0], - tvm.sum(side_effect(A[k]), k)) + te.sum(side_effect(A[k]), k)) def test_reduce_simplify(): ck = CanonicalChecker() - k = tvm.reduce_axis((0, 10), name="k") - j = tvm.reduce_axis((-5, 3), name="j") - A = tvm.placeholder((10,), name='A') - ck.verify(tvm.sum(tvm.tir.Select(k + j < 12, k + j, 0), [k, j]), - tvm.sum(k + j, [k, j])) - ck.verify(tvm.sum(A[3], []), A[3]) + k = te.reduce_axis((0, 10), name="k") + j = te.reduce_axis((-5, 3), name="j") + A = te.placeholder((10,), name='A') + ck.verify(te.sum(tvm.tir.Select(k + j < 12, k + j, 0), [k, j]), + te.sum(k + j, [k, j])) + ck.verify(te.sum(A[3], []), A[3]) # The rule below is not typical, removed for now - ck.verify(tvm.sum(tvm.div(k, 10), k), tvm.sum(tvm.const(0, "int32"), k)) + ck.verify(te.sum(te.div(k, 10), k), te.sum(tvm.tir.const(0, "int32"), k)) def test_simplify_if_then_else(): ck = CanonicalChecker() - x = tvm.var("x") - y = tvm.var("y") - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x = te.var("x") + y = te.var("y") + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # simplification that takes condition into account. - res = tvm.if_then_else((x * 4 + y) >= 466036, - tvm.if_then_else(24512 <= tmod(((x*4) + y) - 466036, 24528), + res = tvm.tir.if_then_else((x * 4 + y) >= 466036, + tvm.tir.if_then_else(24512 <= tmod(((x*4) + y) - 466036, 24528), tmod(tmod(((x*4) + y) - 466036, 24528) -24512, 16), x), y) - res2 = tvm.if_then_else((x * 4) >= 466036 - y, - tvm.if_then_else(24512 <= tmod(((x*4) + y) - 466036, 24528), + res2 = tvm.tir.if_then_else((x * 4) >= 466036 - y, + tvm.tir.if_then_else(24512 <= tmod(((x*4) + y) - 466036, 24528), tmod(tmod(((x*4) + y) - 466036, 24528) -24512, 16), x), y) - expected = tvm.if_then_else( + expected = tvm.tir.if_then_else( tvm.tir.LE(466036, (x * 4 + y)), - tvm.if_then_else(tvm.tir.LE(24512, tmod(((x*4) + y) - 4, 24528)), + tvm.tir.if_then_else(tvm.tir.LE(24512, tmod(((x*4) + y) - 4, 24528)), tmod(((x*4) + y) - 4, 16), x), y) ck.verify(res, expected) ck.verify(res2, expected) # can only simplify if condition - res = tvm.tir.Select(tvm.all(x >= -1, y >= 0), tmod(x + y + 100, 3), tmod(x + 100, 3)) - expected = tvm.tir.Select(tvm.all(x >= -1, y >= 0), tmod(x + y + 1, 3), tmod(x + 100, 3)) + res = tvm.tir.Select(tvm.tir.all(x >= -1, y >= 0), tmod(x + y + 100, 3), tmod(x + 100, 3)) + expected = tvm.tir.Select(tvm.tir.all(x >= -1, y >= 0), tmod(x + y + 1, 3), tmod(x + 100, 3)) ck.verify(res, ck.analyzer.canonical_simplify(expected)) res = tvm.tir.Select(x >= 10, - tvm.if_then_else(tdiv(x, 3) > 2, x, 0), 0) + tvm.tir.if_then_else(tdiv(x, 3) > 2, x, 0), 0) expected = tvm.tir.Select(x >= 10, x, 0) ck.verify(res, ck.analyzer.canonical_simplify(expected)) res = tvm.tir.Select(x >= 10, - tvm.if_then_else(tdiv(x, 3) < 2, x, 0), 0) + tvm.tir.if_then_else(tdiv(x, 3) < 2, x, 0), 0) ck.verify(res, 0) def test_complex_cases(): ck = CanonicalChecker() - x = tvm.var("x") - y = tvm.var("y") - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x = te.var("x") + y = te.var("y") + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod res2 = (tdiv(tdiv(tmod(x*128 + y, 
1296),36)*2 + 1,2)*36 + tdiv(tmod((x*128) + y, 36)*2 + 1,2) - tmod((x*128) + y, 1296) + 1) diff --git a/tests/python/unittest/test_arith_const_int_bound.py b/tests/python/unittest/test_arith_const_int_bound.py index aba56ac6c0c5..4829b97c348e 100644 --- a/tests/python/unittest/test_arith_const_int_bound.py +++ b/tests/python/unittest/test_arith_const_int_bound.py @@ -15,21 +15,22 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_dtype_bound(): analyzer = tvm.arith.Analyzer() - x = tvm.var("x", dtype="int64") + x = te.var("x", dtype="int64") bd = analyzer.const_int_bound(x) assert bd.min_value == bd.NEG_INF assert bd.max_value == bd.POS_INF - x = tvm.var("x", dtype="int8") + x = te.var("x", dtype="int8") bd = analyzer.const_int_bound(x) assert bd.min_value == -128 assert bd.max_value == 127 - x = tvm.var("x", dtype="uint8") + x = te.var("x", dtype="uint8") bd = analyzer.const_int_bound(x) assert bd.min_value == 0 assert bd.max_value == 255 @@ -37,8 +38,8 @@ def test_dtype_bound(): def test_cast_bound(): analyzer = tvm.arith.Analyzer() - x = tvm.var("x", dtype="int8") - tmod = tvm.truncmod + x = te.var("x", dtype="int8") + tmod = tvm.tir.truncmod bd = analyzer.const_int_bound(tmod(x, 3).astype("uint32")) assert bd.min_value == 0 assert bd.max_value == 2 @@ -51,7 +52,7 @@ def test_cast_bound(): def test_add_sub_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x", "int64"), tvm.var("y", "int64") + x, y = te.var("x", "int64"), te.var("y", "int64") bd = analyzer.const_int_bound(x + y) assert bd.min_value == bd.NEG_INF assert bd.max_value == bd.POS_INF @@ -78,7 +79,7 @@ def test_add_sub_bound(): def test_mul_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") analyzer.update(x, tvm.arith.ConstIntBound(-2, 4)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) @@ -101,8 +102,8 @@ def test_mul_bound(): def test_truncdiv_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") - tdiv = tvm.truncdiv + x, y = te.var("x"), te.var("y") + tdiv = tvm.tir.truncdiv analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) @@ -124,9 +125,9 @@ def test_truncdiv_bound(): def test_truncmod_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") - tmod = tvm.truncmod + tmod = tvm.tir.truncmod analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) @@ -149,8 +150,8 @@ def test_truncmod_bound(): def test_floordiv_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") - fld = tvm.floordiv + x, y = te.var("x"), te.var("y") + fld = tvm.te.floordiv analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) bd = analyzer.const_int_bound(fld(x, y)) @@ -171,8 +172,8 @@ def test_floordiv_bound(): def test_floormod_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") - flm = tvm.floormod + x, y = te.var("x"), te.var("y") + flm = tvm.te.floormod analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) @@ -195,34 +196,34 @@ def test_floormod_bound(): def test_min_max_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) - bd = 
analyzer.const_int_bound(tvm.min(x, y)) + bd = analyzer.const_int_bound(tvm.te.min(x, y)) assert bd.min_value == -9 assert bd.max_value == 10 analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) - bd = analyzer.const_int_bound(tvm.min(x, y)) + bd = analyzer.const_int_bound(tvm.te.min(x, y)) assert bd.min_value == bd.NEG_INF assert bd.max_value == 10 - bd = analyzer.const_int_bound(tvm.max(x, y)) + bd = analyzer.const_int_bound(tvm.te.max(x, y)) assert bd.min_value == 4 assert bd.max_value == bd.POS_INF analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True) analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) - bd = analyzer.const_int_bound(tvm.max(x, y)) + bd = analyzer.const_int_bound(tvm.te.max(x, y)) assert bd.min_value == 4 assert bd.max_value == bd.POS_INF def test_select_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) @@ -235,7 +236,7 @@ def test_select_bound(): def test_shift_and_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) analyzer.update(y, tvm.arith.ConstIntBound(2, 10)) @@ -256,9 +257,9 @@ def test_shift_and_bound(): def test_mix_index_bound(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x, y = te.var("x"), te.var("y") + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod analyzer.update(x, tvm.arith.ConstIntBound(0, 24 - 1)) analyzer.update(y, tvm.arith.ConstIntBound(0, 3 - 1)) @@ -277,7 +278,7 @@ def test_mix_index_bound(): def test_size_var_bound(): analyzer = tvm.arith.Analyzer() - x = tvm.size_var("x") + x = te.size_var("x") bd = analyzer.const_int_bound(x) assert bd.min_value == 0 assert bd.max_value == bd.POS_INF diff --git a/tests/python/unittest/test_arith_deduce_bound.py b/tests/python/unittest/test_arith_deduce_bound.py index 5e08635cd53f..5baabd16c615 100644 --- a/tests/python/unittest/test_arith_deduce_bound.py +++ b/tests/python/unittest/test_arith_deduce_bound.py @@ -15,27 +15,28 @@ # specific language governing permissions and limitations # under the License. 
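Note: the const-int-bound hunks above migrate both the vars (te.var) and the arith ops (tvm.tir.truncdiv and friends). A minimal sketch of the Analyzer flow they exercise, with hypothetical bounds:

import tvm
from tvm import te

analyzer = tvm.arith.Analyzer()
x = te.var("x")
analyzer.update(x, tvm.arith.ConstIntBound(0, 10))
bd = analyzer.const_int_bound(tvm.tir.truncdiv(x, 2))
assert bd.min_value == 0 and bd.max_value == 5  # [0, 10] // 2 -> [0, 5]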
import tvm +from tvm import te def assert_expr_equal(a, b): - res = tvm.ir_pass.Simplify(a - b) + res = tvm.tir.ir_pass.Simplify(a - b) equal = isinstance(res, tvm.tir.IntImm) and res.value == 0 if not equal: raise ValueError("{} and {} are not equal".format(a, b)) def test_deduce(): - a = tvm.var('a') - b = tvm.var('b') - c = tvm.var('c') - d = tvm.var('d') + a = te.var('a') + b = te.var('b') + c = te.var('c') + d = te.var('d') b_s = tvm.arith.IntervalSet(2, 3) c_s = tvm.arith.IntervalSet(10, 15) d_s = tvm.arith.IntervalSet(-3, -1) - zero = tvm.const(0, "int32") + zero = tvm.tir.const(0, "int32") - fdiv = tvm.floordiv + fdiv = tvm.te.floordiv e0 = (-b)*a+c-d res0 = tvm.arith.deduce_bound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {}) @@ -68,13 +69,13 @@ def test_deduce(): assert_expr_equal(res1.max_value, ans1) - e2 = (tvm.max(5, a * 4) < 0) + e2 = (tvm.te.max(5, a * 4) < 0) res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" # expression containing variable a is on rhs - e2 = (zero < tvm.max(5, a * 4)) + e2 = (zero < tvm.te.max(5, a * 4)) res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {}) assert str(res2.max_value) == "neg_inf" assert str(res2.min_value) == "pos_inf" @@ -82,10 +83,10 @@ def test_deduce(): e3 = (-b)+a*c-d res3 = tvm.arith.deduce_bound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) ans3 = fdiv(2,c)+1 - assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) + assert str(tvm.tir.ir_pass.Simplify(res3.min_value)) == str(ans3) res3 = tvm.arith.deduce_bound(a, zero <= e3, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s}) - assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3) + assert str(tvm.tir.ir_pass.Simplify(res3.min_value)) == str(ans3) # tests for `EQ` op res4 = tvm.arith.deduce_bound(a, a == b, {}, {}) @@ -127,10 +128,10 @@ def test_deduce(): def test_check(): - a = tvm.var('a') - b = tvm.var('b') - c = tvm.var('c') - d = tvm.var('d') + a = te.var('a') + b = te.var('b') + c = te.var('c') + d = te.var('d') b_s = tvm.arith.IntervalSet(2, 3) c_s = tvm.arith.IntervalSet(5, 7) @@ -150,28 +151,28 @@ def test_check(): def test_deduce_basic(): def test_basic(a1, a2, coff): - a = tvm.var('a') - b = tvm.var('b') + a = te.var('a') + b = te.var('b') b_s = tvm.arith.IntervalSet(a1, a2) e0 = b + a*coff + 3 res1 = tvm.arith.deduce_bound(a, e0<17, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify((x * coff + 3 + y) < 17)).value == 1 + assert (tvm.tir.ir_pass.Simplify((x * coff + 3 + y) < 17)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.deduce_bound(a, tvm.const(17, "int32") < e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.tir.const(17, "int32") < e0, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify((x * coff + 3 + y) > 17)).value == 1 + assert (tvm.tir.ir_pass.Simplify((x * coff + 3 + y) > 17)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.deduce_bound(a, tvm.const(17, "int32")>= e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.tir.const(17, "int32")>= e0, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value == 1 + assert (tvm.tir.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value 
== 1 res1 = tvm.arith.deduce_bound(a, e0>=17, {b: b_s}, {b: b_s}) [x, y] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify((x * coff + 3 + y) >= 17)).value == 1 + assert (tvm.tir.ir_pass.Simplify((x * coff + 3 + y) >= 17)).value == 1 test_basic(0, 4, 4) test_basic(1, 5, 4) @@ -182,28 +183,28 @@ def test_basic(a1, a2, coff): def test_deduce_complex(): def test_complex(a1, a2, coff): - a = tvm.var('a') - b = tvm.var('b') + a = te.var('a') + b = te.var('b') b_s = tvm.arith.IntervalSet(a1, a2) e0 = (b*3 + a* coff) * 4 res1 = tvm.arith.deduce_bound(a, e0<63, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) < 63)).value == 1 + assert (tvm.tir.ir_pass.Simplify(((x*3 + t* coff) * 4) < 63)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.deduce_bound(a, tvm.const(63, "int32")>= e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.tir.const(63, "int32")>= e0, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff > 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) <= 63)).value == 1 + assert (tvm.tir.ir_pass.Simplify(((x*3 + t* coff) * 4) <= 63)).value == 1 res1 = tvm.arith.deduce_bound(a, e0>63, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) > 63)).value == 1 + assert (tvm.tir.ir_pass.Simplify(((x*3 + t* coff) * 4) > 63)).value == 1 # expression containing variable a is on rhs - res1 = tvm.arith.deduce_bound(a, tvm.const(63, "int32") <= e0, {b: b_s}, {b: b_s}) + res1 = tvm.arith.deduce_bound(a, tvm.tir.const(63, "int32") <= e0, {b: b_s}, {b: b_s}) [t, x] = [res1.max_value, b_s.max_value] if coff < 0 else [res1.min_value, b_s.min_value] - assert (tvm.ir_pass.Simplify(((x*3 + t* coff) * 4) >= 63)).value == 1 + assert (tvm.tir.ir_pass.Simplify(((x*3 + t* coff) * 4) >= 63)).value == 1 test_complex(0, 4, 4) test_complex(0, 4, -4) diff --git a/tests/python/unittest/test_arith_detect_clip_bound.py b/tests/python/unittest/test_arith_detect_clip_bound.py index 44ae24cb6815..d6953713f14b 100644 --- a/tests/python/unittest/test_arith_detect_clip_bound.py +++ b/tests/python/unittest/test_arith_detect_clip_bound.py @@ -15,22 +15,23 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te def test_basic(): - a = tvm.var("a") - b = tvm.var("b") - c = tvm.var("c") - m = tvm.arith.detect_clip_bound(tvm.all(a * 1 < b * 6, + a = te.var("a") + b = te.var("b") + c = te.var("c") + m = tvm.arith.detect_clip_bound(tvm.tir.all(a * 1 < b * 6, a - 1 > 0), [a]) - assert tvm.ir_pass.Simplify(m[1] - (b * 6 - 1)).value == 0 + assert tvm.tir.ir_pass.Simplify(m[1] - (b * 6 - 1)).value == 0 assert m[0].value == 2 - m = tvm.arith.detect_clip_bound(tvm.all(a * 1 < b * 6, + m = tvm.arith.detect_clip_bound(tvm.tir.all(a * 1 < b * 6, a - 1 > 0), [a, b]) assert len(m) == 0 - m = tvm.arith.detect_clip_bound(tvm.all(a + 10 * c <= 20, + m = tvm.arith.detect_clip_bound(tvm.tir.all(a + 10 * c <= 20, b - 1 > 0), [a, b]) - assert tvm.ir_pass.Simplify(m[1] - (20 - 10 * c)).value == 0 - assert tvm.ir_pass.Simplify(m[2] - 2).value == 0 + assert tvm.tir.ir_pass.Simplify(m[1] - (20 - 10 * c)).value == 0 + assert tvm.tir.ir_pass.Simplify(m[2] - 2).value == 0 if __name__ == "__main__": diff --git a/tests/python/unittest/test_arith_detect_linear_equation.py b/tests/python/unittest/test_arith_detect_linear_equation.py index 3b103026aec3..c6e6b753a692 100644 --- a/tests/python/unittest/test_arith_detect_linear_equation.py +++ b/tests/python/unittest/test_arith_detect_linear_equation.py @@ -15,20 +15,21 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_basic(): - a = tvm.var("a") - b = tvm.var("b") + a = te.var("a") + b = te.var("b") m = tvm.arith.detect_linear_equation(a * 4 + b * 6 + 7, [a]) assert m[0].value == 4 - assert tvm.ir_pass.Simplify(m[1] - (b * 6 + 7)).value == 0 + assert tvm.tir.ir_pass.Simplify(m[1] - (b * 6 + 7)).value == 0 m = tvm.arith.detect_linear_equation(a * 4 * (a+1) + b * 6 + 7, [a]) assert len(m) == 0 m = tvm.arith.detect_linear_equation(a * 4 + (a+1) + b * 6 + 7, [a]) assert m[0].value == 5 - assert tvm.ir_pass.Simplify(m[1] - (b * 6 + 7 + 1)).value == 0 + assert tvm.tir.ir_pass.Simplify(m[1] - (b * 6 + 7 + 1)).value == 0 m = tvm.arith.detect_linear_equation(a * b + 7, [a]) assert m[0] == b @@ -38,13 +39,13 @@ def test_basic(): m = tvm.arith.detect_linear_equation(b * 7, []) assert len(m) == 1 - assert tvm.ir_pass.Simplify(m[0] - b * 7).value == 0 + assert tvm.tir.ir_pass.Simplify(m[0] - b * 7).value == 0 def test_multivariate(): - v = [tvm.var("v%d" % i) for i in range(4)] - b = tvm.var("b") + v = [te.var("v%d" % i) for i in range(4)] + b = te.var("b") m = tvm.arith.detect_linear_equation(v[0] * (b + 4) + v[0] + v[1] * 8, v) - assert(tvm.ir_pass.Equal(tvm.ir_pass.Simplify(m[0]), b + 5)) + assert(tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.Simplify(m[0]), b + 5)) assert(m[1].value == 8) m = tvm.arith.detect_linear_equation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[2], v) @@ -60,11 +61,11 @@ def test_multivariate(): m = tvm.arith.detect_linear_equation((v[0] - v[1]), [v[2]]) assert(m[0].value == 0) - assert(tvm.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0) + assert(tvm.tir.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0) m = tvm.arith.detect_linear_equation((v[0] - v[1]), []) assert(len(m) == 1) - assert(tvm.ir_pass.Simplify(m[0] - (v[0] - v[1])).value == 0) + assert(tvm.tir.ir_pass.Simplify(m[0] - (v[0] - v[1])).value == 0) if __name__ == "__main__": test_basic() diff --git a/tests/python/unittest/test_arith_domain_touched.py b/tests/python/unittest/test_arith_domain_touched.py index 7876fb6c4d37..0d769aabf247 100644 --- a/tests/python/unittest/test_arith_domain_touched.py +++ 
b/tests/python/unittest/test_arith_domain_touched.py @@ -15,14 +15,15 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_domain_touched(): - i = tvm.var('i') - j = tvm.var('j') - n = tvm.convert(100) - m = tvm.var('m') - a = tvm.placeholder((n, m), name = 'a') - b = tvm.placeholder((n, m), name = 'b') + i = te.var('i') + j = te.var('j') + n = tvm.runtime.convert(100) + m = te.var('m') + a = te.placeholder((n, m), name = 'a') + b = te.placeholder((n, m), name = 'b') ir = tvm.tir.For( i, 0, n, 0, 0, tvm.tir.For(j, 0, m, 0, 0, diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py index dad2fa705b0f..8352d9cf22dd 100644 --- a/tests/python/unittest/test_arith_intset.py +++ b/tests/python/unittest/test_arith_intset.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te class IntSetChecker: @@ -27,7 +28,7 @@ def err_msg(): return "\ndata={}\ndmap={}\nres={}\nexpected={}".format(data, dmap, res, expected) def equal(x, y): res = self.analyzer.canonical_simplify(x - y) - return tvm.ir_pass.Equal(res, 0) + return tvm.tir.ir_pass.Equal(res, 0) assert equal(res.min_value, expected[0]), err_msg() assert equal(res.max_value, expected[1]), err_msg() @@ -52,7 +53,7 @@ def test_vector(): def test_add_sub(): ck = IntSetChecker() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") ck.verify(x + y, {x : tvm.arith.IntervalSet(0, 10)}, (y, 10 + y)) ck.verify(x + y, {x : tvm.arith.IntervalSet(0, 10), y : tvm.arith.IntervalSet(1, 11)}, @@ -63,9 +64,9 @@ def test_add_sub(): def test_mul_div(): ck = IntSetChecker() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") - tdiv = tvm.truncdiv + tdiv = tvm.tir.truncdiv ck.analyzer.update(y, tvm.arith.ConstIntBound(1, 100), override=True) ck.verify(x * y, {x : tvm.arith.IntervalSet(0, 10)}, (0, 10 * y)) ck.verify(x * 2, {x : tvm.arith.IntervalSet(1, 10)}, (2, 20)) @@ -74,35 +75,35 @@ def test_mul_div(): ck.verify(tdiv(x, y), {x : tvm.arith.IntervalSet(0, 10)}, (0, tdiv(10, y))) ck.verify(tdiv(x, 2), {x : tvm.arith.IntervalSet(1, 10)}, (0, 5)) - fld = tvm.floordiv + fld = tvm.te.floordiv ck.verify(fld(x, y), {x : tvm.arith.IntervalSet(0, 10)}, (0, fld(10, y))) ck.verify(fld(x, 2), {x : tvm.arith.IntervalSet(-1, 10)}, (-1, 5)) def test_mod(): ck = IntSetChecker() - x, y = tvm.var("x"), tvm.var("y") - tmod = tvm.truncmod + x, y = te.var("x"), te.var("y") + tmod = tvm.tir.truncmod ck.analyzer.update(y, tvm.arith.ConstIntBound(1, 100), override=True) ck.verify(tmod(x, y), {x : tvm.arith.IntervalSet(0, 10)}, (0, y - 1)) ck.verify(tmod(x, 10), {x : tvm.arith.IntervalSet(1, 10)}, (0, 9)) - flm = tvm.floormod + flm = tvm.te.floormod ck.verify(flm(x, 10), {x : tvm.arith.IntervalSet(-10, 10)}, (0, 9)) def test_max_min(): ck = IntSetChecker() - x, y = tvm.var("x"), tvm.var("y") - ck.verify(tvm.max(x, x + 1), {x : tvm.arith.IntervalSet(0, 10)}, (1, 11)) - ck.verify(tvm.min(x - 1, x + 1), {x : tvm.arith.IntervalSet(0, 10)}, (-1, 9)) - ck.verify(tvm.min(x, y), {}, (tvm.min(x, y), tvm.min(x, y))) - ck.verify(tvm.max(x, y), {}, (tvm.max(x, y), tvm.max(x, y))) + x, y = te.var("x"), te.var("y") + ck.verify(tvm.te.max(x, x + 1), {x : tvm.arith.IntervalSet(0, 10)}, (1, 11)) + ck.verify(tvm.te.min(x - 1, x + 1), {x : tvm.arith.IntervalSet(0, 10)}, (-1, 9)) + ck.verify(tvm.te.min(x, y), {}, (tvm.te.min(x, y), tvm.te.min(x, y))) + ck.verify(tvm.te.max(x, y), {}, 
(tvm.te.max(x, y), tvm.te.max(x, y))) def test_select(): ck = IntSetChecker() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") ck.verify(tvm.tir.Select(x > 0, x - 1, x + 1), {x : tvm.arith.IntervalSet(0, 10)}, (-1, 11)) diff --git a/tests/python/unittest/test_arith_modular_set.py b/tests/python/unittest/test_arith_modular_set.py index 6bb86e4c4717..01180d2efb69 100644 --- a/tests/python/unittest/test_arith_modular_set.py +++ b/tests/python/unittest/test_arith_modular_set.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_cast(): analyzer = tvm.arith.Analyzer() - x = tvm.var("x", dtype="int8") + x = te.var("x", dtype="int8") m = analyzer.modular_set((x * 3).astype("uint32")) assert m.coeff == 3 assert m.base == 0 @@ -31,7 +32,7 @@ def test_cast(): def test_add_sub(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x", "int64"), tvm.var("y", "int64") + x, y = te.var("x", "int64"), te.var("y", "int64") m = analyzer.modular_set(x * 6 + y * 4) assert m.coeff == 2 assert m.base == 0 @@ -44,7 +45,7 @@ def test_add_sub(): def test_mul(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") m = analyzer.modular_set((x * 4 + 2) * (y * 6 + 1)) assert m.coeff == 4 assert m.base == 2 @@ -52,9 +53,9 @@ def test_mul(): def test_div_shift(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") # not sure if x is non-negative - tdiv = tvm.truncdiv + tdiv = tvm.tir.truncdiv m = analyzer.modular_set(tdiv(x * 4 + 2, 2)) assert m.coeff == 1 assert m.base == 0 @@ -62,7 +63,7 @@ def test_div_shift(): m = analyzer.modular_set((x * 4 + 2) >> 1) assert m.coeff == 2 assert m.base == 1 - fld = tvm.floordiv + fld = tvm.te.floordiv m = analyzer.modular_set(fld(x * 4 + 2, 2)) assert m.coeff == 2 assert m.base == 1 @@ -75,12 +76,12 @@ def test_div_shift(): def test_min_max_select(): analyzer = tvm.arith.Analyzer() - x, y = tvm.var("x"), tvm.var("y") - m = analyzer.modular_set(tvm.min(x * 3, y * 9)) + x, y = te.var("x"), te.var("y") + m = analyzer.modular_set(tvm.te.min(x * 3, y * 9)) assert m.coeff == 3 assert m.base == 0 - m = analyzer.modular_set(tvm.max(x * 3 + 1, y * 9 + 4)) + m = analyzer.modular_set(tvm.te.max(x * 3 + 1, y * 9 + 4)) assert m.coeff == 3 assert m.base == 1 @@ -90,10 +91,10 @@ def test_min_max_select(): def test_mix_index(): - a = tvm.var("a") - b = tvm.var("b") + a = te.var("a") + b = te.var("b") analyzer = tvm.arith.Analyzer() - tdiv = tvm.truncdiv + tdiv = tvm.tir.truncdiv m = analyzer.modular_set(a * 4 + b * 6 + 7) assert m.coeff == 2 assert m.base == 1 @@ -114,16 +115,16 @@ def test_mix_index(): assert m.coeff == 3 assert m.base == 2 - m = analyzer.modular_set(a * 12 + tvm.min(b * 3 * 7, 2)) + m = analyzer.modular_set(a * 12 + tvm.te.min(b * 3 * 7, 2)) assert m.coeff == 1 assert m.base == 0 def test_constraint_scope(): - a = tvm.var("a") - b = tvm.var("b") + a = te.var("a") + b = te.var("b") analyzer = tvm.arith.Analyzer() - tmod = tvm.truncmod + tmod = tvm.tir.truncmod with analyzer.constraint_scope(tmod(b, 4) == 2): m = analyzer.modular_set(b + 1) @@ -142,9 +143,9 @@ def test_constraint_scope(): assert m.base == 0 def test_intersect(): - a = tvm.var("a") + a = te.var("a") analyzer = tvm.arith.Analyzer() - tmod = tvm.truncmod + tmod = tvm.tir.truncmod with analyzer.constraint_scope(tmod(a, 4) == 1): with analyzer.constraint_scope(tmod(a, 3) == 1): m = analyzer.modular_set(a) diff --git 
a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index 84560e8c1f9d..c8c3b0bd9a3b 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te class RewriteChecker: def __init__(self): @@ -22,12 +23,12 @@ def __init__(self): def verify(self, data, expected): res = self.analyzer.rewrite_simplify(data) - assert tvm.ir_pass.Equal(res, expected), "data={}, res={}, expected={}".format(data, res, expected) + assert tvm.tir.ir_pass.Equal(res, expected), "data={}, res={}, expected={}".format(data, res, expected) def test_vector_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") # Add rules ck.verify(tvm.tir.Ramp(x, 1, 4) + tvm.tir.Ramp(y, 2, 4), tvm.tir.Ramp(x + y, 3, 4)) @@ -56,8 +57,8 @@ def test_vector_simplify(): tvm.tir.Ramp(x * 2, 8, 4)) ## DivMod rules - tdiv = tvm.truncdiv - tmod = tvm.truncmod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # truc div ck.verify(tdiv(y.astype("int32x2"), x.astype("int32x2")), tdiv(y, x).astype("int32x2")) @@ -78,8 +79,8 @@ def test_vector_simplify(): tmod(tvm.tir.Ramp(1, 15, 4), 8)) # floor div - fld = tvm.floordiv - flm = tvm.floormod + fld = tvm.te.floordiv + flm = tvm.te.floormod ck.analyzer.update(x, tvm.arith.ConstIntBound(-10, 1000), override=True) ck.verify(fld(y.astype("int32x2"), x.astype("int32x2")), fld(y, x).astype("int32x2")) @@ -99,16 +100,16 @@ def test_vector_simplify(): flm(tvm.tir.Ramp(1, 15, 4), 8)) # Min/Max rules - vx = tvm.var("vx", dtype="int32x2") - vc = tvm.var("vc", dtype="uint1") - ck.verify(tvm.min(y.astype("int32x2"), x.astype("int32x2")), - tvm.min(y, x).astype("int32x2")) - ck.verify(tvm.min(tvm.min(vx, y.astype("int32x2")), x.astype("int32x2")), - tvm.min(vx, tvm.min(y, x).astype("int32x2"))) - ck.verify(tvm.max(y.astype("int32x2"), x.astype("int32x2")), - tvm.max(y, x).astype("int32x2")) - ck.verify(tvm.max(tvm.max(vx, y.astype("int32x2")), x.astype("int32x2")), - tvm.max(vx, tvm.max(y, x).astype("int32x2"))) + vx = te.var("vx", dtype="int32x2") + vc = te.var("vc", dtype="uint1") + ck.verify(tvm.te.min(y.astype("int32x2"), x.astype("int32x2")), + tvm.te.min(y, x).astype("int32x2")) + ck.verify(tvm.te.min(tvm.te.min(vx, y.astype("int32x2")), x.astype("int32x2")), + tvm.te.min(vx, tvm.te.min(y, x).astype("int32x2"))) + ck.verify(tvm.te.max(y.astype("int32x2"), x.astype("int32x2")), + tvm.te.max(y, x).astype("int32x2")) + ck.verify(tvm.te.max(tvm.te.max(vx, y.astype("int32x2")), x.astype("int32x2")), + tvm.te.max(vx, tvm.te.max(y, x).astype("int32x2"))) ## Logical rules ck.verify(y.astype("int32x2").equal(x.astype("int32x2")), @@ -131,7 +132,7 @@ def test_vector_simplify(): def test_select_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") # Add rules ck.verify(tvm.tir.Select(x < 0, y, 0) + tvm.tir.Select(x < 0, 1, z), tvm.tir.Select(x < 0, y + 1, z)) @@ -141,10 +142,10 @@ def test_select_simplify(): tvm.tir.Select(x < 0, 0, z - y)) ck.verify(tvm.tir.Select(x < 0, y, z) - z, tvm.tir.Select(x < 0, y - z, 0)) - ck.verify(tvm.min(tvm.tir.Select(x < 0, y, 0), tvm.tir.Select(x < 0, 1, z)), - tvm.tir.Select(x < 0, tvm.min(y, 1), tvm.min(0, z))) - ck.verify(tvm.max(tvm.tir.Select(x < 0, y, 0), tvm.tir.Select(x < 
0, 1, z)), - tvm.tir.Select(x < 0, tvm.max(y, 1), tvm.max(0, z))) + ck.verify(tvm.te.min(tvm.tir.Select(x < 0, y, 0), tvm.tir.Select(x < 0, 1, z)), + tvm.tir.Select(x < 0, tvm.te.min(y, 1), tvm.te.min(0, z))) + ck.verify(tvm.te.max(tvm.tir.Select(x < 0, y, 0), tvm.tir.Select(x < 0, 1, z)), + tvm.tir.Select(x < 0, tvm.te.max(y, 1), tvm.te.max(0, z))) ck.verify(tvm.tir.Select(x * 3 + 1 != 0, y, z), y) ck.verify(tvm.tir.Select(x * 3 + 1 == 0, y, z), z) @@ -153,30 +154,30 @@ def test_select_simplify(): def test_add_index_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify(x + (y - x), y) ck.verify(x - (y + 1) + (y + 1), x) ck.verify((x - 10) + (10 - z), x - z) ck.verify((x - y) + (z - x), z - y) - ck.verify(tvm.min(x, y - z) + z, tvm.min(x + z, y)) - ck.verify(tvm.min(x - z, y) + z, tvm.min(x, y + z)) - ck.verify(tvm.max(x, y - 10) + 10, tvm.max(x + 10, y)) - ck.verify(tvm.max(x - 11, y) + 11, tvm.max(x, y + 11)) + ck.verify(tvm.te.min(x, y - z) + z, tvm.te.min(x + z, y)) + ck.verify(tvm.te.min(x - z, y) + z, tvm.te.min(x, y + z)) + ck.verify(tvm.te.max(x, y - 10) + 10, tvm.te.max(x + 10, y)) + ck.verify(tvm.te.max(x - 11, y) + 11, tvm.te.max(x, y + 11)) - ck.verify(tvm.max(x, y * 2) + tvm.min(x, y * 2), x + y * 2); - ck.verify(tvm.min(x, y * 2) + tvm.max(x, y * 2), x + y * 2); + ck.verify(tvm.te.max(x, y * 2) + tvm.te.min(x, y * 2), x + y * 2); + ck.verify(tvm.te.min(x, y * 2) + tvm.te.max(x, y * 2), x + y * 2); - ck.verify(tvm.max(x, y + 2) + (-2), tvm.max(x + (-2), y)); - ck.verify(tvm.min(x, y + 2) + (-2), tvm.min(x + (-2), y)); - ck.verify(tvm.min(x + 2, y + 3) + (-2), tvm.min(x, y + 1)); + ck.verify(tvm.te.max(x, y + 2) + (-2), tvm.te.max(x + (-2), y)); + ck.verify(tvm.te.min(x, y + 2) + (-2), tvm.te.min(x + (-2), y)); + ck.verify(tvm.te.min(x + 2, y + 3) + (-2), tvm.te.min(x, y + 1)); - ck.verify(tvm.max(0, 1 - x * 4) + x * 4, tvm.max(x * 4, 1)) - ck.verify(tvm.max(2 - x * 4, 0) + x * 4, tvm.max(x * 4, 2)) + ck.verify(tvm.te.max(0, 1 - x * 4) + x * 4, tvm.te.max(x * 4, 1)) + ck.verify(tvm.te.max(2 - x * 4, 0) + x * 4, tvm.te.max(x * 4, 2)) - ck.verify(tvm.min(0, 1 - x * 4) + x * 4, tvm.min(x * 4, 1)) - ck.verify(tvm.min(2 - x * 4, 0) + x * 4, tvm.min(x * 4, 2)) + ck.verify(tvm.te.min(0, 1 - x * 4) + x * 4, tvm.te.min(x * 4, 1)) + ck.verify(tvm.te.min(2 - x * 4, 0) + x * 4, tvm.te.min(x * 4, 2)) ck.verify(x * y + x * 10, x * (y + 10)) ck.verify(y * x + x * 10, x * (y + 10)) @@ -189,16 +190,16 @@ def test_add_index_simplify(): ck.verify(x + 2 + 3 + 4 + x * 3, x * 4 + 9); # DivMod rules - tdiv = tvm.truncdiv - tmod = tvm.truncmod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # truc div ck.verify(y * tmod(x, 8) + 10 * tmod(x, 8), tmod(x, 8) * (y + 10)) ck.analyzer.update(x, tvm.arith.ConstIntBound(-1, 1000), override=True) ck.verify(tdiv(x, 8) * 8 + tmod(x, 8), x) # floor div - fld = tvm.floordiv - flm = tvm.floormod + fld = tvm.te.floordiv + flm = tvm.te.floormod ck.verify(y * flm(x, 8) + 10 * flm(x, 8), flm(x, 8) * (y + 10)) ck.verify(fld(x, 8) * 8 + flm(x, 8), x) @@ -206,22 +207,22 @@ def test_add_index_simplify(): def test_sub_index_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify(x + y - y, x) ck.verify(x + y - x, y) ck.verify(x - (y + x), 0 - y) ck.verify(x - (x + y), 0 - y) - ck.verify(tvm.min(x, y) - x, tvm.min(0, y - x)) - ck.verify(tvm.min(x, y) - y, tvm.min(x - y, 0)) - 
ck.verify(tvm.max(x, y) - x, tvm.max(0, y - x)) - ck.verify(tvm.max(x, y) - y, tvm.max(x - y, 0)) + ck.verify(tvm.te.min(x, y) - x, tvm.te.min(0, y - x)) + ck.verify(tvm.te.min(x, y) - y, tvm.te.min(x - y, 0)) + ck.verify(tvm.te.max(x, y) - x, tvm.te.max(0, y - x)) + ck.verify(tvm.te.max(x, y) - y, tvm.te.max(x - y, 0)) - ck.verify(x - tvm.min(x, y), tvm.max(0, x - y)) - ck.verify(y - tvm.min(x, y), tvm.max(y - x, 0)) - ck.verify(x - tvm.max(x, y), tvm.min(0, x - y)) - ck.verify(y - tvm.max(x, y), tvm.min(y - x, 0)) + ck.verify(x - tvm.te.min(x, y), tvm.te.max(0, x - y)) + ck.verify(y - tvm.te.min(x, y), tvm.te.max(y - x, 0)) + ck.verify(x - tvm.te.max(x, y), tvm.te.min(0, x - y)) + ck.verify(y - tvm.te.max(x, y), tvm.te.min(y - x, 0)) # mul co-efficient foldng ck.verify(x - x, 0) @@ -238,30 +239,30 @@ def test_sub_index_simplify(): ck.verify((x + y) - (z + x), y - z) ck.verify((y + x) - (z + x), y - z) - ck.verify(tvm.min(x + y, z) - x, tvm.min(y, z - x)) - ck.verify(tvm.min(y + x, z) - x, tvm.min(y, z - x)) - ck.verify(tvm.min(z, x + y) - x, tvm.min(z - x, y)) - ck.verify(tvm.min(z, y + x) - x, tvm.min(z - x, y)) + ck.verify(tvm.te.min(x + y, z) - x, tvm.te.min(y, z - x)) + ck.verify(tvm.te.min(y + x, z) - x, tvm.te.min(y, z - x)) + ck.verify(tvm.te.min(z, x + y) - x, tvm.te.min(z - x, y)) + ck.verify(tvm.te.min(z, y + x) - x, tvm.te.min(z - x, y)) - ck.verify(tvm.max(x + y, z) - x, tvm.max(y, z - x)) - ck.verify(tvm.max(y + x, z) - x, tvm.max(y, z - x)) - ck.verify(tvm.max(z, x + y) - x, tvm.max(z - x, y)) - ck.verify(tvm.max(z, y + x) - x, tvm.max(z - x, y)) + ck.verify(tvm.te.max(x + y, z) - x, tvm.te.max(y, z - x)) + ck.verify(tvm.te.max(y + x, z) - x, tvm.te.max(y, z - x)) + ck.verify(tvm.te.max(z, x + y) - x, tvm.te.max(z - x, y)) + ck.verify(tvm.te.max(z, y + x) - x, tvm.te.max(z - x, y)) - ck.verify(x - tvm.min(x + y, z), tvm.max(0 - y, x - z)) - ck.verify(x - tvm.min(y + x, z), tvm.max(0 - y, x - z)) - ck.verify(x - tvm.min(z, x + y), tvm.max(x - z, 0 - y)) - ck.verify(x - tvm.min(z, y + x), tvm.max(x - z, 0 - y)) + ck.verify(x - tvm.te.min(x + y, z), tvm.te.max(0 - y, x - z)) + ck.verify(x - tvm.te.min(y + x, z), tvm.te.max(0 - y, x - z)) + ck.verify(x - tvm.te.min(z, x + y), tvm.te.max(x - z, 0 - y)) + ck.verify(x - tvm.te.min(z, y + x), tvm.te.max(x - z, 0 - y)) - ck.verify(tvm.min(x, y) - tvm.min(y, x), 0) - ck.verify(tvm.max(x, y) - tvm.max(y, x), 0) - ck.verify(tvm.min(x, y) - tvm.min(x + 10, y + 10), -10) - ck.verify(tvm.min(x + 10, y + 1) - tvm.min(x, y - 9), 10) + ck.verify(tvm.te.min(x, y) - tvm.te.min(y, x), 0) + ck.verify(tvm.te.max(x, y) - tvm.te.max(y, x), 0) + ck.verify(tvm.te.min(x, y) - tvm.te.min(x + 10, y + 10), -10) + ck.verify(tvm.te.min(x + 10, y + 1) - tvm.te.min(x, y - 9), 10) # DivMod patterns # truc div - tdiv = tvm.truncdiv - tmod = tvm.truncmod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) ck.verify(x - tdiv(x, 3) * 3, tmod(x, 3)) @@ -289,8 +290,8 @@ def test_sub_index_simplify(): ck.verify(tdiv(y - z, 3) * 6 - 2 * y, (0 - tmod(y - z, 3) - z) * 2) # floor div - fld = tvm.floordiv - flm = tvm.floormod + fld = tvm.te.floordiv + flm = tvm.te.floormod ck.analyzer.update(x, tvm.arith.ConstIntBound(-1000, 1000), override=True) ck.analyzer.update(y, tvm.arith.ConstIntBound(-1000, 1000), override=True) ck.verify(x - fld(x, 3) * 3, flm(x, 3)) @@ -318,19 +319,19 @@ def test_sub_index_simplify(): def test_mul_index_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), 
tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify((x + 2) * 3, x * 3 + 6) ck.verify((x * 2) * 3, x * 6) - ck.verify(tvm.min(x, y) * tvm.max(x, y), x * y) - ck.verify(tvm.max(x, y) * tvm.min(x, y), x * y) + ck.verify(tvm.te.min(x, y) * tvm.te.max(x, y), x * y) + ck.verify(tvm.te.max(x, y) * tvm.te.min(x, y), x * y) ck.verify((x - y) * (-2), (y - x) * 2) def test_div_index_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x, y, z = te.var("x"), te.var("y"), te.var("z") + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod ck.verify(tdiv(x, x), 1) ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) @@ -343,12 +344,12 @@ def test_div_index_simplify(): ck.verify(tdiv(x * 4, 2), x * 2) ck.verify(tdiv(x * 4 + y, 2), x * 2 + tdiv(y, 2)) - ck.verify(tdiv(tvm.min(x * 6, y), 2), tvm.min(x * 3, tdiv(y, 2))) - ck.verify(tdiv(tvm.max(x * 6, y), 2), tvm.max(x * 3, tdiv(y, 2))) + ck.verify(tdiv(tvm.te.min(x * 6, y), 2), tvm.te.min(x * 3, tdiv(y, 2))) + ck.verify(tdiv(tvm.te.max(x * 6, y), 2), tvm.te.max(x * 3, tdiv(y, 2))) ck.verify(tdiv(y + x * 4, 2), tdiv(y, 2) + x * 2) - ck.verify(tdiv(tvm.min(y, x * 6), 2), tvm.min(tdiv(y, 2), x * 3)) - ck.verify(tdiv(tvm.max(y, x * 6), 2), tvm.max(tdiv(y, 2), x * 3)) + ck.verify(tdiv(tvm.te.min(y, x * 6), 2), tvm.te.min(tdiv(y, 2), x * 3)) + ck.verify(tdiv(tvm.te.max(y, x * 6), 2), tvm.te.max(tdiv(y, 2), x * 3)) # 3-operands ck.verify(tdiv(x * 6 + y + z, 2), x * 3 + tdiv(y + z, 2)) @@ -375,9 +376,9 @@ def test_div_index_simplify(): def test_floordiv_index_simplify(): # short name for floordiv - fld = tvm.floordiv + fld = tvm.te.floordiv ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify(fld(fld(x, 2), 3), fld(x, 6)) ck.verify(fld(fld(x, 2) + 1, 3), fld(x + 2, 6)) @@ -386,12 +387,12 @@ def test_floordiv_index_simplify(): ck.verify(fld(x * 4, 2), x * 2) ck.verify(fld(x * 4 + y, 2), x * 2 + fld(y, 2)) - ck.verify(fld(tvm.min(x * 6, y), 2), tvm.min(x * 3, fld(y, 2))) - ck.verify(fld(tvm.max(x * 6, y), 2), tvm.max(x * 3, fld(y, 2))) + ck.verify(fld(tvm.te.min(x * 6, y), 2), tvm.te.min(x * 3, fld(y, 2))) + ck.verify(fld(tvm.te.max(x * 6, y), 2), tvm.te.max(x * 3, fld(y, 2))) ck.verify(fld(y + x * 4, 2), fld(y, 2) + x * 2) - ck.verify(fld(tvm.min(y, x * 6), 2), tvm.min(fld(y, 2), x * 3)) - ck.verify(fld(tvm.max(y, x * 6), 2), tvm.max(fld(y, 2), x * 3)) + ck.verify(fld(tvm.te.min(y, x * 6), 2), tvm.te.min(fld(y, 2), x * 3)) + ck.verify(fld(tvm.te.max(y, x * 6), 2), tvm.te.max(fld(y, 2), x * 3)) # 3-operands ck.verify(fld(x * 6 + y + z, 2), x * 3 + fld(y + z, 2)) @@ -420,13 +421,13 @@ def test_floordiv_index_simplify(): def test_mod_index_simplify(): ck = RewriteChecker() - x, y, nx, ny, z = tvm.var("x"), tvm.var("y"), tvm.var("nx"), tvm.var("ny"), tvm.var("z") + x, y, nx, ny, z = te.var("x"), te.var("y"), te.var("nx"), te.var("ny"), te.var("z") ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 1000), override=True) ck.analyzer.update(nx, tvm.arith.ConstIntBound(-1000, 0), override=True) ck.analyzer.update(ny, tvm.arith.ConstIntBound(-1000, 0), override=True) - tdiv = tvm.truncdiv - tmod = tvm.truncmod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod ck.verify(tmod(x * 10, 2), 0) ck.verify(tmod(x * 10 + y, 2), tmod(y, 2)) @@ -456,11 +457,11 @@ def test_mod_index_simplify(): def 
test_floormod_index_simplify(): # short name for floordiv - flm = tvm.floormod + flm = tvm.te.floormod ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck = RewriteChecker() - x, y, nx, ny, z = tvm.var("x"), tvm.var("y"), tvm.var("nx"), tvm.var("ny"), tvm.var("z") + x, y, nx, ny, z = te.var("x"), te.var("y"), te.var("nx"), te.var("ny"), te.var("z") ck.verify(flm(x * 10, 2), 0) ck.verify(flm(x * 10 + y, 2), flm(y, 2)) @@ -475,172 +476,172 @@ def test_floormod_index_simplify(): def test_min_index_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") - fld = tvm.floordiv - flm = tvm.floormod - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x, y, z = te.var("x"), te.var("y"), te.var("z") + fld = tvm.te.floordiv + flm = tvm.te.floormod + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # const int bound - ck.verify(tvm.min(tmod(x, 2), tmod(y, 2) + 10), tmod(x, 2)) - ck.verify(tvm.min(flm(x, 2), flm(y, 2) + 10), flm(x, 2)) - - ck.verify(tvm.min(x + 1, x + 10), x + 1) - ck.verify(tvm.min(x + 111, x + 10), x + 10) - ck.verify(tvm.min(x + 1, x), x) - ck.verify(tvm.min(x, x + 2), x) - ck.verify(tvm.min(1 - x, 2 - x), 1 - x) - ck.verify(tvm.min(3 - x, 2 - x), 2 - x) - - ck.verify(tvm.min(tvm.max(x, y), tvm.min(x, y)), tvm.min(x, y)) - ck.verify(tvm.min(tvm.max(x, y), tvm.min(y, x)), tvm.min(x, y)) - - ck.verify(tvm.min(tvm.max(x, y), x), x) - ck.verify(tvm.min(tvm.max(y, x), x), x) - ck.verify(tvm.min(tvm.min(x, y), x), tvm.min(x, y)) - ck.verify(tvm.min(tvm.min(x, y), y), tvm.min(x, y)) - - ck.verify(tvm.min(x, tvm.max(x, y)), x) - ck.verify(tvm.min(x, tvm.max(y, x)), x) - ck.verify(tvm.min(x, tvm.min(x, y)), tvm.min(x, y)) - ck.verify(tvm.min(y, tvm.min(x, y)), tvm.min(x, y)) - - ck.verify(tvm.min(tvm.min(tvm.min(x, y), z), y), - tvm.min(tvm.min(x, y), z)) - ck.verify(tvm.min(tvm.min(tvm.min(tvm.min(x, y), z), x * 2), y), - tvm.min(tvm.min(tvm.min(x, y), z), x * 2)) - ck.verify(tvm.min(tvm.min(tvm.min(tvm.min(tvm.min(x, y), z), x * 2), z * 2), y), - tvm.min(tvm.min(tvm.min(tvm.min(x, y), z), x * 2), z * 2)) - - ck.verify(tvm.min(tvm.max(x, y), tvm.max(x, z)), tvm.max(tvm.min(y, z), x)) - ck.verify(tvm.min(tvm.max(x, y), tvm.max(z, x)), tvm.max(tvm.min(y, z), x)) - ck.verify(tvm.min(tvm.max(y, x), tvm.max(x, z)), tvm.max(tvm.min(y, z), x)) - ck.verify(tvm.min(tvm.max(y, x), tvm.max(z, x)), tvm.max(tvm.min(y, z), x)) - - ck.verify(tvm.min(y + x, z + x), tvm.min(y, z) + x) - ck.verify(tvm.min(y + x, x + z), tvm.min(y, z) + x) - ck.verify(tvm.min(x + y, z + x), tvm.min(y, z) + x) - ck.verify(tvm.min(x + y, x + z), tvm.min(y, z) + x) - - ck.verify(tvm.min(x - y, x - z), x - tvm.max(y, z)) - ck.verify(tvm.min(y - x, z - x), tvm.min(y, z) - x) - - ck.verify(tvm.min(tvm.min(x, 1), 10), tvm.min(x, 1)) - ck.verify(tvm.min(tvm.min(x, 11), 10), tvm.min(x, 10)) - - ck.verify(tvm.min(x * 3, 9), tvm.min(x, 3) * 3) - ck.verify(tvm.min(3 - x, 2), 3 - tvm.max(x, 1)) + ck.verify(tvm.te.min(tmod(x, 2), tmod(y, 2) + 10), tmod(x, 2)) + ck.verify(tvm.te.min(flm(x, 2), flm(y, 2) + 10), flm(x, 2)) + + ck.verify(tvm.te.min(x + 1, x + 10), x + 1) + ck.verify(tvm.te.min(x + 111, x + 10), x + 10) + ck.verify(tvm.te.min(x + 1, x), x) + ck.verify(tvm.te.min(x, x + 2), x) + ck.verify(tvm.te.min(1 - x, 2 - x), 1 - x) + ck.verify(tvm.te.min(3 - x, 2 - x), 2 - x) + + ck.verify(tvm.te.min(tvm.te.max(x, y), tvm.te.min(x, y)), tvm.te.min(x, y)) + ck.verify(tvm.te.min(tvm.te.max(x, y), tvm.te.min(y, x)), tvm.te.min(x, y)) + + 
ck.verify(tvm.te.min(tvm.te.max(x, y), x), x) + ck.verify(tvm.te.min(tvm.te.max(y, x), x), x) + ck.verify(tvm.te.min(tvm.te.min(x, y), x), tvm.te.min(x, y)) + ck.verify(tvm.te.min(tvm.te.min(x, y), y), tvm.te.min(x, y)) + + ck.verify(tvm.te.min(x, tvm.te.max(x, y)), x) + ck.verify(tvm.te.min(x, tvm.te.max(y, x)), x) + ck.verify(tvm.te.min(x, tvm.te.min(x, y)), tvm.te.min(x, y)) + ck.verify(tvm.te.min(y, tvm.te.min(x, y)), tvm.te.min(x, y)) + + ck.verify(tvm.te.min(tvm.te.min(tvm.te.min(x, y), z), y), + tvm.te.min(tvm.te.min(x, y), z)) + ck.verify(tvm.te.min(tvm.te.min(tvm.te.min(tvm.te.min(x, y), z), x * 2), y), + tvm.te.min(tvm.te.min(tvm.te.min(x, y), z), x * 2)) + ck.verify(tvm.te.min(tvm.te.min(tvm.te.min(tvm.te.min(tvm.te.min(x, y), z), x * 2), z * 2), y), + tvm.te.min(tvm.te.min(tvm.te.min(tvm.te.min(x, y), z), x * 2), z * 2)) + + ck.verify(tvm.te.min(tvm.te.max(x, y), tvm.te.max(x, z)), tvm.te.max(tvm.te.min(y, z), x)) + ck.verify(tvm.te.min(tvm.te.max(x, y), tvm.te.max(z, x)), tvm.te.max(tvm.te.min(y, z), x)) + ck.verify(tvm.te.min(tvm.te.max(y, x), tvm.te.max(x, z)), tvm.te.max(tvm.te.min(y, z), x)) + ck.verify(tvm.te.min(tvm.te.max(y, x), tvm.te.max(z, x)), tvm.te.max(tvm.te.min(y, z), x)) + + ck.verify(tvm.te.min(y + x, z + x), tvm.te.min(y, z) + x) + ck.verify(tvm.te.min(y + x, x + z), tvm.te.min(y, z) + x) + ck.verify(tvm.te.min(x + y, z + x), tvm.te.min(y, z) + x) + ck.verify(tvm.te.min(x + y, x + z), tvm.te.min(y, z) + x) + + ck.verify(tvm.te.min(x - y, x - z), x - tvm.te.max(y, z)) + ck.verify(tvm.te.min(y - x, z - x), tvm.te.min(y, z) - x) + + ck.verify(tvm.te.min(tvm.te.min(x, 1), 10), tvm.te.min(x, 1)) + ck.verify(tvm.te.min(tvm.te.min(x, 11), 10), tvm.te.min(x, 10)) + + ck.verify(tvm.te.min(x * 3, 9), tvm.te.min(x, 3) * 3) + ck.verify(tvm.te.min(3 - x, 2), 3 - tvm.te.max(x, 1)) # DivMod rules # truc div ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000)) - ck.verify(tvm.min(tdiv(x + 3, 4) * 4, x), x) - ck.verify(tvm.min(tdiv(x + 3, 4) * 4, tvm.max(x, 4)), tvm.max(x, 4)) - ck.verify(tvm.min(x, tdiv(x + 3, 4) * 4), x) - ck.verify(tvm.min(tvm.max(x, 4), tdiv(x + 3, 4) * 4), tvm.max(x, 4)) + ck.verify(tvm.te.min(tdiv(x + 3, 4) * 4, x), x) + ck.verify(tvm.te.min(tdiv(x + 3, 4) * 4, tvm.te.max(x, 4)), tvm.te.max(x, 4)) + ck.verify(tvm.te.min(x, tdiv(x + 3, 4) * 4), x) + ck.verify(tvm.te.min(tvm.te.max(x, 4), tdiv(x + 3, 4) * 4), tvm.te.max(x, 4)) ck.analyzer.update(x, tvm.arith.ConstIntBound(-1000, 1000), True) - ck.verify(tvm.min(tdiv(x, 10), tdiv(y, 10)), tdiv(tvm.min(x, y), 10)) - ck.verify(tvm.min(tdiv(x, (-10)), tdiv(y, (-10))), - tdiv(tvm.max(x, y), (-10))) + ck.verify(tvm.te.min(tdiv(x, 10), tdiv(y, 10)), tdiv(tvm.te.min(x, y), 10)) + ck.verify(tvm.te.min(tdiv(x, (-10)), tdiv(y, (-10))), + tdiv(tvm.te.max(x, y), (-10))) # floor div ck.analyzer.update(x, tvm.arith.ConstIntBound(-1000, 1000), True) - ck.verify(tvm.min(fld(x + 3, 4) * 4, x), x) - ck.verify(tvm.min(fld(x + 3, 4) * 4, tvm.max(x, 4)), tvm.max(x, 4)) - ck.verify(tvm.min(x, fld(x + 3, 4) * 4), x) - ck.verify(tvm.min(x, fld(x, 4) * 4), fld(x, 4) * 4) - ck.verify(tvm.min(tvm.max(x, 4), fld(x + 3, 4) * 4), tvm.max(x, 4)) - ck.verify(tvm.min(fld(x, 10), fld(y, 10)), fld(tvm.min(x, y), 10)) - ck.verify(tvm.min(fld(x, (-10)), fld(y, (-10))), fld(tvm.max(x, y), (-10))) + ck.verify(tvm.te.min(fld(x + 3, 4) * 4, x), x) + ck.verify(tvm.te.min(fld(x + 3, 4) * 4, tvm.te.max(x, 4)), tvm.te.max(x, 4)) + ck.verify(tvm.te.min(x, fld(x + 3, 4) * 4), x) + ck.verify(tvm.te.min(x, fld(x, 4) * 4), fld(x, 4) * 4) + 
ck.verify(tvm.te.min(tvm.te.max(x, 4), fld(x + 3, 4) * 4), tvm.te.max(x, 4)) + ck.verify(tvm.te.min(fld(x, 10), fld(y, 10)), fld(tvm.te.min(x, y), 10)) + ck.verify(tvm.te.min(fld(x, (-10)), fld(y, (-10))), fld(tvm.te.max(x, y), (-10))) def test_max_index_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") - flm = tvm.floormod - fld = tvm.floordiv - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x, y, z = te.var("x"), te.var("y"), te.var("z") + flm = tvm.te.floormod + fld = tvm.te.floordiv + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # const int bound - ck.verify(tvm.max(tmod(x, 2), tmod(y, 2) + 10), tmod(y, 2) + 10) - ck.verify(tvm.max(flm(x, 2), flm(y, 2) + 10), flm(y, 2) + 10) - - ck.verify(tvm.max(x + 1, x + 10), x + 10) - ck.verify(tvm.max(x + 111, x + 10), x + 111) - ck.verify(tvm.max(x + 1, x), x + 1) - ck.verify(tvm.max(x, x + 2), x + 2) - ck.verify(tvm.max(1 - x, 2 - x), 2 - x) - ck.verify(tvm.max(3 - x, 2 - x), 3 - x) - - ck.verify(tvm.max(tvm.min(x, y), tvm.max(x, y)), tvm.max(x, y)) - ck.verify(tvm.max(tvm.min(x, y), tvm.max(y, x)), tvm.max(x, y)) - - ck.verify(tvm.max(tvm.min(x, y), x), x) - ck.verify(tvm.max(tvm.min(y, x), x), x) - ck.verify(tvm.max(tvm.max(x, y), x), tvm.max(x, y)) - ck.verify(tvm.max(tvm.max(x, y), y), tvm.max(x, y)) - - ck.verify(tvm.max(x, tvm.min(x, y)), x) - ck.verify(tvm.max(x, tvm.min(y, x)), x) - ck.verify(tvm.max(x, tvm.max(x, y)), tvm.max(x, y)) - ck.verify(tvm.max(y, tvm.max(x, y)), tvm.max(x, y)) - - ck.verify(tvm.max(tvm.max(tvm.max(x, y), z), y), - tvm.max(tvm.max(x, y), z)) - ck.verify(tvm.max(tvm.max(tvm.max(tvm.max(x, y), z), x * 2), y), - tvm.max(tvm.max(tvm.max(x, y), z), x * 2)) - ck.verify(tvm.max(tvm.max(tvm.max(tvm.max(tvm.max(x, y), z), x * 2), z * 2), y), - tvm.max(tvm.max(tvm.max(tvm.max(x, y), z), x * 2), z * 2)) - - ck.verify(tvm.max(tvm.min(x, y), tvm.min(x, z)), tvm.min(tvm.max(y, z), x)) - ck.verify(tvm.max(tvm.min(x, y), tvm.min(z, x)), tvm.min(tvm.max(y, z), x)) - ck.verify(tvm.max(tvm.min(y, x), tvm.min(x, z)), tvm.min(tvm.max(y, z), x)) - ck.verify(tvm.max(tvm.min(y, x), tvm.min(z, x)), tvm.min(tvm.max(y, z), x)) - - ck.verify(tvm.max(y + x, z + x), tvm.max(y, z) + x) - ck.verify(tvm.max(y + x, x + z), tvm.max(y, z) + x) - ck.verify(tvm.max(x + y, z + x), tvm.max(y, z) + x) - ck.verify(tvm.max(x + y, x + z), tvm.max(y, z) + x) - - ck.verify(tvm.max(x - y, x - z), x - tvm.min(y, z)) - ck.verify(tvm.max(y - x, z - x), tvm.max(y, z) - x) - - ck.verify(tvm.max(tvm.max(x, 1), 10), tvm.max(x, 10)) - ck.verify(tvm.max(tvm.max(x, 11), 10), tvm.max(x, 11)) - - ck.verify(tvm.max(x * 3, 9), tvm.max(x, 3) * 3) - ck.verify(tvm.max(3 - x, 1), 3 - tvm.min(x, 2)) + ck.verify(tvm.te.max(tmod(x, 2), tmod(y, 2) + 10), tmod(y, 2) + 10) + ck.verify(tvm.te.max(flm(x, 2), flm(y, 2) + 10), flm(y, 2) + 10) + + ck.verify(tvm.te.max(x + 1, x + 10), x + 10) + ck.verify(tvm.te.max(x + 111, x + 10), x + 111) + ck.verify(tvm.te.max(x + 1, x), x + 1) + ck.verify(tvm.te.max(x, x + 2), x + 2) + ck.verify(tvm.te.max(1 - x, 2 - x), 2 - x) + ck.verify(tvm.te.max(3 - x, 2 - x), 3 - x) + + ck.verify(tvm.te.max(tvm.te.min(x, y), tvm.te.max(x, y)), tvm.te.max(x, y)) + ck.verify(tvm.te.max(tvm.te.min(x, y), tvm.te.max(y, x)), tvm.te.max(x, y)) + + ck.verify(tvm.te.max(tvm.te.min(x, y), x), x) + ck.verify(tvm.te.max(tvm.te.min(y, x), x), x) + ck.verify(tvm.te.max(tvm.te.max(x, y), x), tvm.te.max(x, y)) + ck.verify(tvm.te.max(tvm.te.max(x, y), y), tvm.te.max(x, y)) + + ck.verify(tvm.te.max(x, tvm.te.min(x, y)), x) + 
ck.verify(tvm.te.max(x, tvm.te.min(y, x)), x) + ck.verify(tvm.te.max(x, tvm.te.max(x, y)), tvm.te.max(x, y)) + ck.verify(tvm.te.max(y, tvm.te.max(x, y)), tvm.te.max(x, y)) + + ck.verify(tvm.te.max(tvm.te.max(tvm.te.max(x, y), z), y), + tvm.te.max(tvm.te.max(x, y), z)) + ck.verify(tvm.te.max(tvm.te.max(tvm.te.max(tvm.te.max(x, y), z), x * 2), y), + tvm.te.max(tvm.te.max(tvm.te.max(x, y), z), x * 2)) + ck.verify(tvm.te.max(tvm.te.max(tvm.te.max(tvm.te.max(tvm.te.max(x, y), z), x * 2), z * 2), y), + tvm.te.max(tvm.te.max(tvm.te.max(tvm.te.max(x, y), z), x * 2), z * 2)) + + ck.verify(tvm.te.max(tvm.te.min(x, y), tvm.te.min(x, z)), tvm.te.min(tvm.te.max(y, z), x)) + ck.verify(tvm.te.max(tvm.te.min(x, y), tvm.te.min(z, x)), tvm.te.min(tvm.te.max(y, z), x)) + ck.verify(tvm.te.max(tvm.te.min(y, x), tvm.te.min(x, z)), tvm.te.min(tvm.te.max(y, z), x)) + ck.verify(tvm.te.max(tvm.te.min(y, x), tvm.te.min(z, x)), tvm.te.min(tvm.te.max(y, z), x)) + + ck.verify(tvm.te.max(y + x, z + x), tvm.te.max(y, z) + x) + ck.verify(tvm.te.max(y + x, x + z), tvm.te.max(y, z) + x) + ck.verify(tvm.te.max(x + y, z + x), tvm.te.max(y, z) + x) + ck.verify(tvm.te.max(x + y, x + z), tvm.te.max(y, z) + x) + + ck.verify(tvm.te.max(x - y, x - z), x - tvm.te.min(y, z)) + ck.verify(tvm.te.max(y - x, z - x), tvm.te.max(y, z) - x) + + ck.verify(tvm.te.max(tvm.te.max(x, 1), 10), tvm.te.max(x, 10)) + ck.verify(tvm.te.max(tvm.te.max(x, 11), 10), tvm.te.max(x, 11)) + + ck.verify(tvm.te.max(x * 3, 9), tvm.te.max(x, 3) * 3) + ck.verify(tvm.te.max(3 - x, 1), 3 - tvm.te.min(x, 2)) # DivMod rules # truc div - ck.verify(tvm.max(tdiv(x, 10), tdiv(y, 10)), tdiv(tvm.max(x, y), 10)) - ck.verify(tvm.max(tdiv(x, (-10)), tdiv(y, (-10))), tdiv(tvm.min(x, y), (-10))) - ck.verify(tvm.max(tdiv(x + 3, 4) * 4, x), tdiv(x + 3, 4) * 4) + ck.verify(tvm.te.max(tdiv(x, 10), tdiv(y, 10)), tdiv(tvm.te.max(x, y), 10)) + ck.verify(tvm.te.max(tdiv(x, (-10)), tdiv(y, (-10))), tdiv(tvm.te.min(x, y), (-10))) + ck.verify(tvm.te.max(tdiv(x + 3, 4) * 4, x), tdiv(x + 3, 4) * 4) # floordiv - ck.verify(tvm.max(fld(x, 10), fld(y, 10)), fld(tvm.max(x, y), 10)) - ck.verify(tvm.max(fld(x, (-10)), fld(y, (-10))), fld(tvm.min(x, y), (-10))) - ck.verify(tvm.max(fld(x + 3, 4) * 4, x), fld(x + 3, 4) * 4) - ck.verify(tvm.max(fld(x, 4) * 4, x), x) - ck.verify(tvm.max(x, fld(x, 4) * 4), x) + ck.verify(tvm.te.max(fld(x, 10), fld(y, 10)), fld(tvm.te.max(x, y), 10)) + ck.verify(tvm.te.max(fld(x, (-10)), fld(y, (-10))), fld(tvm.te.min(x, y), (-10))) + ck.verify(tvm.te.max(fld(x + 3, 4) * 4, x), fld(x + 3, 4) * 4) + ck.verify(tvm.te.max(fld(x, 4) * 4, x), x) + ck.verify(tvm.te.max(x, fld(x, 4) * 4), x) def test_cmp_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") - flm = tvm.floormod - fld = tvm.floordiv - tdiv = tvm.truncdiv - tmod = tvm.truncmod + x, y, z = te.var("x"), te.var("y"), te.var("z") + flm = tvm.te.floormod + fld = tvm.te.floordiv + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod # const int bound - ck.verify((tmod(x, 2) + 10).equal(0), tvm.const(0, "bool")) - ck.verify(tvm.tir.NE(tmod(x, 2) + 10, 0), tvm.const(1, "bool")) - ck.verify(tmod(x, 2) + 10 > 1, tvm.const(1, "bool")) - ck.verify(tmod(x, 2) + 10 <= 1, tvm.const(0, "bool")) - ck.verify(flm(x, 2) + 2 > 1, tvm.const(1, "bool")) - ck.verify(flm(x, 2) + 10 <= 1, tvm.const(0, "bool")) + ck.verify((tmod(x, 2) + 10).equal(0), tvm.tir.const(0, "bool")) + ck.verify(tvm.tir.NE(tmod(x, 2) + 10, 0), tvm.tir.const(1, "bool")) + ck.verify(tmod(x, 2) + 10 > 1, tvm.tir.const(1, "bool")) + 
ck.verify(tmod(x, 2) + 10 <= 1, tvm.tir.const(0, "bool")) + ck.verify(flm(x, 2) + 2 > 1, tvm.tir.const(1, "bool")) + ck.verify(flm(x, 2) + 10 <= 1, tvm.tir.const(0, "bool")) - ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool")) - ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool")) + ck.verify(x * 3 + 10 == 0, tvm.tir.const(0, "bool")) + ck.verify(x * 3 + 10 != 0, tvm.tir.const(1, "bool")) # canonicalization ck.verify((x - 10).equal(0), x.equal(10)) @@ -750,88 +751,88 @@ def test_cmp_simplify(): ck.verify(fld(x + 2, 4) * 4 >= x - y, tvm.tir.LE(flm(x + 2, 4) + (-2), y)) # End DivMod Rules - ck.verify(tvm.min(x, 11) < 10, x < 10) - ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool")) - ck.verify(tvm.max(8, x) > 10, tvm.tir.LT(10, x)) - ck.verify(x + 1 < tvm.max(8, x), x < 7) + ck.verify(tvm.te.min(x, 11) < 10, x < 10) + ck.verify(tvm.te.min(x, 8) < 10, tvm.tir.const(1, "bool")) + ck.verify(tvm.te.max(8, x) > 10, tvm.tir.LT(10, x)) + ck.verify(x + 1 < tvm.te.max(8, x), x < 7) ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True) ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True) ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True) - ck.verify(x < 11, tvm.const(1, "bool")) - ck.verify(x <= 10, tvm.const(1, "bool")) - ck.verify(z <= 5, tvm.const(1, "bool")) - ck.verify(x + y <= 10, tvm.const(1, "bool")) - ck.verify(x + y >= -10, tvm.const(1, "bool")) - ck.verify(z - 5 <= y + 10, tvm.const(1, "bool")) - ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool")) - ck.verify(x*y <= 0, tvm.const(1, "bool")) - ck.verify((x + 1)*(y - 1) < 0, tvm.const(1, "bool")) - ck.verify(y*y >= 0, tvm.const(1, "bool")) - ck.verify(x*6 <= -3, tvm.const(0, "bool")) + ck.verify(x < 11, tvm.tir.const(1, "bool")) + ck.verify(x <= 10, tvm.tir.const(1, "bool")) + ck.verify(z <= 5, tvm.tir.const(1, "bool")) + ck.verify(x + y <= 10, tvm.tir.const(1, "bool")) + ck.verify(x + y >= -10, tvm.tir.const(1, "bool")) + ck.verify(z - 5 <= y + 10, tvm.tir.const(1, "bool")) + ck.verify(tvm.tir.all(x > -1, z <= x + 5), tvm.tir.const(1, "bool")) + ck.verify(x*y <= 0, tvm.tir.const(1, "bool")) + ck.verify((x + 1)*(y - 1) < 0, tvm.tir.const(1, "bool")) + ck.verify(y*y >= 0, tvm.tir.const(1, "bool")) + ck.verify(x*6 <= -3, tvm.tir.const(0, "bool")) ck.verify(tmod(y - 1, 3) == 0, tmod(y + (-1), 3) == 0) def test_logical_simplify(): ck = RewriteChecker() - x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + x, y, z = te.var("x"), te.var("y"), te.var("z") ck.verify(tvm.tir.And(tvm.tir.EQ(x, y), tvm.tir.NE(x, y)), - tvm.const(False, "bool")) + tvm.tir.const(False, "bool")) ck.verify(tvm.tir.And(tvm.tir.NE(x, y), tvm.tir.EQ(x, y)), - tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x > 1, tvm.tir.Not(x > 1)), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x <= y, y < x), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(y < x, x <= y), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x < 1, 0 < x), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x < 0, 1 < x), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x < 1, 1 <= x), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x <= 1, 1 < x), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(1 <= x, x < 1), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(1 < x, x <= 1), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(x <= 1, 2 <= x), tvm.const(False, "bool")) - ck.verify(tvm.tir.And(2 <= x, x <= 1), tvm.const(False, "bool")) + tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(x > 1, tvm.tir.Not(x > 1)), tvm.tir.const(False, "bool")) + 
ck.verify(tvm.tir.And(x <= y, y < x), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(y < x, x <= y), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(x < 1, 0 < x), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(x < 0, 1 < x), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(x < 1, 1 <= x), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(x <= 1, 1 < x), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(1 <= x, x < 1), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(1 < x, x <= 1), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(x <= 1, 2 <= x), tvm.tir.const(False, "bool")) + ck.verify(tvm.tir.And(2 <= x, x <= 1), tvm.tir.const(False, "bool")) ck.verify(tvm.tir.And(x == 1, x != 2), x == 1) ck.verify(tvm.tir.Or(tvm.tir.EQ(x, y), tvm.tir.NE(x, y)), - tvm.const(True, "bool")) + tvm.tir.const(True, "bool")) ck.verify(tvm.tir.Or(tvm.tir.NE(x, y), tvm.tir.EQ(x, y)), - tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(x > y, tvm.tir.Not(x > y)), tvm.const(True, "bool")) + tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(x > y, tvm.tir.Not(x > y)), tvm.tir.const(True, "bool")) - ck.verify(tvm.tir.Or(x <= y, y < x), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(y < x, y >= x), tvm.const(True, "bool")) + ck.verify(tvm.tir.Or(x <= y, y < x), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(y < x, y >= x), tvm.tir.const(True, "bool")) - ck.verify(tvm.tir.Or(x < 1, 0 < x), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(0 < x, x < 1), tvm.const(True, "bool")) + ck.verify(tvm.tir.Or(x < 1, 0 < x), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(0 < x, x < 1), tvm.tir.const(True, "bool")) - ck.verify(tvm.tir.Or(x < 1, 1 <= x), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(x <= 1, 1 < x), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(1 <= x, x < 1), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(1 < x, x <= 1), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(x <= 1, 2 <= x), tvm.const(True, "bool")) - ck.verify(tvm.tir.Or(2 <= x, x <= 1), tvm.const(True, "bool")) + ck.verify(tvm.tir.Or(x < 1, 1 <= x), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(x <= 1, 1 < x), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(1 <= x, x < 1), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(1 < x, x <= 1), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(x <= 1, 2 <= x), tvm.tir.const(True, "bool")) + ck.verify(tvm.tir.Or(2 <= x, x <= 1), tvm.tir.const(True, "bool")) ck.verify(tvm.tir.Or(x != 1, x == 2), x != 1) def test_let_simplify(): ck = RewriteChecker() - x, y = tvm.var("x"), tvm.var("y") + x, y = te.var("x"), te.var("y") z = tvm.tir.Let(x, 1, x + 1) ck.verify(z + z, 4) def test_cast_simplify(): ck = RewriteChecker() - x = tvm.var("x") + x = te.var("x") dtypes = ["float32", "float16", "int32", "int8", "bool"] for dtype1 in dtypes: - ck.verify(tvm.tir.Cast(dtype1, x - x), tvm.const(0, dtype1)) - ck.verify(tvm.tir.Cast(dtype1, x == x), tvm.const(1, dtype1)) + ck.verify(tvm.tir.Cast(dtype1, x - x), tvm.tir.const(0, dtype1)) + ck.verify(tvm.tir.Cast(dtype1, x == x), tvm.tir.const(1, dtype1)) for dtype2 in dtypes: for i in [0, 1, 2, 3]: - ck.verify(tvm.tir.Cast(dtype1, tvm.const(i, dtype2)), tvm.const(i, dtype1)) + ck.verify(tvm.tir.Cast(dtype1, tvm.tir.const(i, dtype2)), tvm.tir.const(i, dtype1)) if __name__ == "__main__": test_floordiv_index_simplify() diff --git a/tests/python/unittest/test_arith_stmt_simplify.py b/tests/python/unittest/test_arith_stmt_simplify.py index 58b60836539f..45f083342410 100644 --- 
a/tests/python/unittest/test_arith_stmt_simplify.py +++ b/tests/python/unittest/test_arith_stmt_simplify.py @@ -15,50 +15,51 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_stmt_simplify(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") C = ib.pointer("float32", name="C") - n = tvm.size_var("n") + n = te.size_var("n") with ib.for_range(0, n, name="i") as i: with ib.if_scope(i < 12): A[i] = C[i] body = tvm.tir.LetStmt(n, 10, ib.get()) - body = tvm.ir_pass.CanonicalSimplify(body) + body = tvm.tir.ir_pass.CanonicalSimplify(body) assert isinstance(body.body, tvm.tir.Store) def test_thread_extent_simplify(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") C = ib.pointer("float32", name="C") - n = tvm.size_var("n") - tx = tvm.thread_axis("threadIdx.x") - ty = tvm.thread_axis("threadIdx.y") + n = te.size_var("n") + tx = te.thread_axis("threadIdx.x") + ty = te.thread_axis("threadIdx.y") ib.scope_attr(tx, "thread_extent", n) ib.scope_attr(tx, "thread_extent", n) ib.scope_attr(ty, "thread_extent", 1) with ib.if_scope(tx + ty < 12): A[tx] = C[tx + ty] body = tvm.tir.LetStmt(n, 10, ib.get()) - body = tvm.ir_pass.CanonicalSimplify(body) + body = tvm.tir.ir_pass.CanonicalSimplify(body) assert isinstance(body.body.body.body, tvm.tir.Store) def test_basic_likely_elimination(): - n = tvm.size_var('n') - X = tvm.placeholder(shape=(n,), name="x") - W = tvm.placeholder(shape=(n + 1,), dtype="int32", name="w") + n = te.size_var('n') + X = te.placeholder(shape=(n,), name="x") + W = te.placeholder(shape=(n + 1,), dtype="int32", name="w") def f(i): start = W[i] extent = W[i+1] - W[i] - rv = tvm.reduce_axis((0, extent)) - return tvm.sum(X[rv + start], axis=rv) - Y = tvm.compute(X.shape, f, name="y") - s = tvm.create_schedule([Y.op]) + rv = te.reduce_axis((0, extent)) + return te.sum(X[rv + start], axis=rv) + Y = te.compute(X.shape, f, name="y") + s = te.create_schedule([Y.op]) stmt = tvm.lower(s, [X, W, Y], simple_mode=True) assert('if' not in str(stmt)) @@ -68,10 +69,10 @@ def cumsum(X): Y[i] = sum(X[:i]) """ (m, ) = X.shape - s_state = tvm.placeholder((m + 1, ), dtype="int32", name="state") - s_init = tvm.compute((1, ), lambda _: tvm.const(0, "int32")) - s_update = tvm.compute((m + 1, ), lambda l: s_state[l - 1] + X[l - 1]) - return tvm.scan(s_init, s_update, s_state, inputs=[X], name="cumsum") + s_state = te.placeholder((m + 1, ), dtype="int32", name="state") + s_init = te.compute((1, ), lambda _: tvm.tir.const(0, "int32")) + s_update = te.compute((m + 1, ), lambda l: s_state[l - 1] + X[l - 1]) + return tvm.te.scan(s_init, s_update, s_state, inputs=[X], name="cumsum") def sparse_lengths_sum(data, indices, lengths): oshape = list(data.shape) @@ -79,21 +80,21 @@ def sparse_lengths_sum(data, indices, lengths): length_offsets = cumsum(lengths) def sls(n, d): - gg = tvm.reduce_axis((0, lengths[n])) + gg = te.reduce_axis((0, lengths[n])) indices_idx = length_offsets[n] + gg data_idx = indices[indices_idx] data_val = data[data_idx, d] - return tvm.sum(data_val, axis=gg) + return te.sum(data_val, axis=gg) - return tvm.compute(oshape, sls) + return te.compute(oshape, sls) - m, n, d, i, l = tvm.size_var('m'), tvm.size_var('n'), tvm.size_var('d'),\ - tvm.size_var('i'), tvm.size_var('l') - data_ph = tvm.placeholder((m, d * 32), name="data") - indices_ph = tvm.placeholder((i,), name="indices", dtype="int32") - lengths_ph = tvm.placeholder((n,), 
name="lengths", dtype="int32") + m, n, d, i, l = te.size_var('m'), te.size_var('n'), te.size_var('d'),\ + te.size_var('i'), te.size_var('l') + data_ph = te.placeholder((m, d * 32), name="data") + indices_ph = te.placeholder((i,), name="indices", dtype="int32") + lengths_ph = te.placeholder((n,), name="lengths", dtype="int32") Y = sparse_lengths_sum(data_ph, indices_ph, lengths_ph) - s = tvm.create_schedule([Y.op]) + s = te.create_schedule([Y.op]) (n, d) = s[Y].op.axis (do, di) = s[Y].split(d, factor=32) (gg,) = s[Y].op.reduce_axis diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py index 83bbd5492619..a2f9b1da42ae 100644 --- a/tests/python/unittest/test_autotvm_common.py +++ b/tests/python/unittest/test_autotvm_common.py @@ -20,6 +20,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.autotvm import MeasureInput, MeasureResult from tvm.autotvm.measure.measure import Runner @@ -38,12 +39,12 @@ def get_build_kwargs(self): @autotvm.register_customized_task("testing/matmul") def matmul(N, L, M, dtype): - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) - k = tvm.reduce_axis((0, L), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) + k = te.reduce_axis((0, L), name='k') + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') + s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis @@ -66,12 +67,12 @@ def matmul(N, L, M, dtype): @autotvm.register_customized_task("testing/bad_matmul") def bad_matmul(N, L, M, dtype): if 'bad_device' in tvm.target.Target.current().keys: - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) - k = tvm.reduce_axis((0, L-1), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) + k = te.reduce_axis((0, L-1), name='k') + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') + s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis diff --git a/tests/python/unittest/test_autotvm_feature.py b/tests/python/unittest/test_autotvm_feature.py index e0736c280dc4..59ad464f7cea 100644 --- a/tests/python/unittest/test_autotvm_feature.py +++ b/tests/python/unittest/test_autotvm_feature.py @@ -19,20 +19,21 @@ import numpy as np import tvm +from tvm import te from tvm.autotvm import feature def test_iter_feature_gemm(): N = 128 - k = tvm.reduce_axis((0, N), 'k') - A = tvm.placeholder((N, N), name='A') - B = tvm.placeholder((N, N), name='B') - C = tvm.compute( + k = te.reduce_axis((0, N), 'k') + A = te.placeholder((N, N), name='A') + B = te.placeholder((N, N), name='B') + C = te.compute( A.shape, - lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k), + lambda y, x: te.sum(A[y, k] * B[k, x], axis=k), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) feas = feature.get_itervar_feature(s, [A, B, C], take_log=False) @@ -64,15 +65,15 @@ def test_iter_feature_gemm(): def test_curve_feature_gemm(): N = 128 - k = tvm.reduce_axis((0, N), 'k') - A = tvm.placeholder((N, N), name='A') - B = tvm.placeholder((N, N), name='B') - C = tvm.compute( + k = te.reduce_axis((0, N), 
'k') + A = te.placeholder((N, N), name='A') + B = te.placeholder((N, N), name='B') + C = te.compute( A.shape, - lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k), + lambda y, x: te.sum(A[y, k] * B[k, x], axis=k), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) feas = feature.get_buffer_curve_sample_flatten(s, [A, B, C], sample_n=30) # sample_n * #buffers * #curves * 2 numbers per curve @@ -85,13 +86,13 @@ def test_feature_shape(): n_sample = 100 def get_gemm_feature(target): - k = tvm.reduce_axis((0, N), 'k') - A = tvm.placeholder((N, N), name='A') - B = tvm.placeholder((N, N), name='B') - C = tvm.compute(A.shape, lambda y, x: tvm.sum(A[y, k] * B[k, x], axis=k), + k = te.reduce_axis((0, N), 'k') + A = te.placeholder((N, N), name='A') + B = te.placeholder((N, N), name='B') + C = te.compute(A.shape, lambda y, x: te.sum(A[y, k] * B[k, x], axis=k), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) y, x = s[C].op.axis axes = list(s[C].tile(y, x, 8, 8)) + [k] @@ -105,9 +106,9 @@ def get_gemm_feature(target): for i in range(len(perm)): if perm[i] != 4: pick.append(axes[i]) - s[C].bind(pick[0], tvm.thread_axis("blockIdx.x")) - s[C].bind(pick[1], tvm.thread_axis("vthread")) - s[C].bind(pick[2], tvm.thread_axis("threadIdx.y")) + s[C].bind(pick[0], te.thread_axis("blockIdx.x")) + s[C].bind(pick[1], te.thread_axis("vthread")) + s[C].bind(pick[2], te.thread_axis("threadIdx.y")) with target: feas = feature.get_itervar_feature(s, [A, B, C]) diff --git a/tests/python/unittest/test_autotvm_flop_calculator.py b/tests/python/unittest/test_autotvm_flop_calculator.py index 5cafd02c45bf..e06010b948ad 100644 --- a/tests/python/unittest/test_autotvm_flop_calculator.py +++ b/tests/python/unittest/test_autotvm_flop_calculator.py @@ -17,6 +17,7 @@ """Test flop calculation""" import tvm +from tvm import te import numpy as np from tvm.autotvm.task.task import compute_flop @@ -30,24 +31,24 @@ def test_conv(): for i in range(5): N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] (input_dtype, acc_dtype) = random_dtypes() - D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) - K = tvm.placeholder((CO, CI, KH, KW), dtype=input_dtype) + D = te.placeholder((N, CI, H, W), dtype=input_dtype) + K = te.placeholder((CO, CI, KH, KW), dtype=input_dtype) KH = min(H, KH) KW = min(W, KW) - ci = tvm.reduce_axis((0, CI)) - kh = tvm.reduce_axis((0, KH)) - kw = tvm.reduce_axis((0, KW)) + ci = te.reduce_axis((0, CI)) + kh = te.reduce_axis((0, KH)) + kw = te.reduce_axis((0, KW)) OH = (H - KH) + 1 OW = (W - KW) + 1 - C = tvm.compute((N, CO, OH, OW), lambda n, co, h, w: - tvm.sum(D[n][ci][h][w].astype(acc_dtype) * K[co][ci][h][w].astype(acc_dtype), + C = te.compute((N, CO, OH, OW), lambda n, co, h, w: + te.sum(D[n][ci][h][w].astype(acc_dtype) * K[co][ci][h][w].astype(acc_dtype), axis=[ci, kh, kw])) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) assert compute_flop(s) == 2 * N * CO * OH * OW * CI * KH * KW @@ -55,55 +56,55 @@ def test_pack_gemm(): for i in range(5): N, L, M = [np.random.randint(10, 128) * 4 for _ in range(3)] (input_dtype, acc_dtype) = random_dtypes() - A = tvm.placeholder((N, L), dtype=input_dtype) - B = tvm.placeholder((M, L), dtype=input_dtype) - k = tvm.reduce_axis((0, L)) + A = te.placeholder((N, L), dtype=input_dtype) + B = te.placeholder((M, L), dtype=input_dtype) + k = te.reduce_axis((0, L)) bn = 4 - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod - A_pack = tvm.compute((N // bn, L, 
bn), lambda i, j, k: A[i * bn + k][j]) - B_pack = tvm.compute((M // bn, L, bn), lambda i, j, k: B[i * bn + k][j]) - C_pack = tvm.compute((N // bn, M // bn, bn, bn), lambda i, j, ii, jj: - tvm.sum(A_pack[i, k, ii].astype(acc_dtype) * B_pack[j, k, jj].astype(acc_dtype), axis=[k])) - C = tvm.compute((N, M), lambda i, j: C_pack[idxd(i, bn)][idxd(j, bn)][idxm(i, bn)][idxm(j, bn)]) + A_pack = te.compute((N // bn, L, bn), lambda i, j, k: A[i * bn + k][j]) + B_pack = te.compute((M // bn, L, bn), lambda i, j, k: B[i * bn + k][j]) + C_pack = te.compute((N // bn, M // bn, bn, bn), lambda i, j, ii, jj: + te.sum(A_pack[i, k, ii].astype(acc_dtype) * B_pack[j, k, jj].astype(acc_dtype), axis=[k])) + C = te.compute((N, M), lambda i, j: C_pack[idxd(i, bn)][idxd(j, bn)][idxm(i, bn)][idxm(j, bn)]) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) assert compute_flop(s) == 2 * N * L * M def test_outer_dot(): for i in range(5): N, M = [np.random.randint(10, 128) * 4 for _ in range(2)] (input_dtype, acc_dtype) = random_dtypes() - A = tvm.placeholder((N,), dtype=input_dtype) - B = tvm.placeholder((M,), dtype=input_dtype) + A = te.placeholder((N,), dtype=input_dtype) + B = te.placeholder((M,), dtype=input_dtype) - C = tvm.compute((N, M), lambda i, j: A[i].astype(acc_dtype) * B[j].astype(acc_dtype)) + C = te.compute((N, M), lambda i, j: A[i].astype(acc_dtype) * B[j].astype(acc_dtype)) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) assert compute_flop(s) == N * M def test_max_pool(): for i in range(5): N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] (input_dtype, _) = random_dtypes() - D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + D = te.placeholder((N, CI, H, W), dtype=input_dtype) KH = min(H, KH) KW = min(W, KW) - kh = tvm.reduce_axis((0, KH)) - kw = tvm.reduce_axis((0, KW)) + kh = te.reduce_axis((0, KH)) + kw = te.reduce_axis((0, KW)) OH = (H - KH) + 1 OW = (W - KW) + 1 - C = tvm.compute( + C = te.compute( (N, CO, OH, OW), - lambda n, co, h, w: tvm.max(D[n][co][h + kh][w + kw], axis=[kh, kw])) + lambda n, co, h, w: tvm.te.max(D[n][co][h + kh][w + kw], axis=[kh, kw])) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) assert compute_flop(s) == N * CO * OH * OW * KH * KW @@ -111,24 +112,24 @@ def test_average_pool(): for i in range(5): N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] (input_dtype, acc_dtype) = random_dtypes() - D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + D = te.placeholder((N, CI, H, W), dtype=input_dtype) KH = min(H, KH) KW = min(W, KW) - kh = tvm.reduce_axis((0, KH)) - kw = tvm.reduce_axis((0, KW)) + kh = te.reduce_axis((0, KH)) + kw = te.reduce_axis((0, KW)) OH = (H - KH) + 1 OW = (W - KW) + 1 - C = tvm.compute( + C = te.compute( (N, CO, OH, OW), - lambda n, co, h, w: tvm.sum( - tvm.div(D[n][co][h + kh][w + kw].astype(acc_dtype), (KW * KH)), axis=[kh, kw])) + lambda n, co, h, w: te.sum( + te.div(D[n][co][h + kh][w + kw].astype(acc_dtype), (KW * KH)), axis=[kh, kw])) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) assert compute_flop(s) == 2 * N * CO * OH * OW * KH * KW @@ -136,9 +137,9 @@ def test_move(): """No float number operation in simple move. 
So the estimator should raise an error """ N = 1024 - A = tvm.placeholder((N,)) - C = tvm.compute((N,), lambda i: A[i]) - s = tvm.create_schedule([C.op]) + A = te.placeholder((N,)) + C = te.compute((N,), lambda i: A[i]) + s = te.create_schedule([C.op]) try: compute_flop(s) diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index 0899f6f5bbff..f96d333ddbc3 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -21,6 +21,7 @@ import numpy as np import tvm +from tvm import te from test_autotvm_common import DummyRunner, bad_matmul, get_sample_task from tvm import autotvm from tvm.autotvm.measure.measure import MeasureErrorNo, MeasureResult diff --git a/tests/python/unittest/test_autotvm_record.py b/tests/python/unittest/test_autotvm_record.py index 0839ad9b68cf..bcc9a93a5b7a 100644 --- a/tests/python/unittest/test_autotvm_record.py +++ b/tests/python/unittest/test_autotvm_record.py @@ -18,6 +18,7 @@ import time import tvm +from tvm import te from tvm.contrib import util from tvm import autotvm diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py index 95f3201c5eb4..2694c49d6925 100644 --- a/tests/python/unittest/test_autotvm_space.py +++ b/tests/python/unittest/test_autotvm_space.py @@ -17,16 +17,17 @@ """Test space definition primitives""" import tvm +from tvm import te from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity def gemm_func(cfg, N): - A = tvm.placeholder((N, N), name='A') - B = tvm.placeholder((N, N), name='B') + A = te.placeholder((N, N), name='A') + B = te.placeholder((N, N), name='B') - k = tvm.reduce_axis((0, N), name='k') - C = tvm.compute((N, N), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=[k]), name='C') + k = te.reduce_axis((0, N), name='k') + C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=[k]), name='C') - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) y, x = s[C].op.axis diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py index 24677c566c66..214a600be10b 100644 --- a/tests/python/unittest/test_autotvm_xgboost_model.py +++ b/tests/python/unittest/test_autotvm_xgboost_model.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.autotvm import MeasureInput, MeasureResult from tvm.autotvm.tuner.xgboost_cost_model import XGBoostCostModel diff --git a/tests/python/unittest/test_build_lower.py b/tests/python/unittest/test_build_lower.py index 58312dc83932..736030bd548d 100644 --- a/tests/python/unittest/test_build_lower.py +++ b/tests/python/unittest/test_build_lower.py @@ -15,28 +15,29 @@ # specific language governing permissions and limitations # under the License. 
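# The hunks below all follow the same mechanical rewrite: the schedule-language
# helpers move from the top-level `tvm` namespace into `tvm.te`. A minimal
# sketch of the new spellings, assuming the post-refactor API this patch
# targets (illustration only, not part of the patch):
import tvm
from tvm import te

n = te.size_var("n")                                     # was tvm.size_var
A = te.placeholder((n,), name="A")                       # was tvm.placeholder
B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B")  # was tvm.compute
s = te.create_schedule(B.op)                             # was tvm.create_schedule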
import tvm +from tvm import te def test_lower_rfactor(): - n = tvm.size_var("n") - m = tvm.size_var("m") - A = tvm.placeholder((n, m), name='A') - k = tvm.reduce_axis((0, m), "k") - B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B") - s = tvm.create_schedule(B.op) + n = te.size_var("n") + m = te.size_var("m") + A = te.placeholder((n, m), name='A') + k = te.reduce_axis((0, m), "k") + B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") + s = te.create_schedule(B.op) ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) BF = s.rfactor(B, ki) xo, xi = s[B].split(s[B].op.axis[0], factor=32) - s[B.op].bind(xo, tvm.thread_axis("blockIdx.x")) - s[B.op].bind(xi, tvm.thread_axis("threadIdx.y")) - s[B].bind(s[B].op.reduce_axis[0], tvm.thread_axis("threadIdx.x")) + s[B.op].bind(xo, te.thread_axis("blockIdx.x")) + s[B.op].bind(xi, te.thread_axis("threadIdx.y")) + s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x")) s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) fapi = tvm.lower(s, [A, B]) def test_dependent_output_shape(): - n, m, x = tvm.size_var('n'), tvm.size_var('m'), tvm.size_var('x') - A = tvm.placeholder((n, m)) - B = tvm.compute((m, n//x), lambda i, j: A[i,j] , name='B') - s = tvm.create_schedule(B.op) + n, m, x = te.size_var('n'), te.size_var('m'), te.size_var('x') + A = te.placeholder((n, m)) + B = te.compute((m, n//x), lambda i, j: A[i,j] , name='B') + s = te.create_schedule(B.op) mod = tvm.build(s, [A, B, x]) if __name__ == "__main__": diff --git a/tests/python/unittest/test_codegen_arm.py b/tests/python/unittest/test_codegen_arm.py index 8e2ad7aa76e0..65d82b0146fb 100644 --- a/tests/python/unittest/test_codegen_arm.py +++ b/tests/python/unittest/test_codegen_arm.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
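# The conversion helper relocates as well, from tvm.convert to
# tvm.runtime.convert, and scalar intrinsics such as popcount move under
# tvm.tir. A minimal sketch assuming the post-refactor API:
import tvm
from tvm import te

n = tvm.runtime.convert(64)                              # was tvm.convert
A = te.placeholder((n,), dtype="uint32", name="A")
B = te.compute(A.shape, lambda i: tvm.tir.popcount(A[i]), name="B")
s = te.create_schedule(B.op)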
import tvm +from tvm import te import re import os import ctypes @@ -23,10 +24,10 @@ def test_popcount(): target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' def check_correct_assembly(type, elements, counts): - n = tvm.convert(elements) - A = tvm.placeholder(n, dtype=type, name='A') - B = tvm.compute(A.shape, lambda i: tvm.popcount(A[i]), name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(elements) + A = te.placeholder(n, dtype=type, name='A') + B = te.compute(A.shape, lambda i: tvm.tir.popcount(A[i]), name='B') + s = te.create_schedule(B.op) s[B].vectorize(s[B].op.axis[0]) f = tvm.build(s, [A, B], target) @@ -47,13 +48,13 @@ def test_vmlal_s16(): target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' def check_correct_assembly(N): - K = tvm.size_var("K") - A = tvm.placeholder((K, N), dtype="int8", name='A') - B = tvm.placeholder((K, N), dtype="int8", name='B') - k = tvm.reduce_axis((0, K)) - C = tvm.compute((N, ), lambda n: tvm.sum( + K = te.size_var("K") + A = te.placeholder((K, N), dtype="int8", name='A') + B = te.placeholder((K, N), dtype="int8", name='B') + k = te.reduce_axis((0, K)) + C = te.compute((N, ), lambda n: te.sum( A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) s[C].vectorize(s[C].op.axis[0]) f = tvm.build(s, [A, B, C], target) @@ -67,14 +68,14 @@ def check_correct_assembly(N): check_correct_assembly(64) def check_broadcast_correct_assembly(N): - K = tvm.size_var("K") - A = tvm.placeholder((K, N), dtype="int8", name='A') - B = tvm.placeholder((K,), dtype="int8", name='B') - k = tvm.reduce_axis((0, K)) - C = tvm.compute((N, ), lambda n: tvm.sum( + K = te.size_var("K") + A = te.placeholder((K, N), dtype="int8", name='A') + B = te.placeholder((K,), dtype="int8", name='B') + k = te.reduce_axis((0, K)) + C = te.compute((N, ), lambda n: te.sum( A[k, n].astype("int32") * B[k].astype("int32"), axis=[k]), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) s[C].vectorize(s[C].op.axis[0]) f = tvm.build(s, [A, B, C], target) diff --git a/tests/python/unittest/test_codegen_blob.py b/tests/python/unittest/test_codegen_blob.py index c14607d0c0b7..62043e344d96 100644 --- a/tests/python/unittest/test_codegen_blob.py +++ b/tests/python/unittest/test_codegen_blob.py @@ -20,6 +20,7 @@ from tvm.relay import testing from tvm.contrib import graph_runtime import tvm +from tvm import te import ctypes def test_resnet18(): @@ -74,13 +75,13 @@ def test_system_lib(): print("skip because %s is not enabled..." 
% device) return nn = 12 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=4) - s[B].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(tx, te.thread_axis("threadIdx.x")) from tvm.contrib import util temp = util.tempdir() diff --git a/tests/python/unittest/test_codegen_bool.py b/tests/python/unittest/test_codegen_bool.py index 33711cbcdb63..cdb343f3530b 100644 --- a/tests/python/unittest/test_codegen_bool.py +++ b/tests/python/unittest/test_codegen_bool.py @@ -17,21 +17,22 @@ """codegen related to bool types""" import tvm +from tvm import te import numpy as np def test_cmp_load_store(): n = 32 - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) > B(*i), name='C') - D = tvm.compute(C.shape, lambda *i: tvm.all(C(*i), + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) > B(*i), name='C') + D = te.compute(C.shape, lambda *i: tvm.tir.all(C(*i), A(*i) > 1).astype('float32'), name="D") def check_llvm(): if not tvm.runtime.enabled("llvm"): return - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) xo, xi = s[C].split(C.op.axis[0], factor=4) xo1, xo2 = s[C].split(xo, factor=13) s[C].parallel(xo2) @@ -50,11 +51,11 @@ def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: return - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) for stage in [C, D]: xo, xi = s[stage].split(stage.op.axis[0], factor=4) - s[stage].bind(xo, tvm.thread_axis("blockIdx.x")) - s[stage].bind(xi, tvm.thread_axis("threadIdx.x")) + s[stage].bind(xo, te.thread_axis("blockIdx.x")) + s[stage].bind(xi, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B, D], device) a_np = np.random.uniform(size=n).astype(A.dtype) a = tvm.nd.array(a_np, ctx) diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py index a126c07c8ac1..1604ffb2293b 100644 --- a/tests/python/unittest/test_codegen_c_host.py +++ b/tests/python/unittest/test_codegen_c_host.py @@ -15,16 +15,17 @@ # specific language governing permissions and limitations # under the License. 
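# Build configuration moves too, from tvm.build_config to
# tvm.target.build_config, as the C-host hunks below show. A minimal sketch
# assuming the post-refactor API:
import tvm
from tvm import te

A = te.placeholder((1024,), name="A")
B = te.compute(A.shape, lambda i: A[i] * 2.0, name="B")
s = te.create_schedule(B.op)
if tvm.runtime.enabled("llvm"):
    with tvm.target.build_config(offset_factor=4):       # was tvm.build_config
        f = tvm.build(s, [A, B], "llvm")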
import tvm +from tvm import te import numpy as np from tvm.contrib import util def test_add(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = te.create_schedule(C.op) def check_c(): mhost = tvm.build(s, [A, B, C], "c", name="fadd") @@ -47,14 +48,14 @@ def check_c(): def test_add_pipeline(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - AA = tvm.compute((n,), lambda *i: A(*i), name='A') - BB = tvm.compute((n,), lambda *i: B(*i), name='B') - T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') - C = tvm.compute(A.shape, lambda *i: T(*i), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + AA = te.compute((n,), lambda *i: A(*i), name='A') + BB = te.compute((n,), lambda *i: B(*i), name='B') + T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') + C = te.compute(A.shape, lambda *i: T(*i), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) xo1, xo2 = s[C].split(xo, factor=13) s[C].parallel(xo2) @@ -65,16 +66,16 @@ def test_add_pipeline(): def check_c(): # Specifically allow offset to test codepath when offset is available - Ab = tvm.decl_buffer( + Ab = tvm.tir.decl_buffer( A.shape, A.dtype, - elem_offset=tvm.size_var('Aoffset'), + elem_offset=te.size_var('Aoffset'), offset_factor=8, name='A') binds = {A : Ab} # BUILD and invoke the kernel. f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") - fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] - fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) + fsplits = [x for x in tvm.tir.ir_pass.SplitHostDevice(f1)] + fsplits[0] = tvm.tir.ir_pass.LowerTVMBuiltin(fsplits[0]) mhost = tvm.target.codegen.build_module(fsplits[0], "c") temp = util.tempdir() path_dso = temp.relpath("temp.so") @@ -91,16 +92,16 @@ def check_c(): tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) - with tvm.build_config(offset_factor=4): + with tvm.target.build_config(offset_factor=4): check_c() def test_reinterpret(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A', dtype="int32") - B = tvm.compute(A.shape, lambda *i: tvm.call_pure_intrin("float32", "reinterpret", A(*i)), name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A', dtype="int32") + B = te.compute(A.shape, lambda *i: tvm.tir.call_pure_intrin("float32", "reinterpret", A(*i)), name='B') + s = te.create_schedule(B.op) def check_c(): mhost = tvm.build(s, [A, B], "c", name="reinterpret") diff --git a/tests/python/unittest/test_codegen_cross_llvm.py b/tests/python/unittest/test_codegen_cross_llvm.py index 1827ccf63d79..cb3986eaf20f 100644 --- a/tests/python/unittest/test_codegen_cross_llvm.py +++ b/tests/python/unittest/test_codegen_cross_llvm.py @@ -16,6 +16,7 @@ # under the License. 
"""Test cross compilation""" import tvm +from tvm import te import os import struct from tvm import rpc @@ -24,11 +25,11 @@ def test_llvm_add_pipeline(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py index 8652817c21ce..f94d8c38e3a6 100644 --- a/tests/python/unittest/test_codegen_cuda.py +++ b/tests/python/unittest/test_codegen_cuda.py @@ -16,14 +16,15 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np import topi import unittest from tvm.contrib.nvcc import have_fp16, have_int8 from tvm.contrib import nvcc -tx = tvm.thread_axis("threadIdx.x") -bx = tvm.thread_axis("blockIdx.x") +tx = te.thread_axis("threadIdx.x") +bx = te.thread_axis("blockIdx.x") def test_cuda_vectorize_add(): num_thread = 8 @@ -37,9 +38,9 @@ def check_cuda(dtype, n, lanes): if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version): print("skip because gpu does not support int8") return - A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) - B = tvm.compute((n,), lambda i: A[i] + tvm.const(1, A.dtype), name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) + B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name='B') + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(xo, bx) s[B].bind(xi, tx) @@ -69,12 +70,12 @@ def check_cuda(dtype, n, lanes): if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version): print("skip because gpu does not support int8") return - A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) - B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes)) - C = tvm.placeholder((n,), name='C', dtype="int32") - D = tvm.compute((n,), - lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D') - s = tvm.create_schedule(D.op) + A = te.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) + B = te.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes)) + C = te.placeholder((n,), name='C', dtype="int32") + D = te.compute((n,), + lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D') + s = te.create_schedule(D.op) xo, xi = s[D].split(D.op.axis[0], factor=num_thread) s[D].bind(xo, bx) s[D].bind(xi, tx) @@ -99,9 +100,9 @@ def check_cuda(dtype, n, lanes): print("skip because cuda is not enabled..") return ctx = tvm.gpu(0) - A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) - B = tvm.compute((n,), lambda i: A[i], name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) + B = te.compute((n,), lambda i: A[i], name='B') + s = te.create_schedule(B.op) block, thread = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(block, bx) s[B].bind(thread, tx) @@ -122,8 +123,8 @@ def check_cuda(n, value): lanes = 4 dtype = 'int8' ctx = tvm.gpu(0) - A = tvm.compute((n, lanes), lambda i,j: tvm.const(value, dtype=dtype)) - s = 
tvm.create_schedule(A.op) + A = te.compute((n, lanes), lambda i,j: tvm.tir.const(value, dtype=dtype)) + s = te.create_schedule(A.op) y, x = s[A].op.axis s[A].vectorize(x) s[A].bind(y, bx) @@ -140,10 +141,10 @@ def check_cuda(n, value): def test_cuda_inf_nan(): target = 'cuda' def check_inf_nan(ctx, n, value, dtype): - A = tvm.placeholder((n,), name='A', dtype=dtype) - inf_value = tvm.const(value, dtype=dtype) - C = tvm.compute((n,), lambda i: inf_value, name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n,), name='A', dtype=dtype) + inf_value = tvm.tir.const(value, dtype=dtype) + C = te.compute((n,), lambda i: inf_value, name='C') + s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tx) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n,), A.dtype, ctx) @@ -170,36 +171,36 @@ def test_cuda_shuffle(): print("skip because cuda is not enabled..") return - idxm = tvm.indexmod - a = tvm.placeholder((64, ), 'int32') - b = tvm.placeholder((64, ), 'int32') - c = tvm.compute((64, ), lambda x: a[x] + b[x - idxm(x, 4) + (3 - idxm(x, 4))]) - sch = tvm.create_schedule(c.op) + idxm = tvm.tir.indexmod + a = te.placeholder((64, ), 'int32') + b = te.placeholder((64, ), 'int32') + c = te.compute((64, ), lambda x: a[x] + b[x - idxm(x, 4) + (3 - idxm(x, 4))]) + sch = te.create_schedule(c.op) x = c.op.axis[0] xo, xi = sch[c].split(x, 4) - thrx = tvm.thread_axis("threadIdx.x") + thrx = te.thread_axis("threadIdx.x") sch[c].bind(xo, thrx) sch[c].vectorize(xi) def my_vectorize(stmt): def vectorizer(op): if op.for_type == tvm.tir.For.Vectorized: - four = tvm.const(4, 'int32') - idx = tvm.tir.Ramp(thrx.var * four, tvm.const(1, 'int32'), 4) - all_ones = tvm.const(1, 'int32x4') + four = tvm.tir.const(4, 'int32') + idx = tvm.tir.Ramp(thrx.var * four, tvm.tir.const(1, 'int32'), 4) + all_ones = tvm.tir.const(1, 'int32x4') store = op.body value = store.value new_a = tvm.tir.Load('int32x4', value.a.buffer_var, idx, all_ones) bs, ids = [], [] for i in range(4): - bs.append(tvm.tir.Load('int32', value.b.buffer_var, thrx.var * four + tvm.const(i, 'int32'))) - ids.append(tvm.const(3 - i, 'int32')) + bs.append(tvm.tir.Load('int32', value.b.buffer_var, thrx.var * four + tvm.tir.const(i, 'int32'))) + ids.append(tvm.tir.const(3 - i, 'int32')) new_b = tvm.tir.Shuffle(bs, ids) return tvm.tir.Store(store.buffer_var, new_a + new_b, idx, all_ones) return None - return tvm.ir_pass.IRTransform(stmt, None, vectorizer, ['For']) + return tvm.tir.ir_pass.IRTransform(stmt, None, vectorizer, ['For']) - with tvm.build_config(add_lower_pass=[(1, my_vectorize)]): + with tvm.target.build_config(add_lower_pass=[(1, my_vectorize)]): module = tvm.build(sch, [a, b, c], target='cuda') a_ = np.array(list(range(64)), dtype='int32') b_ = np.array((list(range(4))[::-1]) * 16, dtype='int32') @@ -215,17 +216,17 @@ def test_cuda_reducition_binding(): print("skip because cuda is not enabled..") return - k = tvm.reduce_axis((0, 32), 'k') - A = tvm.placeholder((96, 32), name='A') - B = tvm.compute( (96,), lambda m: - tvm.sum(A[m, k], axis=k), + k = te.reduce_axis((0, 32), 'k') + A = te.placeholder((96, 32), name='A') + B = te.compute( (96,), lambda m: + te.sum(A[m, k], axis=k), name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s[B].reorder(B.op.reduce_axis[0], B.op.axis[0]) mo, _ = s[B].split(B.op.axis[0], 32) - s[B].bind(mo, tvm.thread_axis("blockIdx.x")) + s[B].bind(mo, te.thread_axis("blockIdx.x")) fcuda = tvm.build(s, [A, B], "cuda") @@ -234,15 +235,15 @@ def test_rfactor_predicates(): print("skip because cuda is 
not enabled..") return - n = tvm.reduce_axis((0, 129), 'n') - A = tvm.placeholder((129,), name='A') - B = tvm.compute( (1, ), lambda b: - tvm.sum(A[n], + n = te.reduce_axis((0, 129), 'n') + A = te.placeholder((129,), name='A') + B = te.compute( (1, ), lambda b: + te.sum(A[n], axis=n), name='B' ) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) _, ni = s[B].split(s[B].op.reduce_axis[0], factor=8) @@ -270,15 +271,15 @@ def test_cuda_const_float_to_half(): # otherwise it is found that the code gen is done by nvrtc. from tvm import autotvm shape = (2, 3, 4) - a = tvm.placeholder(shape, dtype='float16', name='a') - b = tvm.const(0.5, dtype='float16') - c = tvm.compute(shape, lambda i, j, k: a[i, j, k] > b, name='c') - s = tvm.create_schedule(c.op) + a = te.placeholder(shape, dtype='float16', name='a') + b = tvm.tir.const(0.5, dtype='float16') + c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name='c') + s = te.create_schedule(c.op) axes = [axis for axis in c.op.axis] fused = s[c].fuse(*axes) bx, tx = s[c].split(fused, factor=64) - s[c].bind(bx, tvm.thread_axis('blockIdx.x')) - s[c].bind(tx, tvm.thread_axis('threadIdx.x')) + s[c].bind(bx, te.thread_axis('blockIdx.x')) + s[c].bind(tx, te.thread_axis('threadIdx.x')) func = tvm.build(s, [a, c], 'cuda') ctx = tvm.gpu(0) @@ -298,8 +299,8 @@ def check_cuda(dtype, m=32, n=32): print("Skip because gpu does not have fp16 support") return - a = tvm.placeholder((m, n), name="a", dtype=dtype) - b = tvm.placeholder((m, n), name="b", dtype=dtype) + a = te.placeholder((m, n), name="a", dtype=dtype) + b = te.placeholder((m, n), name="b", dtype=dtype) c = a + b d = a * b e = topi.elemwise_sum([c, d]) diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py index 63ee03028e7e..88abca8d2820 100644 --- a/tests/python/unittest/test_codegen_device.py +++ b/tests/python/unittest/test_codegen_device.py @@ -15,20 +15,21 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te from tvm.contrib import util import numpy as np def test_large_uint_imm(): value = (1 << 63) + 123 - other = tvm.const(3, "uint64") + other = tvm.tir.const(3, "uint64") n = 12 num_thread = 2 - A = tvm.compute((n,), lambda *i: tvm.const(value, "uint64") + other, name='A') - s = tvm.create_schedule(A.op) + A = te.compute((n,), lambda *i: tvm.tir.const(value, "uint64") + other, name='A') + s = te.create_schedule(A.op) xo, xi = s[A].split(A.op.axis[0], factor=num_thread) - s[A].bind(xi, tvm.thread_axis("threadIdx.x")) - s[A].bind(xo, tvm.thread_axis("blockIdx.x")) + s[A].bind(xi, te.thread_axis("threadIdx.x")) + s[A].bind(xo, te.thread_axis("blockIdx.x")) def check_target(device): ctx = tvm.context(device, 0) @@ -45,38 +46,38 @@ def check_target(device): def test_add_pipeline(): - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(), name='C') - D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D') - s = tvm.create_schedule(D.op) + n = te.size_var('n') + A = te.placeholder((n,), name='A') + B = te.placeholder((), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(), name='C') + D = te.compute(A.shape, lambda *i: C(*i) + 1, name='D') + s = te.create_schedule(D.op) # GPU schedule have to split by gridIdx and threadIdx num_thread = 256 xo, xi = s[C].split(C.op.axis[0], factor=num_thread) - s[C].bind(xi, tvm.thread_axis("threadIdx.x")) - s[C].bind(xo, tvm.thread_axis("blockIdx.x")) + s[C].bind(xi, te.thread_axis("threadIdx.x")) + s[C].bind(xo, te.thread_axis("blockIdx.x")) xo, xi = s[D].split(D.op.axis[0], factor=num_thread) - s[D].bind(xi, tvm.thread_axis("threadIdx.x")) - s[D].bind(xo, tvm.thread_axis("blockIdx.x")) + s[D].bind(xi, te.thread_axis("threadIdx.x")) + s[D].bind(xo, te.thread_axis("blockIdx.x")) # compile to IR s = s.normalize() - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - Db = tvm.decl_buffer(D.shape, D.dtype, name='D') - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, D:Db}, 64) - stmt = tvm.ir_pass.Simplify(stmt) - fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Db], 0, True) - fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)] + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + Db = tvm.tir.decl_buffer(D.shape, D.dtype, name='D') + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, D:Db}, 64) + stmt = tvm.tir.ir_pass.Simplify(stmt) + fapi = tvm.tir.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Db], 0, True) + fsplits = [x for x in tvm.tir.ir_pass.SplitHostDevice(fapi)] # lower the floordiv(use stackvm rules so it works for all targets) - fsplits = [tvm.ir_pass.LowerIntrin(x, "stackvm") for x in fsplits] - fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) + fsplits = [tvm.tir.ir_pass.LowerIntrin(x, "stackvm") for x in fsplits] + fsplits[0] = tvm.tir.ir_pass.LowerTVMBuiltin(fsplits[0]) def check_target(device, host="stackvm"): ctx = tvm.context(device, 0) diff --git a/tests/python/unittest/test_codegen_extern.py b/tests/python/unittest/test_codegen_extern.py index 03efee58cc57..4104af864439 100644 --- a/tests/python/unittest/test_codegen_extern.py 
+++ b/tests/python/unittest/test_codegen_extern.py @@ -15,37 +15,38 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np def test_add_pipeline(): nn = 64 max_threads = 4 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline""" - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() with ib.for_range(0, (n+1) // 2) as i: - ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2"))) + ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.tir.const(1, "float32x2"))) return ib.get() def extern_generator_gpu(ins, outs): """Manually write the IR for the extern function, add pipeline""" - ib = tvm.ir_builder.create() - bx = tvm.thread_axis("blockIdx.x") - tx = tvm.thread_axis("threadIdx.x") + ib = tvm.tir.ir_builder.create() + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads) ib.scope_attr(tx, "thread_extent", max_threads) idx = bx.var * max_threads + tx.var with ib.if_scope(ib.likely(idx < n)): - ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2"))) + ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.tir.const(1, "float32x2"))) return ib.get() - C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C') - C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C') - s_cpu = tvm.create_schedule(C_cpu.op) - s_gpu = tvm.create_schedule(C_gpu.op) + C_cpu = te.extern(A.shape, [A], extern_generator, name='C') + C_gpu = te.extern(A.shape, [A], extern_generator_gpu, name='C') + s_cpu = te.create_schedule(C_cpu.op) + s_gpu = te.create_schedule(C_gpu.op) print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True)) print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True)) @@ -70,14 +71,14 @@ def check_target(target): def test_pack_buffer_simple(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline.""" - return tvm.call_packed("my_extern_array_func1", ins[0], outs[0]) + return tvm.tir.call_packed("my_extern_array_func1", ins[0], outs[0]) - C = tvm.extern(A.shape, [A], extern_generator, name='C') - s = tvm.create_schedule(C.op) + C = te.extern(A.shape, [A], extern_generator, name='C') + s = te.create_schedule(C.op) @tvm.register_func def my_extern_array_func1(aa, bb): @@ -104,15 +105,15 @@ def check_target(target): def test_pack_buffer_intermediate(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda i: A[i] + 1, name="B") + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda i: A[i] + 1, name="B") def extern_generator(ins, outs): """Manually write the IR for the extern function, add pipeline.""" - return tvm.call_packed("my_extern_array_func2", ins[0], outs[0]) + return tvm.tir.call_packed("my_extern_array_func2", ins[0], outs[0]) - C = tvm.extern(B.shape, [B], extern_generator, name='C') - s = tvm.create_schedule(C.op) + C = te.extern(B.shape, [B], extern_generator, name='C') + s = te.create_schedule(C.op) def check_target(target): if not tvm.runtime.enabled(target): diff --git 
a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py index ca3229389c27..45554c5475a3 100644 --- a/tests/python/unittest/test_codegen_llvm.py +++ b/tests/python/unittest/test_codegen_llvm.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import topi from tvm.contrib import util, clang import numpy as np @@ -22,18 +23,18 @@ import math def test_llvm_intrin(): - ib = tvm.ir_builder.create() - n = tvm.convert(4) + ib = tvm.tir.ir_builder.create() + n = tvm.runtime.convert(4) A = ib.pointer("float32", name="A") args = [ - tvm.call_pure_intrin("handle", "tvm_address_of", A[0]), + tvm.tir.call_pure_intrin("handle", "tvm_address_of", A[0]), 0, 3, 1 ] ib.emit(tvm.tir.Evaluate( tvm.tir.Call( "int32", "prefetch", args, tvm.tir.Call.Intrinsic, None, 0))) body = ib.get() - func = tvm.ir_pass.MakeAPI(body, "prefetch", [A], 0, True) + func = tvm.tir.ir_pass.MakeAPI(body, "prefetch", [A], 0, True) fcode = tvm.build(func, None, "llvm") @@ -45,9 +46,9 @@ def test_llvm_import(): } """ n = 10 - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda *i: - tvm.call_pure_extern("float32", "my_add", A(*i), 1.0), + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda *i: + tvm.tir.call_pure_extern("float32", "my_add", A(*i), 1.0), name='B') def check_llvm(use_file): if not tvm.runtime.enabled("llvm"): @@ -58,7 +59,7 @@ def check_llvm(use_file): temp = util.tempdir() ll_path = temp.relpath("temp.ll") ll_code = clang.create_llvm(cc_code, output=ll_path) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) if use_file: s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path) else: @@ -78,21 +79,21 @@ def check_llvm(use_file): def test_llvm_lookup_intrin(): - ib = tvm.ir_builder.create() - m = tvm.size_var("m") + ib = tvm.tir.ir_builder.create() + m = te.size_var("m") A = ib.pointer("uint8x8", name="A") - x = tvm.call_llvm_intrin("uint8x8", "llvm.ctpop.i8", tvm.const(1, 'uint32'), A) + x = tvm.tir.call_llvm_intrin("uint8x8", "llvm.ctpop.i8", tvm.tir.const(1, 'uint32'), A) ib.emit(x) body = ib.get() - func = tvm.ir_pass.MakeAPI(body, "ctpop", [A], 1, True) + func = tvm.tir.ir_pass.MakeAPI(body, "ctpop", [A], 1, True) fcode = tvm.build(func, None, "llvm") def test_llvm_large_uintimm(): value = (1 << 63) + 123 - other = tvm.const(3, "uint64") - A = tvm.compute((), lambda : tvm.const(value, "uint64") + other, name='A') - s = tvm.create_schedule(A.op) + other = tvm.tir.const(3, "uint64") + A = te.compute((), lambda : tvm.tir.const(value, "uint64") + other, name='A') + s = te.create_schedule(A.op) def check_llvm(): if not tvm.runtime.enabled("llvm"): @@ -109,14 +110,14 @@ def check_llvm(): def test_llvm_add_pipeline(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - AA = tvm.compute((n,), lambda *i: A(*i), name='A') - BB = tvm.compute((n,), lambda *i: B(*i), name='B') - T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') - C = tvm.compute(A.shape, lambda *i: T(*i), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + AA = te.compute((n,), lambda *i: A(*i), name='A') + BB = te.compute((n,), lambda *i: B(*i), name='B') + T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') + C = te.compute(A.shape, lambda *i: T(*i), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) xo1, 
xo2 = s[C].split(xo, factor=13) s[C].parallel(xo2) @@ -129,9 +130,9 @@ def check_llvm(): if not tvm.runtime.enabled("llvm"): return # Specifically allow offset to test codepath when offset is available - Ab = tvm.decl_buffer( + Ab = tvm.tir.decl_buffer( A.shape, A.dtype, - elem_offset=tvm.size_var('Aoffset'), + elem_offset=te.size_var('Aoffset'), offset_factor=8, name='A') binds = {A : Ab} @@ -147,16 +148,16 @@ def check_llvm(): tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) - with tvm.build_config(offset_factor=4): + with tvm.target.build_config(offset_factor=4): check_llvm() def test_llvm_persist_parallel(): n = 128 - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B') - C = tvm.compute(A.shape, lambda *i: tvm.sqrt(B(*i)) * 2 + 2, name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1, name='B') + C = te.compute(A.shape, lambda *i: te.sqrt(B(*i)) * 2 + 2, name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=8) xo1, xo2 = s[C].split(xo, nparts=1) s[B].compute_at(s[C], xo1) @@ -187,10 +188,10 @@ def test_llvm_flip_pipeline(): def check_llvm(nn, base): if not tvm.runtime.enabled("llvm"): return - n = tvm.convert(nn) - A = tvm.placeholder((n + base), name='A') - C = tvm.compute((n,), lambda i: A(nn + base- i - 1), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n + base), name='A') + C = te.compute((n,), lambda i: A(nn + base- i - 1), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) @@ -214,10 +215,10 @@ def test_llvm_vadd_pipeline(): def check_llvm(n, lanes): if not tvm.runtime.enabled("llvm"): return - A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes) - B = tvm.compute((n,), lambda i: A[i], name='B') - C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n,), name='A', dtype="float32x%d" % lanes) + B = te.compute((n,), lambda i: A[i], name='B') + C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], nparts=2) _, xi = s[C].split(xi, factor=2) s[C].parallel(xo) @@ -243,10 +244,10 @@ def test_llvm_madd_pipeline(): def check_llvm(nn, base, stride): if not tvm.runtime.enabled("llvm"): return - n = tvm.convert(nn) - A = tvm.placeholder((n + base, stride), name='A') - C = tvm.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n + base, stride), name='A') + C = te.compute((n, stride), lambda i, j: A(base + i, j) + 1, name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) @@ -262,17 +263,17 @@ def check_llvm(nn, base, stride): c.asnumpy(), a.asnumpy()[base:] + 1) check_llvm(64, 0, 2) check_llvm(4, 0, 1) - with tvm.build_config(restricted_func=False): + with tvm.target.build_config(restricted_func=False): check_llvm(4, 0, 3) def test_llvm_temp_space(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda i: A(i) + 1, name='B') - C = tvm.compute(A.shape, lambda i: B(i) + 1, name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda i: A(i) + 1, name='B') + C = 
te.compute(A.shape, lambda i: B(i) + 1, name='C') + s = te.create_schedule(C.op) def check_llvm(): if not tvm.runtime.enabled("llvm"): @@ -291,11 +292,11 @@ def check_llvm(): def test_multiple_func(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) @@ -328,9 +329,9 @@ def test_llvm_condition(): def check_llvm(n, offset): if not tvm.runtime.enabled("llvm"): return - A = tvm.placeholder((n, ), name='A') - C = tvm.compute((n,), lambda i: tvm.if_then_else(i >= offset, A[i], 0.0), name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n, ), name='A') + C = te.compute((n,), lambda i: tvm.tir.if_then_else(i >= offset, A[i], 0.0), name='C') + s = te.create_schedule(C.op) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) @@ -348,9 +349,9 @@ def test_llvm_bool(): def check_llvm(n): if not tvm.runtime.enabled("llvm"): return - A = tvm.placeholder((n, ), name='A', dtype="int32") - C = tvm.compute((n,), lambda i: A[i].equal(1).astype("float"), name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n, ), name='A', dtype="int32") + C = te.compute((n,), lambda i: A[i].equal(1).astype("float"), name='C') + s = te.create_schedule(C.op) # build and invoke the kernel. f = tvm.build(s, [A, C], "llvm") ctx = tvm.cpu(0) @@ -367,12 +368,12 @@ def test_rank_zero(): def check_llvm(n): if not tvm.runtime.enabled("llvm"): return - A = tvm.placeholder((n, ), name='A') - scale = tvm.placeholder((), name='scale') - k = tvm.reduce_axis((0, n), name="k") - C = tvm.compute((), lambda : tvm.sum(A[k] * scale(), axis=k), name="C") - D = tvm.compute((), lambda : C() + 1) - s = tvm.create_schedule(D.op) + A = te.placeholder((n, ), name='A') + scale = te.placeholder((), name='scale') + k = te.reduce_axis((0, n), name="k") + C = te.compute((), lambda : te.sum(A[k] * scale(), axis=k), name="C") + D = te.compute((), lambda : C() + 1) + s = te.create_schedule(D.op) # build and invoke the kernel. f = tvm.build(s, [A, scale, D], "llvm") ctx = tvm.cpu(0) @@ -390,13 +391,13 @@ def test_rank_zero_bound_checkers(): def check_llvm(n): if not tvm.runtime.enabled("llvm"): return - with tvm.build_config(instrument_bound_checkers=True): - A = tvm.placeholder((n, ), name='A') - scale = tvm.placeholder((), name='scale') - k = tvm.reduce_axis((0, n), name="k") - C = tvm.compute((), lambda : tvm.sum(A[k] * scale(), axis=k), name="C") - D = tvm.compute((), lambda : C() + 1) - s = tvm.create_schedule(D.op) + with tvm.target.build_config(instrument_bound_checkers=True): + A = te.placeholder((n, ), name='A') + scale = te.placeholder((), name='scale') + k = te.reduce_axis((0, n), name="k") + C = te.compute((), lambda : te.sum(A[k] * scale(), axis=k), name="C") + D = te.compute((), lambda : C() + 1) + s = te.create_schedule(D.op) # build and invoke the kernel. 
f = tvm.build(s, [A, scale, D], "llvm") ctx = tvm.cpu(0) @@ -412,10 +413,10 @@ def check_llvm(n): def test_alignment(): - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda i: A[i] * 3, name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda i: A[i] * 3, name='B') + s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=8) s[B].vectorize(tx) f = tvm.build(s, [A, B], "llvm") @@ -427,26 +428,26 @@ def test_alignment(): def test_llvm_div(): """Check that the semantics of div and mod is correct""" def check(start, end, dstart, dend, dtype, floor_div=False): - div = tvm.floordiv if floor_div else tvm.truncdiv - mod = tvm.floormod if floor_div else tvm.truncmod + div = tvm.te.floordiv if floor_div else tvm.tir.truncdiv + mod = tvm.te.floormod if floor_div else tvm.tir.truncmod # A are dividends, B are divisors. Note that we add 1 to make include end in the range. - A = tvm.placeholder((end - start + 1,), name="A", dtype=dtype) - B = tvm.placeholder((dend - dstart + 1,), name="B", dtype=dtype) + A = te.placeholder((end - start + 1,), name="A", dtype=dtype) + B = te.placeholder((dend - dstart + 1,), name="B", dtype=dtype) # We clip values with min and max so that simplifiers know the ranges of values - clipa = lambda x: tvm.min(tvm.const(end, dtype), tvm.max(tvm.const(start, dtype), x)) - clipb = lambda x: tvm.min(tvm.const(dend, dtype), tvm.max(tvm.const(dstart, dtype), x)) + clipa = lambda x: tvm.te.min(tvm.tir.const(end, dtype), tvm.te.max(tvm.tir.const(start, dtype), x)) + clipb = lambda x: tvm.te.min(tvm.tir.const(dend, dtype), tvm.te.max(tvm.tir.const(dstart, dtype), x)) # If the range is just a single point, use the constant itself if start == end: - clipa = lambda x: tvm.const(start, dtype) + clipa = lambda x: tvm.tir.const(start, dtype) if dstart == dend: - clipb = lambda x: tvm.const(dstart, dtype) + clipb = lambda x: tvm.tir.const(dstart, dtype) # D are division results and M are modulo results - [D, M] = tvm.compute((end - start + 1, dend - dstart + 1), + [D, M] = te.compute((end - start + 1, dend - dstart + 1), lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j])))) - s = tvm.create_schedule([D.op, M.op]) + s = te.create_schedule([D.op, M.op]) f = tvm.build(s, [A, B, D, M], "llvm") # Fill input arrays with values @@ -525,10 +526,10 @@ def _show_info(): def test_llvm_fp_math(): def check_llvm_reciprocal(n): - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda i: tvm.div(1.0,(1e+37*A[i])), name='B') + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda i: te.div(1.0,(1e+37*A[i])), name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") a = tvm.nd.array(np.full((n,), 100, 'float32')) @@ -541,10 +542,10 @@ def check_llvm_reciprocal(n): check_llvm_reciprocal(16) def check_llvm_sigmoid(n): - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda i: tvm.sigmoid(A[i]), name='B') + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda i: te.sigmoid(A[i]), name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") a = tvm.nd.array(np.full((n,), -1000, 'float32')) @@ -559,11 +560,11 @@ def check_llvm_sigmoid(n): def test_dwarf_debug_information(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, 
lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) s[C].parallel(xo) s[C].vectorize(xi) @@ -634,27 +635,27 @@ def check_llvm_ir(): def test_llvm_shuffle(): - a = tvm.placeholder((8, ), 'int32') - b = tvm.placeholder((8, ), 'int32') - c = tvm.compute((8, ), lambda x: a[x] + b[7-x]) - sch = tvm.create_schedule(c.op) + a = te.placeholder((8, ), 'int32') + b = te.placeholder((8, ), 'int32') + c = te.compute((8, ), lambda x: a[x] + b[7-x]) + sch = te.create_schedule(c.op) def my_vectorize(stmt): def vectorizer(op): store = op.body - idx = tvm.tir.Ramp(tvm.const(0, 'int32'), tvm.const(1, 'int32'), 8) - all_ones = tvm.const(1, 'int32x8') + idx = tvm.tir.Ramp(tvm.tir.const(0, 'int32'), tvm.tir.const(1, 'int32'), 8) + all_ones = tvm.tir.const(1, 'int32x8') value = store.value - b_idx = tvm.tir.Shuffle([idx], [tvm.const(i, 'int32') for i in range(7, -1, -1)]) + b_idx = tvm.tir.Shuffle([idx], [tvm.tir.const(i, 'int32') for i in range(7, -1, -1)]) new_a = tvm.tir.Load('int32x8', value.a.buffer_var, idx, all_ones) new_b = tvm.tir.Load('int32x8', value.b.buffer_var, b_idx, all_ones) value = new_a + new_b return tvm.tir.Store(store.buffer_var, new_a + new_b, idx, all_ones) - return tvm.ir_pass.IRTransform(stmt, None, vectorizer, ['For']) + return tvm.tir.ir_pass.IRTransform(stmt, None, vectorizer, ['For']) - with tvm.build_config(add_lower_pass=[(1, my_vectorize)]): + with tvm.target.build_config(add_lower_pass=[(1, my_vectorize)]): ir = tvm.lower(sch, [a, b, c], simple_mode=True) module = tvm.build(sch, [a, b, c]) a_ = tvm.nd.array(np.arange(1, 9, dtype='int32')) diff --git a/tests/python/unittest/test_codegen_opencl.py b/tests/python/unittest/test_codegen_opencl.py index 3b9b4a73c52d..140e1f6fbdea 100644 --- a/tests/python/unittest/test_codegen_opencl.py +++ b/tests/python/unittest/test_codegen_opencl.py @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. 
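# Scalar expression helpers follow the same split: tvm.const becomes
# tvm.tir.const, tvm.if_then_else becomes tvm.tir.if_then_else, and the
# elementwise max becomes tvm.te.max, as in the OpenCL hunks below. A minimal
# sketch assuming the post-refactor API:
import tvm
from tvm import te

A = te.placeholder((8,), name="A", dtype="float32")
one = tvm.tir.const(1, dtype="float32")                  # was tvm.const
three = tvm.tir.const(3, dtype="float32")
sel = tvm.tir.if_then_else(A[0] > 0, one, three)         # was tvm.if_then_else
C = te.compute((8,), lambda i: tvm.te.max(A[i], sel), name="C")
s = te.create_schedule(C.op)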
import tvm +from tvm import te target = 'opencl' def test_opencl_ternary_expression(): def check_if_then_else(ctx, n, dtype): - A = tvm.placeholder((n,), name='A', dtype=dtype) - true_value = tvm.const(1, dtype=dtype) - false_value = tvm.const(3, dtype=dtype) - max_lhs = tvm.const(2, dtype=dtype) - max_rhs = tvm.if_then_else(A[0] > 0, true_value, false_value) - C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C') - s = tvm.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x")) + A = te.placeholder((n,), name='A', dtype=dtype) + true_value = tvm.tir.const(1, dtype=dtype) + false_value = tvm.tir.const(3, dtype=dtype) + max_lhs = tvm.tir.const(2, dtype=dtype) + max_rhs = tvm.tir.if_then_else(A[0] > 0, true_value, false_value) + C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name='C') + s = te.create_schedule(C.op) + s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n,), A.dtype, ctx) @@ -36,14 +37,14 @@ def check_if_then_else(ctx, n, dtype): fun(a, c) def check_select(ctx, n, dtype): - A = tvm.placeholder((n,), name='A', dtype=dtype) - true_value = tvm.const(1, dtype=dtype) - false_value = tvm.const(3, dtype=dtype) - max_lhs = tvm.const(2, dtype=dtype) + A = te.placeholder((n,), name='A', dtype=dtype) + true_value = tvm.tir.const(1, dtype=dtype) + false_value = tvm.tir.const(3, dtype=dtype) + max_lhs = tvm.tir.const(2, dtype=dtype) max_rhs = tvm.tir.Select(A[0] > 0, true_value, false_value) - C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C') - s = tvm.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x")) + C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name='C') + s = te.create_schedule(C.op) + s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n,), A.dtype, ctx) @@ -68,11 +69,11 @@ def check_select(ctx, n, dtype): def test_opencl_inf_nan(): def check_inf_nan(ctx, n, value, dtype): - A = tvm.placeholder((n,), name='A', dtype=dtype) - inf_value = tvm.const(value, dtype=dtype) - C = tvm.compute((n,), lambda i: inf_value, name='C') - s = tvm.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x")) + A = te.placeholder((n,), name='A', dtype=dtype) + inf_value = tvm.tir.const(value, dtype=dtype) + C = te.compute((n,), lambda i: inf_value, name='C') + s = te.create_schedule(C.op) + s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n,), A.dtype, ctx) c = tvm.nd.empty((n,), A.dtype, ctx) diff --git a/tests/python/unittest/test_codegen_rocm.py b/tests/python/unittest/test_codegen_rocm.py index 73f76465649a..f107e592d2d3 100644 --- a/tests/python/unittest/test_codegen_rocm.py +++ b/tests/python/unittest/test_codegen_rocm.py @@ -15,23 +15,24 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te import numpy as np import unittest -tx = tvm.thread_axis("threadIdx.x") -ty = tvm.thread_axis("threadIdx.y") -bx = tvm.thread_axis("blockIdx.x") -by = tvm.thread_axis("blockIdx.y") +tx = te.thread_axis("threadIdx.x") +ty = te.thread_axis("threadIdx.y") +bx = te.thread_axis("blockIdx.x") +by = te.thread_axis("blockIdx.y") @unittest.skipIf(not tvm.rocm(0).exist or not tvm.runtime.enabled("rocm"), "skip because rocm is not enabled..") def test_rocm_cross_thread_reduction(): # based on the reduction tutorial - n = tvm.size_var("n") - m = tvm.size_var("m") - A = tvm.placeholder((n, m), name='A') - k = tvm.reduce_axis((0, m), "k") - B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B") - s = tvm.create_schedule(B.op) + n = te.size_var("n") + m = te.size_var("m") + A = te.placeholder((n, m), name='A') + k = te.reduce_axis((0, m), "k") + B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") + s = te.create_schedule(B.op) ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) BF = s.rfactor(B, ki) xo, xi = s[B].split(s[B].op.axis[0], factor=32) @@ -54,10 +55,10 @@ def test_rocm_cross_thread_reduction(): @unittest.skipIf(not tvm.rocm(0).exist or not tvm.runtime.enabled("rocm"), "skip because rocm is not enabled..") def test_rocm_inf_nan(): def check_inf_nan(ctx, n, value, dtype): - A = tvm.placeholder((n,), name='A', dtype=dtype) - inf_value = tvm.const(value, dtype=dtype) - C = tvm.compute((n,), lambda i: inf_value, name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n,), name='A', dtype=dtype) + inf_value = tvm.tir.const(value, dtype=dtype) + C = te.compute((n,), lambda i: inf_value, name='C') + s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], tx) fun = tvm.build(s, [A, C], "rocm") a = tvm.nd.empty((n,), A.dtype, ctx) @@ -76,12 +77,12 @@ def check_inf_nan(ctx, n, value, dtype): @unittest.skipIf(not tvm.rocm(0).exist or not tvm.runtime.enabled("rocm"), "skip because rocm is not enabled..") def test_rocm_reducition_binding(): - k = tvm.reduce_axis((0, 32), 'k') - A = tvm.placeholder((96, 32), name='A') - B = tvm.compute( (96,), lambda m: - tvm.sum(A[m, k], axis=k), + k = te.reduce_axis((0, 32), 'k') + A = te.placeholder((96, 32), name='A') + B = te.compute( (96,), lambda m: + te.sum(A[m, k], axis=k), name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s[B].reorder(B.op.reduce_axis[0], B.op.axis[0]) @@ -92,7 +93,7 @@ def test_rocm_reducition_binding(): def test_rocm_copy(): def check_rocm(dtype, n): - A = tvm.placeholder((n,), name='A', dtype=dtype) + A = te.placeholder((n,), name='A', dtype=dtype) ctx = tvm.rocm(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np) @@ -111,9 +112,9 @@ def test_rocm_vectorize_add(): num_thread = 8 def check_rocm(dtype, n, lanes): - A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) - B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes)) + B = te.compute((n,), lambda i: A[i]+tvm.tir.const(1, A.dtype), name='B') + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=num_thread) s[B].bind(xo, bx) s[B].bind(xi, tx) diff --git a/tests/python/unittest/test_codegen_static_init.py b/tests/python/unittest/test_codegen_static_init.py index 4d71cb3929a7..3b5f17a4243a 100644 --- a/tests/python/unittest/test_codegen_static_init.py +++ b/tests/python/unittest/test_codegen_static_init.py 
@@ -15,24 +15,25 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import ctypes
 import numpy as np

 def test_static_callback():
     dtype = 'int64'
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((n, ), dtype)
-    i = tvm.size_var('i')
-    ib = tvm.ir_builder.create()
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((n, ), dtype)
+    i = te.size_var('i')
+    ib = tvm.tir.ir_builder.create()
     A = ib.buffer_ptr(Ab)
-    cp = tvm.thread_axis((0, 1), "cop")
+    cp = te.thread_axis((0, 1), "cop")
     finit = tvm.tir.StringImm("TVMBackendRunOnce")
     ib.scope_attr(cp, "coproc_uop_scope", finit)
     with ib.for_range(0, n, "i", for_type="parallel") as i:
         A[i] = A[i] + 1
     stmt = ib.get()
-    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
-    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
+    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
+    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
     f = tvm.target.codegen.build_module(fapi, "llvm")
     a = tvm.nd.array(np.zeros(10, dtype=dtype))
     f(a)
@@ -41,13 +42,13 @@ def test_static_callback():

 def test_static_init():
     dtype = 'int64'
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((n, ), dtype)
-    i = tvm.size_var('i')
-    ib = tvm.ir_builder.create()
-    handle = tvm.call_intrin("handle", "tvm_static_handle")
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((n, ), dtype)
+    i = te.size_var('i')
+    ib = tvm.tir.ir_builder.create()
+    handle = tvm.tir.call_intrin("handle", "tvm_static_handle")
     ib.emit(
-        tvm.call_packed("test_static_callback", handle, Ab))
+        tvm.tir.call_packed("test_static_callback", handle, Ab))

     @tvm.register_func("test_static_callback")
     def test_cb(sh, A):
@@ -55,8 +56,8 @@ def test_cb(sh, A):
         return sh

     stmt = ib.get()
-    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
-    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
+    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
+    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
     f = tvm.target.codegen.build_module(fapi, "llvm")
     a = tvm.nd.array(np.zeros(10, dtype=dtype))
     f(a)
diff --git a/tests/python/unittest/test_codegen_vm_basic.py b/tests/python/unittest/test_codegen_vm_basic.py
index 7f08c75366e6..e2ff4875e6fd 100644
--- a/tests/python/unittest/test_codegen_vm_basic.py
+++ b/tests/python/unittest/test_codegen_vm_basic.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import numpy as np

 def run_jit(fapi, check):
@@ -32,12 +33,12 @@ def tvm_call_back_get_shape(shape0):
         print(shape0)
         assert shape0 == a.shape[0]

-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((n, ), tvm.float32)
-    stmt = tvm.tir.Evaluate(tvm.call_packed("tvm_call_back_get_shape", Ab.shape[0]))
-    fapi = tvm.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 0, True)
-    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
-    fapi = tvm.ir_pass.LowerIntrin(fapi, "stackvm")
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((n, ), "float32")
+    stmt = tvm.tir.Evaluate(tvm.tir.call_packed("tvm_call_back_get_shape", Ab.shape[0]))
+    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "print_shape", [Ab], 0, True)
+    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
+    fapi = tvm.tir.ir_pass.LowerIntrin(fapi, "stackvm")
     run_jit(fapi, lambda f: f(a))

@@ -47,19 +48,19 @@ def tvm_stack_vm_print(*x):

 def test_stack_vm_loop():
     dtype = 'int64'
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((n, ), dtype)
-    i = tvm.size_var('i')
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((n, ), dtype)
+    i = te.size_var('i')

-    ib = tvm.ir_builder.create()
+    ib = tvm.tir.ir_builder.create()
     A = ib.buffer_ptr(Ab)
     with ib.for_range(0, n - 1, "i") as i:
         A[i + 1] = A[i] + 1
-        ib.emit(tvm.call_packed("tvm_stack_vm_print", i))
+        ib.emit(tvm.tir.call_packed("tvm_stack_vm_print", i))

     stmt = ib.get()
-    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
-    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
+    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
+    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
     a = tvm.nd.array(np.zeros(10, dtype=dtype))
     def check(f):
         f(a)
@@ -69,10 +70,10 @@ def check(f):

 def test_stack_vm_cond():
     dtype = 'int64'
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((n, ), dtype)
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((n, ), dtype)

-    ib = tvm.ir_builder.create()
+    ib = tvm.tir.ir_builder.create()
     A = ib.buffer_ptr(Ab)
     with ib.for_range(0, n - 1, "i") as i:
         with ib.if_scope(tvm.tir.EQ(i, 4)):
@@ -81,8 +82,8 @@ def test_stack_vm_cond():
             A[i + 1] = A[i] + 2

     stmt = ib.get()
-    fapi = tvm.ir_pass.MakeAPI(stmt, "test", [Ab], 0, True)
-    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
+    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "test", [Ab], 0, True)
+    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
     def check(f):
         a = tvm.nd.array(np.zeros(10, dtype=dtype))
         f(a)
@@ -93,16 +94,16 @@ def check(f):

 def test_vm_parallel():
     dtype = 'int64'
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((n, ), dtype)
-    i = tvm.size_var('i')
-    ib = tvm.ir_builder.create()
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((n, ), dtype)
+    i = te.size_var('i')
+    ib = tvm.tir.ir_builder.create()
     A = ib.buffer_ptr(Ab)
     with ib.for_range(0, n, "i", for_type="parallel") as i:
         A[i] = A[i] + 1
     stmt = ib.get()
-    fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
-    fapi = tvm.ir_pass.LowerTVMBuiltin(fapi)
+    fapi = tvm.tir.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True)
+    fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi)
     def check(f):
         a = tvm.nd.array(np.zeros(10, dtype=dtype))
         f(a)
diff --git a/tests/python/unittest/test_codegen_vulkan.py b/tests/python/unittest/test_codegen_vulkan.py
index d480a0f6ead8..722a9ec6be15 100644
--- a/tests/python/unittest/test_codegen_vulkan.py
+++ b/tests/python/unittest/test_codegen_vulkan.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import re
 import numpy as np

@@ -28,18 +29,18 @@ def test_vector_comparison():

     def check_correct_assembly(dtype):
         n = (1024,)
-        A = tvm.placeholder(n, dtype=dtype, name='A')
-        B = tvm.compute(
+        A = te.placeholder(n, dtype=dtype, name='A')
+        B = te.compute(
             A.shape,
             lambda i: tvm.tir.Select(
-                A[i] >= 0, A[i] + tvm.const(1, dtype),
-                tvm.const(0, dtype)), name='B')
-        s = tvm.create_schedule(B.op)
+                A[i] >= 0, A[i] + tvm.tir.const(1, dtype),
+                tvm.tir.const(0, dtype)), name='B')
+        s = te.create_schedule(B.op)

         (bx, tx) = s[B].split(s[B].op.axis[0], factor=128)
         (tx, vx) = s[B].split(tx, factor=4)
-        s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
-        s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(bx, te.thread_axis("blockIdx.x"))
+        s[B].bind(tx, te.thread_axis("threadIdx.x"))
         s[B].vectorize(vx)
         f = tvm.build(s, [A, B], target)

@@ -55,8 +56,8 @@ def check_correct_assembly(dtype):
     check_correct_assembly('float16')


-tx = tvm.thread_axis("threadIdx.x")
-bx = tvm.thread_axis("blockIdx.x")
+tx = te.thread_axis("threadIdx.x")
+bx = te.thread_axis("blockIdx.x")


 def test_vulkan_copy():
@@ -65,7 +66,7 @@ def check_vulkan(dtype, n):
         if not tvm.vulkan(0).exist or not tvm.runtime.enabled("vulkan"):
             print("skip because vulkan is not enabled..")
             return
-        A = tvm.placeholder((n,), name='A', dtype=dtype)
+        A = te.placeholder((n,), name='A', dtype=dtype)
         ctx = tvm.vulkan(0)
         a_np = np.random.uniform(size=(n,)).astype(A.dtype)
         a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np)
@@ -87,9 +88,9 @@ def check_vulkan(dtype, n, lanes):
         if not tvm.vulkan(0).exist or not tvm.runtime.enabled("vulkan"):
             print("skip because vulkan is not enabled..")
             return
-        A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
-        B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B')
-        s = tvm.create_schedule(B.op)
+        A = te.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
+        B = te.compute((n,), lambda i: A[i]+tvm.tir.const(1, A.dtype), name='B')
+        s = te.create_schedule(B.op)
         xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
         s[B].bind(xo, bx)
         s[B].bind(xi, tx)
@@ -120,21 +121,21 @@ def worker():
         if not tvm.vulkan(0).exist or not tvm.runtime.enabled("vulkan"):
             print("skip because vulkan is not enabled..")
             return
-        A = tvm.placeholder((n,), name='A', dtype="float32")
-        B = tvm.placeholder((n,), name='B', dtype="float32")
+        A = te.placeholder((n,), name='A', dtype="float32")
+        B = te.placeholder((n,), name='B', dtype="float32")
         functions = [
-            (lambda: tvm.compute((n,), lambda i: 2 * A[i] + 3 * B[i]),
+            (lambda: te.compute((n,), lambda i: 2 * A[i] + 3 * B[i]),
             lambda a, b: 2 * a + 3 * b),
-            (lambda: tvm.compute((n,), lambda i: A[i]+B[i]),
+            (lambda: te.compute((n,), lambda i: A[i]+B[i]),
             lambda a, b: a + b),
-            (lambda: tvm.compute((n,), lambda i: A[i]+2 * B[i]),
+            (lambda: te.compute((n,), lambda i: A[i]+2 * B[i]),
             lambda a, b: a + 2 * b),
         ]

        def build_f(f_ref):
            (C_f, ref) = f_ref
            C = C_f()
-            s = tvm.create_schedule(C.op)
+            s = te.create_schedule(C.op)
            xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
            s[C].bind(xo, bx)
            s[C].bind(xi, tx)
diff --git a/tests/python/unittest/test_codegen_x86.py b/tests/python/unittest/test_codegen_x86.py
index e17c6cf8cbcc..cdba774ff10d 100644
--- a/tests/python/unittest/test_codegen_x86.py
+++ b/tests/python/unittest/test_codegen_x86.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import re

@@ -26,10 +27,10 @@ def test_fp16_to_fp32():

     def fp16_to_fp32(target, width, match=None, not_match=None):
         elements = 64
-        n = tvm.convert(elements)
-        A = tvm.placeholder((n, width), dtype="float16", name='A')
-        B = tvm.compute(A.shape, lambda *i: A(*i).astype("float32"), name='B')
-        s = tvm.create_schedule(B.op)
+        n = tvm.runtime.convert(elements)
+        A = te.placeholder((n, width), dtype="float16", name='A')
+        B = te.compute(A.shape, lambda *i: A(*i).astype("float32"), name='B')
+        s = te.create_schedule(B.op)
         s[B].vectorize(s[B].op.axis[1])
         f = tvm.build(s, [A, B], target)

diff --git a/tests/python/unittest/test_container.py b/tests/python/unittest/test_container.py
index f7ffd0288f1b..84b26be6cbc1 100644
--- a/tests/python/unittest/test_container.py
+++ b/tests/python/unittest/test_container.py
@@ -17,6 +17,7 @@

 import numpy as np
 import tvm
+from tvm import te
 from tvm import nd, relay
 from tvm.runtime import container as _container

diff --git a/tests/python/unittest/test_custom_datatypes_mybfloat16.py b/tests/python/unittest/test_custom_datatypes_mybfloat16.py
index cae481353d6b..32f6e1865b63 100644
--- a/tests/python/unittest/test_custom_datatypes_mybfloat16.py
+++ b/tests/python/unittest/test_custom_datatypes_mybfloat16.py
@@ -16,6 +16,7 @@
 # under the License.

 import tvm
+from tvm import te
 from ctypes import *
 import topi
 import tvm.tir.ir_pass as ir_pass
@@ -56,14 +57,14 @@ def lower_datatypes_and_build(schedule, args):
     return tvm.build(flist[0], target=tgt)

 def test_bfloat_add_and_cast_1():
-    X = tvm.placeholder((3, ), name="X")
-    Y = tvm.placeholder((3, ), name="Y")
+    X = te.placeholder((3, ), name="X")
+    Y = te.placeholder((3, ), name="Y")
     Z = topi.cast(
         topi.cast(X, dtype="custom[bfloat]16") +
         topi.cast(Y, dtype="custom[bfloat]16"),
         dtype="float")

-    s = tvm.create_schedule([Z.op])
+    s = te.create_schedule([Z.op])
     built_cast = lower_datatypes_and_build(s, [X,Y,Z])

     ctx = tvm.context(tgt, 0)
@@ -87,14 +88,14 @@ def test_bfloat_add_and_cast_1():


 def test_bfloat_add_and_cast_2():
-    X = tvm.placeholder((3, ), name="X")
-    Y = tvm.placeholder((3, ), name="Y")
+    X = te.placeholder((3, ), name="X")
+    Y = te.placeholder((3, ), name="Y")
     Z = topi.cast(
         topi.cast(X, dtype="custom[bfloat]16") +
         topi.cast(Y, dtype="custom[bfloat]16"),
         dtype="float")

-    s = tvm.create_schedule([Z.op])
+    s = te.create_schedule([Z.op])
     built_cast = lower_datatypes_and_build(s, [X,Y,Z])

     ctx = tvm.context(tgt, 0)
@@ -122,14 +123,14 @@ def test_bfloat_add_and_cast_2():


 def test_bfloat_add_and_cast_FloatImm():
-    X = tvm.placeholder((3, ), name="X")
+    X = te.placeholder((3, ), name="X")
     Z = topi.cast(
         topi.add(
             topi.cast(X, dtype="custom[bfloat]16"),
             tvm.tir.FloatImm("custom[bfloat]16", 1.5)),
         dtype="float")

-    s = tvm.create_schedule([Z.op])
+    s = te.create_schedule([Z.op])
     built_cast = lower_datatypes_and_build(s, [X,Z])

     ctx = tvm.context(tgt, 0)
diff --git a/tests/python/unittest/test_graph_tuner_core.py b/tests/python/unittest/test_graph_tuner_core.py
index 27e077f5319c..a7be18a5a2d3 100644
--- a/tests/python/unittest/test_graph_tuner_core.py
+++ b/tests/python/unittest/test_graph_tuner_core.py
@@ -24,6 +24,7 @@
 import copy
 import numpy as np
 import tvm
+from tvm import te
 import tvm.relay.testing

 from tvm import autotvm
@@ -79,20 +80,20 @@ def _create_data(target, dshape, dtype, layout):
             records.append((ms_input, ms_output))

     ltf_records = []
-    ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
+    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
     ltf_task = autotvm.task.create('layout_transform', ltf_arg, target)
     ms_input = MeasureInput(target=target, task=ltf_task, config=None)
     ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
     ltf_records.append((ms_input, ms_output))

     ltf_keys = []
-    ltf_arg = [tvm.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"]
+    ltf_arg = [te.placeholder((1, 4, 8, 8, 4), dtype=dtype), "NCHW4c", "NCHW8c"]
     ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform')
     ltf_keys.append(ltf_wkl)
-    ltf_arg = [tvm.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"]
+    ltf_arg = [te.placeholder((1, 1, 8, 8, 32), dtype=dtype), "NCHW32c", "NCHW4c"]
     ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform')
     ltf_keys.append(ltf_wkl)
-    ltf_arg = [tvm.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"]
+    ltf_arg = [te.placeholder((1, 4, 8, 8, 8), dtype=dtype), "NCHW8c", "NCHW32c"]
     ltf_wkl = autotvm.task.args_to_workload(ltf_arg, 'layout_transform')
     ltf_keys.append(ltf_wkl)

@@ -314,7 +315,7 @@ def test_many_sub_graphs():
         records.append((ms_input, ms_output))

     ltf_records = []
-    ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
+    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
     ltf_task = autotvm.task.create('layout_transform', ltf_arg, target)
     ms_input = MeasureInput(target=target, task=ltf_task, config=None)
     ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
@@ -397,7 +398,7 @@ def test_tuple():
         records.append((ms_input, ms_output))

     ltf_records = []
-    ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
+    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
     ltf_task = autotvm.task.create('layout_transform', ltf_arg, target)
     ms_input = MeasureInput(target=target, task=ltf_task, config=None)
     ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
@@ -497,7 +498,7 @@ def test_triangle_block():
         records.append((ms_input, ms_output))

     ltf_records = []
-    ltf_arg = [tvm.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
+    ltf_arg = [te.placeholder((1, 64, 16, 16, 8), dtype=dtype), "NCHW8c", "NCHW512c"]
     ltf_task = autotvm.task.create('layout_transform', ltf_arg, target)
     ms_input = MeasureInput(target=target, task=ltf_task, config=None)
     ms_output = MeasureResult(costs=(1.91224744e-05,), error_no=0, all_cost=-1, timestamp=-1)
diff --git a/tests/python/unittest/test_graph_tuner_utils.py b/tests/python/unittest/test_graph_tuner_utils.py
index 112c5b8a7059..f620accb1719 100644
--- a/tests/python/unittest/test_graph_tuner_utils.py
+++ b/tests/python/unittest/test_graph_tuner_utils.py
@@ -21,6 +21,7 @@
 # https://github.com/apache/incubator-tvm/issues/3240
 # TODO: restore the file name after this issue is resolved.
 import tvm
+from tvm import te
 from tvm import autotvm, relay
 from tvm.relay.testing import resnet

diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 311dae803dba..3e90442d6ee8 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm, inspect, sys, traceback, numpy, pytest, types, os
+
+from tvm import te
 from tvm.contrib import util
 from tvm.hybrid import script
 from tvm.hybrid.runtime import HYBRID_GLOBALS
@@ -22,8 +24,8 @@
 @pytest.mark.skip
 def run_and_check(func, args, var_dict={}, target='llvm', sch=None, outs=None):
     def tvm_val_2_py_val(val):
-        val = tvm.ir_pass.Substitute(val, var_dict)
-        val = tvm.ir_pass.Simplify(val)
+        val = tvm.tir.ir_pass.Substitute(val, var_dict)
+        val = tvm.tir.ir_pass.Simplify(val)
         assert isinstance(val, (tvm.tir.IntImm,))
         return val.value

@@ -31,9 +33,9 @@ def tvm_val_2_py_val(val):
     op = None

     if sch is None:
-        outs = func(*tuple(tvm.convert(i) if isinstance(i, list) else i for i in args))
+        outs = func(*tuple(tvm.runtime.convert(i) if isinstance(i, list) else i for i in args))
         op = outs[0].op if isinstance(outs, list) else outs.op
-        sch = tvm.create_schedule(op)
+        sch = te.create_schedule(op)
     else:
         assert outs is not None
         assert isinstance(outs, list)
@@ -42,7 +44,7 @@ def tvm_val_2_py_val(val):
     emu_args = []
     nd_args = []
     for i in args:
-        if isinstance(i, tvm.tensor.Tensor):
+        if isinstance(i, te.tensor.Tensor):
             shape = [tvm_val_2_py_val(j) for j in i.shape]
             emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
             nd_args.append(tvm.nd.array(emu_args[-1], ctx))
@@ -53,7 +55,7 @@ def tvm_val_2_py_val(val):
             assert isinstance(i, list)
             emu_args.append(numpy.array(i))

-    compile_args = [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.tir.Var))] + \
+    compile_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))] + \
                    (outs if isinstance(outs, list) else [outs])
     module = tvm.build(sch,
                        compile_args,
@@ -76,8 +78,8 @@ def tvm_val_2_py_val(val):
     for nd, np in zip(out_tensors, ref_data):
         tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)

-    module_args = [i for i in args if isinstance(i, (tvm.tensor.Tensor, tvm.tir.Var))]
-    module_outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    module_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))]
+    module_outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
     h_module = tvm.hybrid.build(sch, module_args, module_outs)

     return h_module, module_args, module_outs
@@ -98,10 +100,10 @@ def outer_product(n, m, a, b):
 #Test global function
 #Test bridge between frontend and backend
 def test_outer_product():
-    n = tvm.size_var('n')
-    m = tvm.size_var('m')
-    a = tvm.placeholder((n, ), name='a')
-    b = tvm.placeholder((m, ), name='b')
+    n = te.size_var('n')
+    m = te.size_var('m')
+    a = te.placeholder((n, ), name='a')
+    b = te.placeholder((m, ), name='b')

     try:
         c = outer_product(n, m, a, b)
@@ -167,8 +169,8 @@ def fanout(n, a):
             b[i] = sigma
         return b

-    n = tvm.size_var('n')
-    a = tvm.placeholder((n, ), 'float32', name='a')
+    n = te.size_var('n')
+    a = te.placeholder((n, ), 'float32', name='a')
     try:
         b = fanout(n, a)
         ir = b.op.body
@@ -180,7 +182,7 @@ def fanout(n, a):
     assert isinstance(ir, tvm.tir.For)
     assert ir.loop_var.name == 'i'
     assert ir.min.value == 0
-    assert tvm.ir_pass.Equal(ir.extent, n - 3)
+    assert tvm.tir.ir_pass.Equal(ir.extent, n - 3)
     #Check loopbody
     ibody = ir.body
     assert isinstance(ibody, tvm.tir.AttrStmt)
@@ -213,7 +215,7 @@ def fanout(n, a):
     assert value.a.args[0].value == 0
     assert value.b.name == 'a'
     assert len(value.b.args) == 1
-    assert tvm.ir_pass.Equal(value.b.args[0], ir.loop_var + jloop.loop_var)
+    assert tvm.tir.ir_pass.Equal(value.b.args[0], ir.loop_var + jloop.loop_var)
     divide= rbody[2]
     assert isinstance(divide, tvm.tir.Provide)
     assert len(divide.args) == 1
@@ -249,9 +251,9 @@ def looptype(a, b, c):
             f[k] = c[k]
         return d, e, f

-    a = tvm.placeholder((16, ), name='a', dtype='int32')
-    b = tvm.placeholder((16, ), name='b', dtype='int32')
-    c = tvm.placeholder((16, ), name='c', dtype='int32')
+    a = te.placeholder((16, ), name='a', dtype='int32')
+    b = te.placeholder((16, ), name='b', dtype='int32')
+    c = te.placeholder((16, ), name='c', dtype='int32')
     try:
         d, e, f = looptype(a, b, c)
         ir = d.op.body
@@ -282,7 +284,7 @@ def if_then_else(a):
             b[i] = -1 if i % 2 == 0 else 1
         return b, c

-    a = tvm.placeholder((10, ), dtype='int32', name='a')
+    a = te.placeholder((10, ), dtype='int32', name='a')
     func, ins, outs = run_and_check(if_then_else, [a])
     run_and_check(func, ins, outs=outs)
@@ -326,8 +328,8 @@ def vec_add(a, b):
         c[tx] = a[tx] + b[tx]
         return c

-    a = tvm.placeholder((1000, ), dtype='float32', name='a')
-    b = tvm.placeholder((1000, ), dtype='float32', name='b')
+    a = te.placeholder((1000, ), dtype='float32', name='a')
+    b = te.placeholder((1000, ), dtype='float32', name='b')
     func, ins, outs = run_and_check(vec_add, [a, b], target='cuda')
     run_and_check(func, ins, outs=outs, target='cuda')
@@ -339,8 +341,8 @@ def raw(a, b):
         return c

     c = raw(a, b)
-    sch = tvm.create_schedule(c.op)
-    x = tvm.thread_axis('threadIdx.x')
+    sch = te.create_schedule(c.op)
+    x = te.thread_axis('threadIdx.x')
     sch[c].bind(c.op.axis[0], x)
     func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda')
     run_and_check(func, ins, outs=outs, target='cuda')
@@ -360,9 +362,9 @@ def foo(a):

         return c

-    a = tvm.placeholder((8, 4), 'float32')
+    a = te.placeholder((8, 4), 'float32')
     c = foo(a)
-    s = tvm.create_schedule(c.op)
+    s = te.create_schedule(c.op)
     ir = tvm.lower(s, [a, c], simple_mode=True)
     assert not isinstance(ir, tvm.tir.AttrStmt)
     func, ins, outs = run_and_check(foo, [a], target='cuda')
@@ -379,7 +381,7 @@ def max_threads(a):
            b[i * m + j] = a[i * m + j] + a[i * m + j]
         return b

-    a = tvm.placeholder((10000, ), 'float32')
+    a = te.placeholder((10000, ), 'float32')
     with tvm.target.create('cuda'):
         func, ins, outs = run_and_check(max_threads, [a], target='cuda')
         run_and_check(func, ins, outs=outs, target='cuda')
@@ -399,9 +401,9 @@ def intrin_real(a):
         b[7] = max(a[5], a[6])
         return b

-    a8 = tvm.placeholder((8, ), dtype='float32', name='a')
+    a8 = te.placeholder((8, ), dtype='float32', name='a')
     b8 = intrin_real(a8)
-    sch = tvm.create_schedule(b8.op)
+    sch = te.create_schedule(b8.op)
     func = tvm.build(sch, [a8, b8])
     assert func
     a = numpy.arange(2, 10).astype('float32')
@@ -417,9 +419,9 @@ def intrin_int(a):
         b[0] = popcount(a[0])
         return b

-    a1 = tvm.placeholder((1, ), dtype='int32')
+    a1 = te.placeholder((1, ), dtype='int32')
     b1 = intrin_int(a1)
-    sch = tvm.create_schedule(b1.op)
+    sch = te.create_schedule(b1.op)
     func = tvm.build(sch, [a1, b1])
     assert func
     a = numpy.array([114514]).astype('int32')
@@ -443,7 +445,7 @@ def blur(a):
                b[i-2, j-2] = s / 9.0
         return b

-    a = tvm.placeholder((32, 32), 'float32', 'a')
+    a = te.placeholder((32, 32), 'float32', 'a')
     func, ins, outs = run_and_check(blur, [a])
     run_and_check(func, ins, outs=outs)
@@ -455,8 +457,8 @@ def triangle(a, b):
            c[i, j] = a[i] * b[j]
         return c

-    a = tvm.placeholder((10, ), dtype='float32', name='a')
-    b = tvm.placeholder((10, ), dtype='float32', name='b')
+    a = te.placeholder((10, ), dtype='float32', name='a')
+    b = te.placeholder((10, ), dtype='float32', name='b')
     func, ins, outs = run_and_check(triangle, [a, b])
     run_and_check(func, ins, outs=outs)
@@ -474,9 +476,9 @@ def blur2d(a):
            b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
         return b

-    a = tvm.placeholder((32, 32), 'float32', 'a')
+    a = te.placeholder((32, 32), 'float32', 'a')
     b = blur2d(a)
-    sch = tvm.create_schedule(b.op)
+    sch = te.create_schedule(b.op)
     func, ins, outs = run_and_check(blur2d, [a])
     run_and_check(func, ins, outs=outs)
@@ -494,8 +496,8 @@ def share_vec_add(a, b):
            c[i] = shared[i] + local[i]
         return c

-    a = tvm.placeholder((256, ), dtype='float32', name='a')
-    b = tvm.placeholder((256, ), dtype='float32', name='b')
+    a = te.placeholder((256, ), dtype='float32', name='a')
+    b = te.placeholder((256, ), dtype='float32', name='b')
     c = share_vec_add(a, b)
     func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda')
     run_and_check(func, ins, outs=outs, target='cuda')
@@ -510,11 +512,11 @@ def upstream(a):
            b[i] = a[i] * i
         return b

-    a = tvm.placeholder((20, ), 'float32')
-    b = tvm.placeholder((20, ), 'float32')
-    c = tvm.compute((20, ), lambda x: a[x] + b[x])
+    a = te.placeholder((20, ), 'float32')
+    b = te.placeholder((20, ), 'float32')
+    c = te.compute((20, ), lambda x: a[x] + b[x])
     d = upstream(c)
-    sch = tvm.create_schedule([c.op, d.op])
+    sch = te.create_schedule([c.op, d.op])
     ir = tvm.lower(sch, [a, b, d], simple_mode=True)
     func = tvm.build(sch, [a, b, d])
     assert(func)
@@ -541,11 +543,11 @@ def downstream(a):
         return b


-    a = tvm.placeholder((20, ), 'float32')
+    a = te.placeholder((20, ), 'float32')
     b = downstream(a)
-    c = tvm.compute((20, ), lambda x: b[x] + 1.0)
+    c = te.compute((20, ), lambda x: b[x] + 1.0)

-    sch = tvm.create_schedule(c.op)
+    sch = te.create_schedule(c.op)
     module = tvm.build(sch, [a, c])
     assert module
@@ -567,10 +569,10 @@ def add_something(a, b):
            c[i] = a[i] + b
         return c

-    a = tvm.placeholder((11, ), dtype='int32', name='a')
-    b = tvm.const(11, 'int32')
+    a = te.placeholder((11, ), dtype='int32', name='a')
+    b = tvm.tir.const(11, 'int32')
     c = add_something(a, b)
-    sch = tvm.create_schedule(c.op)
+    sch = te.create_schedule(c.op)
     module = tvm.build(sch, [a, c], 'llvm')
     assert(module)
@@ -603,10 +605,10 @@ def kernel_b(b, a):
            c[i, j] = a[i * 4 + j] * b[i, j]
         return c

-    a = tvm.placeholder((16, ), 'int32')
+    a = te.placeholder((16, ), 'int32')
     b, c = kernel_a(a)
     d = kernel_b(c, b)
-    sch = tvm.create_schedule(d.op)
+    sch = te.create_schedule(d.op)
     module = tvm.build(sch, [a, d])
     assert module
@@ -632,8 +634,8 @@ def foo(a, b):
            d[i, j] = c[i, j] + i * j
         return d

-    a = tvm.placeholder((10, ), name='a')
-    b = tvm.placeholder((10, ), name='b')
+    a = te.placeholder((10, ), name='a')
+    b = te.placeholder((10, ), name='b')
     func, ins, outs = run_and_check(foo, [a, b])
     run_and_check(func, ins, outs=outs)
@@ -648,7 +650,7 @@ def foo(a):
            else:
                b[i] = 0.0
         return b
-    a = tvm.placeholder((10, ), name='a')
+    a = te.placeholder((10, ), name='a')
     func, ins, outs = run_and_check(foo, [a])
     run_and_check(func, ins, outs=outs)
@@ -668,7 +670,7 @@ def foo(a, b):

         return c, d

-    a = tvm.placeholder((2, 5), name='a', dtype='float32')
+    a = te.placeholder((2, 5), name='a', dtype='float32')
     b = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]]
     func, ins, outs = run_and_check(foo, [a, b])
     run_and_check(func, ins, outs=outs)
@@ -683,10 +685,10 @@ def goo(a, b):
            else:
                c[i - len_b] = a[i - len_b] + b[i - len_b]
         return c
-    a = tvm.placeholder((5, ), name='a', dtype='int32')
+    a = te.placeholder((5, ), name='a', dtype='int32')
     b = [1, 2, 3, 4, 5]
-    c = goo(a, tvm.convert(b))
-    sch = tvm.create_schedule(c.op)
+    c = goo(a, tvm.runtime.convert(b))
+    sch = te.create_schedule(c.op)
     func, ins, outs = run_and_check(goo, [a, b])
     run_and_check(func, ins, outs=outs)
@@ -700,7 +702,7 @@ def hoo(a, b):
                d += a[i] + b[j]
            c[i] = d
         return c
-    a = tvm.placeholder((5, ), name='a', dtype='int32')
+    a = te.placeholder((5, ), name='a', dtype='int32')
     b = [1, 2, 3, 4, 5]
     func, ins, outs = run_and_check(hoo, [a, b])
     run_and_check(func, ins, outs=outs)
@@ -713,14 +715,14 @@ def outer_product(a, b):
            for j in range(64):
                c[i, j] = a[i] * b[j]
         return c
-    a = tvm.placeholder((64,), name='a', dtype='float32')
-    b = tvm.placeholder((64,), name='b', dtype='float32')
+    a = te.placeholder((64,), name='a', dtype='float32')
+    b = te.placeholder((64,), name='b', dtype='float32')
     c = outer_product(a, b)

     # Test perfect loop split
     # Test loop reorder
     # Test loop annotation
-    sch = tvm.create_schedule(c.op)
+    sch = te.create_schedule(c.op)
     i, j = c.op.axis
     io, ii = sch[c].split(i, 4)
     sch[c].parallel(ii)
@@ -749,7 +751,7 @@ def outer_product(a, b):
     run_and_check(func, ins, outs=outs)

     # Test fuse
-    sch = tvm.create_schedule(c.op)
+    sch = te.create_schedule(c.op)
     sch[c].fuse(c.op.axis[0], c.op.axis[1])
     ir = tvm.lower(sch, [a, b, c], simple_mode=True)
     assert isinstance(ir, tvm.tir.ProducerConsumer)
@@ -762,7 +764,7 @@ def outer_product(a, b):
     run_and_check(func, ins, outs=outs)

     # Test imperfect loop split
-    sch = tvm.create_schedule(c.op)
+    sch = te.create_schedule(c.op)
     sch[c].split(c.op.axis[0], 3)
     ir = tvm.lower(sch, [a, b, c], simple_mode=True)
     func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c])
@@ -784,7 +786,7 @@ def add_something(a):
            c[i] = a[i] + constant_list[1][const_value]
         return c

-    a = tvm.placeholder((n, ), dtype='int32', name='a')
+    a = te.placeholder((n, ), dtype='int32', name='a')
     func, ins, outs = run_and_check(add_something, [a])
     run_and_check(func, ins, outs=outs)
@@ -801,12 +803,12 @@ def sum_array(inputs):
     n = 5
     inputs = []
     for i in range(n):
-        inputs.append(tvm.placeholder((10,), name='t%s' % i, dtype='float32'))
+        inputs.append(te.placeholder((10,), name='t%s' % i, dtype='float32'))

-    out = sum_array(tvm.convert(inputs))
+    out = sum_array(tvm.runtime.convert(inputs))
     assert len(out.op.inputs) == n

-    sch = tvm.create_schedule(out.op)
+    sch = te.create_schedule(out.op)
     mod = tvm.build(sch, inputs + [out], target='llvm')
     assert mod
diff --git a/tests/python/unittest/test_ir_builder.py b/tests/python/unittest/test_ir_builder.py
index 5679625e7799..9106be843b48 100644
--- a/tests/python/unittest/test_ir_builder.py
+++ b/tests/python/unittest/test_ir_builder.py
@@ -15,11 +15,12 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import numpy as np

 def test_for():
-    ib = tvm.ir_builder.create()
-    n = tvm.size_var("n")
+    ib = tvm.tir.ir_builder.create()
+    n = te.size_var("n")
     A = ib.allocate("float32", n, name="A", scope="global")
     with ib.for_range(0, n, name="i") as i:
         A[i] = A[i] + 1
@@ -38,10 +39,10 @@ def test_for():
     assert isinstance(body[1], tvm.tir.For)

 def test_if():
-    ib = tvm.ir_builder.create()
-    n = tvm.size_var("n")
+    ib = tvm.tir.ir_builder.create()
+    n = te.size_var("n")
     A = ib.pointer("float32", name="A")
-    tmod = tvm.truncmod
+    tmod = tvm.tir.truncmod
     with ib.for_range(0, n, name="i") as i:
         with ib.if_scope(tmod(i, 2) == 0):
             A[i] = A[i] + 1
@@ -58,9 +59,9 @@ def test_if():
     assert body.else_case.index.value == 0

 def test_prefetch():
-    A = tvm.placeholder((10, 20), name="A")
-    ib = tvm.ir_builder.create()
-    n = tvm.size_var("n")
+    A = te.placeholder((10, 20), name="A")
+    ib = tvm.tir.ir_builder.create()
+    n = te.size_var("n")

     with ib.for_range(0, n, name="i") as i:
         ib.emit(
@@ -74,12 +75,12 @@ def test_prefetch():
 def test_cpu():
     n = 1024
     dtype = "float32"
-    A = tvm.placeholder((n,), name='A')
-    B = tvm.placeholder((n,), name='B')
+    A = te.placeholder((n,), name='A')
+    B = te.placeholder((n,), name='B')
     def test_device_ir(A, B, C):
         n = A.shape[0]
         max_threads = 8
-        ib = tvm.ir_builder.create()
+        ib = tvm.tir.ir_builder.create()
         Aptr = ib.buffer_ptr(A)
         Bptr = ib.buffer_ptr(B)
         Cptr = ib.buffer_ptr(C)
@@ -87,9 +88,9 @@ def test_device_ir(A, B, C):
             Cptr[i] = Aptr[i] + Bptr[i]
         body = ib.get()
         return body
-    C = tvm.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
+    C = te.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                    name="vector_add", dtype=dtype)
-    s = tvm.create_schedule(C.op)
+    s = te.create_schedule(C.op)
     def check_target(target):
         if not tvm.runtime.enabled(target):
             return
@@ -105,18 +106,18 @@ def check_target(target):
     check_target("llvm")

 def test_gpu():
-    n = tvm.size_var('n')
+    n = te.size_var('n')
     dtype = "float32"
-    A = tvm.placeholder((n,), name='A')
-    B = tvm.placeholder((n,), name='B')
-    idxd = tvm.indexdiv
+    A = te.placeholder((n,), name='A')
+    B = te.placeholder((n,), name='B')
+    idxd = tvm.tir.indexdiv

     def test_device_ir(A, B, C):
         n = A.shape[0]
         max_threads = 32
-        ib = tvm.ir_builder.create()
-        bx = tvm.thread_axis("blockIdx.x")
-        tx = tvm.thread_axis("threadIdx.x")
+        ib = tvm.tir.ir_builder.create()
+        bx = te.thread_axis("blockIdx.x")
+        tx = te.thread_axis("threadIdx.x")
         ib.scope_attr(bx, "thread_extent", idxd(n+max_threads-1, max_threads))
         ib.scope_attr(tx, "thread_extent", max_threads)
         idx = bx.var * max_threads + tx.var
@@ -127,11 +128,11 @@ def test_device_ir(A, B, C):
             Cptr[idx] = Aptr[idx] + Bptr[idx]
         body = ib.get()
         return body
-    C = tvm.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
+    C = te.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                    name="vector_add", dtype=dtype)
-    s = tvm.create_schedule(C.op)
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    s = te.create_schedule(C.op)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
     def check_target(target):
         n = 1024
         if not tvm.runtime.enabled(target):
diff --git a/tests/python/unittest/test_lang_basic.py b/tests/python/unittest/test_lang_basic.py
index 3b1431a54d36..cd532a0db77f 100644
--- a/tests/python/unittest/test_lang_basic.py
+++ b/tests/python/unittest/test_lang_basic.py
@@ -15,13 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import numpy as np


 def test_const():
-    x = tvm.const(1, "int32")
+    x = tvm.tir.const(1, "int32")
     print(x.dtype)
-    assert x.dtype == tvm.int32
+    assert x.dtype == "int32"
     assert isinstance(x, tvm.tir.IntImm)


@@ -29,28 +30,28 @@ def test_scalar_dtype_inference():
     for data in [True, np.bool(1), np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1),
                  np.int8(1), np.int16(1), np.int32(1), np.int64(1),
                  np.float16(1), np.float32(1), np.float64(1)]:
-        assert tvm.const(data).dtype == str(np.array(data).dtype)
-    assert tvm.const(1).dtype == 'int32'
-    assert tvm.const(1.0).dtype == 'float32'
+        assert tvm.tir.const(data).dtype == str(np.array(data).dtype)
+    assert tvm.tir.const(1).dtype == 'int32'
+    assert tvm.tir.const(1.0).dtype == 'float32'

     for data in [True, np.bool(1), np.uint8(1), np.uint16(1), np.uint32(1), np.uint64(1),
                  np.int8(1), np.int16(1), np.int32(1), np.int64(1),
                  np.float16(1), np.float32(1), np.float64(1)]:
-        assert tvm.convert(data).dtype == str(np.array(data).dtype)
-    assert tvm.convert(1).dtype == 'int32'
-    assert tvm.convert(1.0).dtype == 'float32'
+        assert tvm.runtime.convert(data).dtype == str(np.array(data).dtype)
+    assert tvm.runtime.convert(1).dtype == 'int32'
+    assert tvm.runtime.convert(1.0).dtype == 'float32'


 def test_make():
-    x = tvm.const(1, "int32")
-    y = tvm.var("x")
+    x = tvm.tir.const(1, "int32")
+    y = te.var("x")
     z = x + y
-    assert isinstance(tvm.max(x, y), tvm.tir.Max)
-    assert isinstance(tvm.min(x, y), tvm.tir.Min)
+    assert isinstance(tvm.te.max(x, y), tvm.tir.Max)
+    assert isinstance(tvm.te.min(x, y), tvm.tir.Min)


 def test_ir():
-    x = tvm.const(1, "int32")
+    x = tvm.tir.const(1, "int32")
     y = tvm.tir.IntImm('int32', 1)
     z = x + y
     stmt = tvm.tir.Evaluate(z)
@@ -58,22 +59,22 @@ def test_ir():


 def test_ir2():
-    x = tvm.var("n")
-    a = tvm.var("array", tvm.handle)
+    x = te.var("n")
+    a = te.var("array", "handle")
     st = tvm.tir.Store(a, x + 1, 1)
     assert isinstance(st, tvm.tir.Store)
     assert(st.buffer_var == a)


 def test_let():
-    x = tvm.var('x')
-    y = tvm.var('y')
+    x = te.var('x')
+    y = te.var('y')
     stmt = tvm.tir.LetStmt(
         x, 10, tvm.tir.Evaluate(x + 1));


 def test_cast():
-    x = tvm.var('x', dtype="float32")
+    x = te.var('x', dtype="float32")
     y = x.astype("int32")
     z = x.astype("float32x4")
     assert isinstance(y, tvm.tir.Cast)
@@ -82,13 +83,13 @@ def test_cast():


 def test_attr():
-    x = tvm.var('x')
-    y = tvm.var('y')
+    x = te.var('x')
+    y = te.var('y')
     stmt = tvm.tir.AttrStmt(
         y, "stride", 10, tvm.tir.Evaluate(x + 1));
     assert stmt.node == y

-    a = tvm.convert(1)
+    a = tvm.runtime.convert(1)
     assert a.value == 1
     try:
         a.no_field
@@ -98,78 +99,78 @@ def test_attr():


 def test_basic():
-    a = tvm.var('a')
-    b = tvm.var('b')
+    a = te.var('a')
+    b = te.var('b')
     c = a + b
     assert str(c) == '(%s + %s)' % (a.name, b.name)


 def test_stmt():
     x = tvm.tir.Evaluate(0)
-    tvm.tir.For(tvm.var('i'), 0, 1,
+    tvm.tir.For(te.var('i'), 0, 1,
                 tvm.tir.For.Serial, 0,
                 x)


 def test_dir():
-    x = tvm.var('x')
+    x = te.var('x')
     dir(x)


 def test_dtype():
-    x = tvm.var('x')
+    x = te.var('x')
     assert x.dtype == 'int32'
-    y = tvm.var('y')
+    y = te.var('y')
     assert (x > y).dtype == 'bool'


 def test_any():
-    x = tvm.var('x')
-    y = tvm.var('y')
-    z = tvm.var('z')
+    x = te.var('x')
+    y = te.var('y')
+    z = te.var('z')
     try:
         t = x or x
         assert False
     except ValueError:
         pass
     try:
-        tvm.any()
+        tvm.tir.any()
         assert False
     except ValueError:
         pass
-    assert str(tvm.any(x < y)) == '(%s < %s)' % (x.name, y.name)
-    assert str(tvm.any(x < y, x > z)) == '((%s < %s) || (%s > %s))' % (
+    assert str(tvm.tir.any(x < y)) == '(%s < %s)' % (x.name, y.name)
+    assert str(tvm.tir.any(x < y, x > z)) == '((%s < %s) || (%s > %s))' % (
         x.name, y.name, x.name, z.name)
-    assert str(tvm.any(x < y, y > z + 1, x < z * 2)) == \
+    assert str(tvm.tir.any(x < y, y > z + 1, x < z * 2)) == \
         '(((%s < %s) || (%s > (%s + 1))) || (%s < (%s*2)))' % (
             x.name, y.name, y.name, z.name, x.name, z.name)


 def test_all():
-    x = tvm.var('x')
-    y = tvm.var('y')
-    z = tvm.var('z')
+    x = te.var('x')
+    y = te.var('y')
+    z = te.var('z')
     try:
         t = x and x
         assert False
     except ValueError:
         pass
     try:
-        tvm.all()
+        tvm.tir.all()
         assert False
     except ValueError:
         pass
-    assert str(tvm.all(x < y)) == '(%s < %s)' % (x.name, y.name)
-    assert str(tvm.all(x < y, x > z)) == '((%s < %s) && (%s > %s))' % (
+    assert str(tvm.tir.all(x < y)) == '(%s < %s)' % (x.name, y.name)
+    assert str(tvm.tir.all(x < y, x > z)) == '((%s < %s) && (%s > %s))' % (
         x.name, y.name, x.name, z.name)
-    assert str(tvm.all(x < y, y > z + 1, x < z * 2)) == \
+    assert str(tvm.tir.all(x < y, y > z + 1, x < z * 2)) == \
         '(((%s < %s) && (%s > (%s + 1))) && (%s < (%s*2)))' % (
             x.name, y.name, y.name, z.name, x.name, z.name)


 def test_bitwise():
-    x = tvm.var('x')
-    y = tvm.var('y')
+    x = te.var('x')
+    y = te.var('y')
     assert str(x << y) == 'shift_left(x, y)'
     assert str(x >> y) == 'shift_right(x, y)'
     assert str(x & y) == 'bitwise_and(x, y)'
@@ -182,12 +183,12 @@ def test_bitwise():
     assert str(10 << x) == 'shift_left(10, x)'
     assert str(10 % x) == 'floormod(10, x)'
     assert str(~x) == 'bitwise_not(x)'
-    assert(tvm.const(1, "int8x2") >> 1).dtype == "int8x2"
-    assert(x >> tvm.const(1, "int32x2")).dtype == "int32x2"
-    assert(tvm.var("z", "int8x2") << tvm.const(1, "int8x2")).dtype == "int8x2"
+    assert(tvm.tir.const(1, "int8x2") >> 1).dtype == "int8x2"
+    assert(x >> tvm.tir.const(1, "int32x2")).dtype == "int32x2"
+    assert(te.var("z", "int8x2") << tvm.tir.const(1, "int8x2")).dtype == "int8x2"


 def test_float_bitwise():
-    t = tvm.const(1.5,dtype='float32')
+    t = tvm.tir.const(1.5,dtype='float32')
     for test in [lambda lhs, rhs : lhs << rhs,
                  lambda lhs, rhs : lhs >> rhs,
                  lambda lhs, rhs : lhs | rhs,
@@ -206,20 +207,20 @@ def test_float_bitwise():
         pass


 def test_isnan():
-    x = tvm.var('x', 'float32')
-    assert str(tvm.isnan(x)) == 'isnan(x)'
-    assert str(tvm.isnan(x).dtype) == 'bool'
-    y = tvm.var('y', 'float16')
-    assert str(tvm.isnan(y)) == 'isnan(float32(y))'
-    z = tvm.var('z', 'int32')
-    assert str(tvm.isnan(z)) == '(bool)0'
-    k = tvm.var('k', 'int8x2')
-    assert str(tvm.isnan(k).dtype) == 'uint1x2'
+    x = te.var('x', 'float32')
+    assert str(tvm.tir.isnan(x)) == 'isnan(x)'
+    assert str(tvm.tir.isnan(x).dtype) == 'bool'
+    y = te.var('y', 'float16')
+    assert str(tvm.tir.isnan(y)) == 'isnan(float32(y))'
+    z = te.var('z', 'int32')
+    assert str(tvm.tir.isnan(z)) == '(bool)0'
+    k = te.var('k', 'int8x2')
+    assert str(tvm.tir.isnan(k).dtype) == 'uint1x2'


 def test_equality():
-    a = tvm.var('a')
-    b = tvm.var('b')
+    a = te.var('a')
+    b = te.var('b')
     c = (a == b)
     assert not c
     d = (c != c)
diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py
index 7568814fbfe6..9203fb1c7b34 100644
--- a/tests/python/unittest/test_lang_buffer.py
+++ b/tests/python/unittest/test_lang_buffer.py
@@ -15,27 +15,28 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 from tvm.tir import Buffer
 import numpy as np

 def test_buffer():
-    m = tvm.size_var('m')
-    n = tvm.size_var('n')
-    l = tvm.size_var('l')
-    Ab = tvm.decl_buffer((m, n), tvm.float32)
-    Bb = tvm.decl_buffer((n, l), tvm.float32)
+    m = te.size_var('m')
+    n = te.size_var('n')
+    l = te.size_var('l')
+    Ab = tvm.tir.decl_buffer((m, n), "float32")
+    Bb = tvm.tir.decl_buffer((n, l), "float32")

     assert isinstance(Ab, tvm.tir.Buffer)
-    assert Ab.dtype == tvm.float32
+    assert Ab.dtype == "float32"
     assert tuple(Ab.shape) == (m, n)


 def test_buffer_access_ptr():
-    m = tvm.size_var('m')
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((m, n), tvm.float32, strides=[n + 1 , 1])
+    m = te.size_var('m')
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((m, n), "float32", strides=[n + 1 , 1])
     aptr = Ab.access_ptr("rw")
-    assert tvm.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m)
+    assert tvm.tir.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m)
     assert aptr.args[0].dtype == Ab.dtype
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
     aptr = Ab.access_ptr("w")
@@ -43,59 +44,59 @@ def test_buffer_access_ptr():


 def test_buffer_access_ptr_offset():
-    m = tvm.size_var('m')
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((m, n), tvm.float32)
+    m = te.size_var('m')
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((m, n), "float32")
     aptr = Ab.access_ptr("rw", offset=100)
-    offset = tvm.ir_pass.Simplify(aptr.args[2])
-    assert tvm.ir_pass.Equal(offset, 100)
+    offset = tvm.tir.ir_pass.Simplify(aptr.args[2])
+    assert tvm.tir.ir_pass.Equal(offset, 100)
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
-    v = tvm.size_var('int32')
+    v = te.size_var('int32')
     aptr = Ab.access_ptr("rw", offset=100 + 100 + v)
-    offset = tvm.ir_pass.Simplify(aptr.args[2])
-    assert tvm.ir_pass.Equal(offset, 200 + v)
+    offset = tvm.tir.ir_pass.Simplify(aptr.args[2])
+    assert tvm.tir.ir_pass.Equal(offset, 200 + v)
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
-    aptr = Ab.access_ptr("rw", offset=tvm.call_extern('int32', "test_call", 100 + 100 + v))
-    offset = tvm.ir_pass.Simplify(aptr.args[2])
-    assert tvm.ir_pass.Equal(offset, tvm.call_extern('int32', "test_call", 200 + v))
+    aptr = Ab.access_ptr("rw", offset=tvm.tir.call_extern('int32', "test_call", 100 + 100 + v))
+    offset = tvm.tir.ir_pass.Simplify(aptr.args[2])
+    assert tvm.tir.ir_pass.Equal(offset, tvm.tir.call_extern('int32', "test_call", 200 + v))
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE


 def test_buffer_access_ptr_extent():
-    m = tvm.size_var('m')
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((m, n), tvm.float32)
+    m = te.size_var('m')
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((m, n), "float32")
     aptr = Ab.access_ptr("rw")
-    assert tvm.ir_pass.Equal(aptr.args[3], m * n)
+    assert tvm.tir.ir_pass.Equal(aptr.args[3], m * n)
     aptr = Ab.access_ptr("rw", offset=100)
-    assert tvm.ir_pass.Equal(aptr.args[3], m * n - 100)
-    Ab = tvm.decl_buffer((m, n), tvm.float32, strides=[n + 1 , 1])
+    assert tvm.tir.ir_pass.Equal(aptr.args[3], m * n - 100)
+    Ab = tvm.tir.decl_buffer((m, n), "float32", strides=[n + 1 , 1])
     aptr = Ab.access_ptr("rw", offset=100)
-    assert tvm.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m - 100)
+    assert tvm.tir.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m - 100)


 def test_buffer_vload():
-    m = tvm.size_var('m')
-    n = tvm.size_var('n')
-    Ab = tvm.decl_buffer((m, n), tvm.float32, elem_offset=100)
+    m = te.size_var('m')
+    n = te.size_var('n')
+    Ab = tvm.tir.decl_buffer((m, n), "float32", elem_offset=100)
     load = Ab.vload([2, 3])
-    offset = tvm.ir_pass.Simplify(load.index)
-    assert tvm.ir_pass.Equal(offset, n * 2 + 103)
+    offset = tvm.tir.ir_pass.Simplify(load.index)
+    assert tvm.tir.ir_pass.Equal(offset, n * 2 + 103)


 def test_buffer_index_merge_mult_mod():
-    m = tvm.size_var('m')
-    n = tvm.size_var('n')
-    s = tvm.size_var('s')
-    k0 = tvm.size_var('k0')
-    k1 = tvm.size_var('k1')
-    A = tvm.decl_buffer((m, n), tvm.float32)
-    A_stride = tvm.decl_buffer((m, n), tvm.float32, strides=(s, 1))
+    m = te.size_var('m')
+    n = te.size_var('n')
+    s = te.size_var('s')
+    k0 = te.size_var('k0')
+    k1 = te.size_var('k1')
+    A = tvm.tir.decl_buffer((m, n), "float32")
+    A_stride = tvm.tir.decl_buffer((m, n), "float32", strides=(s, 1))
     def assert_simplified_equal(index_simplified, index_direct):
-        assert tvm.ir_pass.Equal(index_simplified, index_direct),\
+        assert tvm.tir.ir_pass.Equal(index_simplified, index_direct),\
         "index_simplified=%s, index_direct=%s" %(index_simplified, index_direct)
-    idxd = tvm.indexdiv
-    idxm = tvm.indexmod
+    idxd = tvm.tir.indexdiv
+    idxm = tvm.tir.indexmod
     # Test Case1
     index_simplified = A_stride.vload(
         (idxd(idxm(k0, k1), s), idxm(idxm(k0, k1), s) + idxd(k0, k1) * k1))
@@ -123,18 +124,18 @@ def assert_simplified_equal(index_simplified, index_direct):


 def test_buffer_broadcast():
-    m0, m1, m2 = tvm.size_var("m0"), tvm.size_var("m1"), tvm.size_var("m2")
-    n0, n1, n2 = tvm.size_var("n0"), tvm.size_var("n1"), tvm.size_var("n2")
-    o0, o1, o2 = tvm.size_var("o0"), tvm.size_var("o1"), tvm.size_var("o2")
+    m0, m1, m2 = te.size_var("m0"), te.size_var("m1"), te.size_var("m2")
+    n0, n1, n2 = te.size_var("n0"), te.size_var("n1"), te.size_var("n2")
+    o0, o1, o2 = te.size_var("o0"), te.size_var("o1"), te.size_var("o2")

-    A = tvm.placeholder((m0, m1, m2), name='A')
-    B = tvm.placeholder((n0, n1, n2), name='B')
+    A = te.placeholder((m0, m1, m2), name='A')
+    B = te.placeholder((n0, n1, n2), name='B')

-    C = tvm.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C')
+    C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C')

-    Ab = tvm.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-    Bb = tvm.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-    s = tvm.create_schedule(C.op)
+    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
+    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
+    s = te.create_schedule(C.op)

     def check():
         if not tvm.runtime.enabled("llvm"):
@@ -151,18 +152,18 @@ def check():


 def test_buffer_broadcast_expr():
-    n0, m0, x = tvm.size_var('n0'), tvm.size_var('m0'), tvm.size_var('x')
-    n1, m1 = tvm.size_var('n1'), tvm.size_var('m1')
-    o0, o1 = tvm.size_var('o0'), tvm.size_var('o1')
-
-    A = tvm.placeholder((m0, n0), name='A')
-    B = tvm.placeholder((m1, n1), name='B')
-    C = tvm.compute((o0, o1//x), lambda i, j: A[i, j] + B[i, j], name='C')
-
-    Ab = tvm.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
-    Bb = tvm.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
-    Cc = tvm.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
-    s = tvm.create_schedule(C.op)
+    n0, m0, x = te.size_var('n0'), te.size_var('m0'), te.size_var('x')
+    n1, m1 = te.size_var('n1'), te.size_var('m1')
+    o0, o1 = te.size_var('o0'), te.size_var('o1')
+
+    A = te.placeholder((m0, n0), name='A')
+    B = te.placeholder((m1, n1), name='B')
+    C = te.compute((o0, o1//x), lambda i, j: A[i, j] + B[i, j], name='C')
+
+    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast")
+    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast")
+    Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast")
+    s = te.create_schedule(C.op)

     def check_stride():
         if not tvm.runtime.enabled("llvm"):
diff --git a/tests/python/unittest/test_lang_constructor.py b/tests/python/unittest/test_lang_constructor.py
index 797a04fa4574..9edaf92d0db7 100644
--- a/tests/python/unittest/test_lang_constructor.py
+++ b/tests/python/unittest/test_lang_constructor.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te

 def test_expr_constructor():
     x = tvm.tir.Var("xx", "float32")
@@ -47,8 +48,8 @@ def test_expr_constructor():
     assert x.dtype == "float32"
     assert x.value.value == 1

-    a = tvm.const(1.0, dtype="float32")
-    b = tvm.var("x", dtype="float32")
+    a = tvm.tir.const(1.0, dtype="float32")
+    b = te.var("x", dtype="float32")
     for cls in [tvm.tir.Add,
                 tvm.tir.Sub,
@@ -67,8 +68,8 @@ def test_expr_constructor():
         assert x.b.same_as(b)

-    a = tvm.convert(tvm.var("x") > 1)
-    b = tvm.convert(tvm.var("x") == 1)
+    a = tvm.runtime.convert(te.var("x") > 1)
+    b = tvm.runtime.convert(te.var("x") == 1)

     for cls in [tvm.tir.And,
                 tvm.tir.Or]:
@@ -87,7 +88,7 @@ def test_expr_constructor():
     assert x.false_value == b
     assert x.condition == a

-    buffer_var = tvm.var("x", dtype="handle")
+    buffer_var = te.var("x", dtype="handle")
     x = tvm.tir.Load("float32", buffer_var, 1, a)
     assert isinstance(x, tvm.tir.Load)
     assert x.dtype == "float32"
@@ -120,7 +121,7 @@ def test_expr_constructor():
     assert x.func == None
     assert x.value_index == 0

-    v = tvm.var("aa")
+    v = te.var("aa")
     x = tvm.tir.Let(v, 1, v)
     assert x.var == v
     assert x.value.value == 1
@@ -128,8 +129,8 @@ def test_expr_constructor():

 def test_stmt_constructor():
-    v = tvm.var("aa")
-    buffer_var = tvm.var("buf", dtype="handle")
+    v = te.var("aa")
+    buffer_var = te.var("buf", dtype="handle")
     nop = tvm.tir.Evaluate(1)
     x = tvm.tir.LetStmt(v, 1, tvm.tir.Evaluate(1))
     assert isinstance(x, tvm.tir.LetStmt)
@@ -141,8 +142,8 @@ def test_stmt_constructor():
     assert isinstance(x, tvm.tir.AttrStmt)
     assert x.value.value == 1

-    x = tvm.tir.AssertStmt(tvm.const(1, "uint1"),
-                           tvm.convert("hellow"),
+    x = tvm.tir.AssertStmt(tvm.tir.const(1, "uint1"),
+                           tvm.runtime.convert("hellow"),
                            nop)
     assert isinstance(x, tvm.tir.AssertStmt)
     assert x.body == nop
@@ -151,26 +152,26 @@ def test_stmt_constructor():
     assert isinstance(x, tvm.tir.ProducerConsumer)
     assert x.body == nop

-    x = tvm.tir.For(tvm.var("x"), 0, 10, 0, 0, nop)
+    x = tvm.tir.For(te.var("x"), 0, 10, 0, 0, nop)
     assert isinstance(x, tvm.tir.For)
     assert x.min.value == 0
     assert x.extent.value == 10
     assert x.body == nop

-    x = tvm.tir.Store(buffer_var, 1, 10, tvm.const(1, "uint1"))
+    x = tvm.tir.Store(buffer_var, 1, 10, tvm.tir.const(1, "uint1"))
     assert isinstance(x, tvm.tir.Store)
     assert x.buffer_var == buffer_var
     assert x.index.value == 10
     assert x.value.value == 1

-    tensor = tvm.placeholder((), dtype="float32")
+    tensor = te.placeholder((), dtype="float32")
     x = tvm.tir.Provide(tensor.op, 0, 10, [])
     assert isinstance(x, tvm.tir.Provide)
     assert x.value_index == 0
     assert x.value.value == 10

     x = tvm.tir.Allocate(buffer_var, "float32", [10],
-                         tvm.const(1, "uint1"), nop)
+                         tvm.tir.const(1, "uint1"), nop)
     assert isinstance(x, tvm.tir.Allocate)
     assert x.dtype == "float32"
     assert x.buffer_var == buffer_var
@@ -186,11 +187,11 @@ def test_stmt_constructor():
     assert isinstance(x, tvm.tir.Free)
     assert x.buffer_var == buffer_var

-    x = tvm.tir.Realize(None, 0, "float", [], tvm.const(1, "uint1"), nop)
+    x = tvm.tir.Realize(None, 0, "float", [], tvm.tir.const(1, "uint1"), nop)
     assert isinstance(x, tvm.tir.Realize)
     assert x.body == nop

-    x = tvm.tir.IfThenElse(tvm.const(1, "uint1"),
+    x = tvm.tir.IfThenElse(tvm.tir.const(1, "uint1"),
                            tvm.tir.Evaluate(11),
                            nop)
     assert isinstance(x, tvm.tir.IfThenElse)
diff --git a/tests/python/unittest/test_lang_container.py b/tests/python/unittest/test_lang_container.py
index 0b9fad9a2d20..c2d3aba01ec8 100644
--- a/tests/python/unittest/test_lang_container.py
+++ b/tests/python/unittest/test_lang_container.py
@@ -15,26 +15,27 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import numpy as np

 def test_array():
-    a = tvm.convert([1,2,3])
+    a = tvm.runtime.convert([1,2,3])
     assert len(a) == 3
     assert a[-1].value == 3
     a_slice = a[-3:-1]
     assert (a_slice[0].value, a_slice[1].value) == (1, 2)

 def test_array_save_load_json():
-    a = tvm.convert([1,2,3])
+    a = tvm.runtime.convert([1,2,3])
     json_str = tvm.ir.save_json(a)
     a_loaded = tvm.ir.load_json(json_str)
     assert(a_loaded[1].value == 2)

 def test_map():
-    a = tvm.var('a')
-    b = tvm.var('b')
-    amap = tvm.convert({a: 2,
+    a = te.var('a')
+    b = te.var('b')
+    amap = tvm.runtime.convert({a: 2,
                         b: 3})
     assert a in amap
     assert len(amap) == 2
@@ -45,7 +46,7 @@ def test_map():

 def test_str_map():
-    amap = tvm.convert({'a': 2, 'b': 3})
+    amap = tvm.runtime.convert({'a': 2, 'b': 3})
     assert 'a' in amap
     assert len(amap) == 2
     dd = dict(amap.items())
@@ -55,9 +56,9 @@ def test_str_map():

 def test_map_save_load_json():
-    a = tvm.var('a')
-    b = tvm.var('b')
-    amap = tvm.convert({a: 2,
+    a = te.var('a')
+    b = te.var('b')
+    amap = tvm.runtime.convert({a: 2,
                         b: 3})
     json_str = tvm.ir.save_json(amap)
     amap = tvm.ir.load_json(json_str)
@@ -67,14 +68,14 @@ def test_map_save_load_json():

 def test_in_container():
-    arr = tvm.convert(['a', 'b', 'c'])
+    arr = tvm.runtime.convert(['a', 'b', 'c'])
     assert 'a' in arr
     assert tvm.tir.StringImm('a') in arr
     assert 'd' not in arr

 def test_ndarray_container():
     x = tvm.nd.array([1,2,3])
-    arr = tvm.convert([x, x])
+    arr = tvm.runtime.convert([x, x])
     assert arr[0].same_as(x)
     assert arr[1].same_as(x)
     assert isinstance(arr[0], tvm.nd.NDArray)
diff --git a/tests/python/unittest/test_lang_data_layout.py b/tests/python/unittest/test_lang_data_layout.py
index 4c1cafcf3d67..86a71da6dbeb 100644
--- a/tests/python/unittest/test_lang_data_layout.py
+++ b/tests/python/unittest/test_lang_data_layout.py
@@ -17,6 +17,7 @@
 """Test layout and bijective-layout node"""

 import tvm
+from tvm import te
 from topi.util import get_const_tuple

 def test_layout():
@@ -51,18 +52,18 @@ def test_layout():

 def test_bilayout_convertible():
     # not convertible
-    assert tvm.bijective_layout("NCHW", "ABCD") is None
-    assert tvm.bijective_layout("__undef__", "NCHW") is None
-    assert tvm.bijective_layout("NCHW", "__undef__") is None
-    assert tvm.bijective_layout("__undef__", "__undef__") is None
-    assert tvm.bijective_layout("", "NCHW") is None
-    assert tvm.bijective_layout("NCHW", "") is None
-    assert tvm.bijective_layout("", "") is None
+    assert tvm.tir.bijective_layout("NCHW", "ABCD") is None
+    assert tvm.tir.bijective_layout("__undef__", "NCHW") is None
+    assert tvm.tir.bijective_layout("NCHW", "__undef__") is None
+    assert tvm.tir.bijective_layout("__undef__", "__undef__") is None
+    assert tvm.tir.bijective_layout("", "NCHW") is None
+    assert tvm.tir.bijective_layout("NCHW", "") is None
+    assert tvm.tir.bijective_layout("", "") is None
     # convertible
-    assert tvm.bijective_layout("NCHW", "NCHW16c") is not None
+    assert tvm.tir.bijective_layout("NCHW", "NCHW16c") is not None

 def test_bilayout_shape():
-    bilayout = tvm.bijective_layout("NCHW", "NCHW16c")
+    bilayout = tvm.tir.bijective_layout("NCHW", "NCHW16c")
     assert isinstance(bilayout, tvm.tir.BijectiveLayout)

     dst_shape = bilayout.forward_shape((1, 32, 7, 7))
@@ -72,7 +73,7 @@ def test_bilayout_shape():
     assert get_const_tuple(src_shape) == (1, 32, 7, 7)

 def test_bilayout_index():
-    bilayout = tvm.bijective_layout("NCHW", "NCHW16c")
+    bilayout = tvm.tir.bijective_layout("NCHW", "NCHW16c")

     dst_index = bilayout.forward_index([0, 18, 6, 6])
     assert get_const_tuple(dst_index) == (0, 1, 6, 6, 2)
diff --git a/tests/python/unittest/test_lang_group.py b/tests/python/unittest/test_lang_group.py
index e78ffb3541d3..0f1118d4890b 100644
--- a/tests/python/unittest/test_lang_group.py
+++ b/tests/python/unittest/test_lang_group.py
@@ -16,20 +16,21 @@
 # under the License.
 """Test group effect"""
 import tvm
+from tvm import te

 def test_scan_group():
-    m = tvm.size_var("m")
-    n = tvm.size_var("n")
-    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
-    s_state = tvm.placeholder((m, n))
-    s_init = tvm.compute((1, n), lambda _, i: x[0, i])
+    m = te.size_var("m")
+    n = te.size_var("n")
+    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
+    s_state = te.placeholder((m, n))
+    s_init = te.compute((1, n), lambda _, i: x[0, i])

-    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
-    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
-    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
-    res = tvm.scan(s_init, s_update3, s_state, inputs=x)
+    s_update1 = te.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
+    s_update2 = te.compute((m, n), lambda t, i: s_update1[t, i] + 1)
+    s_update3 = te.compute((m, n), lambda t, i: s_update2[t, i] + 1)
+    res = tvm.te.scan(s_init, s_update3, s_state, inputs=x)

-    s = tvm.create_schedule(res.op)
+    s = te.create_schedule(res.op)
     assert s[s_update1].group is not None
     assert s[s_update2].group == s[s_update1].group
     # Assign within group, is valid
@@ -50,12 +51,12 @@ def test_scan_group():
         pass

 def test_compute_group():
-    m = tvm.size_var("m")
-    n = tvm.size_var("n")
-    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
-    x1 = tvm.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = tvm.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = tvm.create_schedule(x2.op)
+    m = te.size_var("m")
+    n = te.size_var("n")
+    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
+    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
+    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
+    s = te.create_schedule(x2.op)
     g = s.create_group(outputs=x1, inputs=x, include_inputs=True)
     assert s[x1].group == g
     assert s[x].group == g
@@ -64,12 +65,12 @@ def test_compute_group():
     assert g.num_child_stages == 2

 def test_nest_group():
-    m = tvm.size_var("m")
-    n = tvm.size_var("n")
-    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
-    x1 = tvm.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
-    x2 = tvm.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
-    s = tvm.create_schedule(x2.op)
+    m = te.size_var("m")
+    n = te.size_var("n")
+    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
+    x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1")
+    x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2")
+    s = te.create_schedule(x2.op)
     g1 = s.create_group(outputs=x1, inputs=x)
     g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True)
     assert set(s.groups) == set([g1, g2])
diff --git a/tests/python/unittest/test_lang_operator.py b/tests/python/unittest/test_lang_operator.py
index d32b4c51ef69..23c594022faf 100644
--- a/tests/python/unittest/test_lang_operator.py
+++ b/tests/python/unittest/test_lang_operator.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te

 def check_throws(f):
     try:
@@ -27,12 +28,12 @@ def check_throws(f):

 def test_const_fold():
     def check(f, *args):
-        x = f(*[tvm.const(x, "int32") for x in args])
+        x = f(*[tvm.tir.const(x, "int32") for x in args])
         y = f(*args)
         if not isinstance(x, (tvm.tir.IntImm,)) or x.value != int(y):
             raise ValueError("check error: %s vs %s " % (x, y))

-    tmod = tvm.truncmod
+    tmod = tvm.tir.truncmod
     check(lambda x, y: x + y, 3, 4)
     check(lambda x, y: x * y, 3, 12)
     check(lambda x, y: x * y - 10, 3, 12)
@@ -47,9 +48,9 @@ def check(f, *args):


 def test_const_fold2():
-    x = tvm.var("x")
-    tmod = tvm.truncmod
-    tdiv = tvm.truncdiv
+    x = te.var("x")
+    tmod = tvm.tir.truncmod
+    tdiv = tvm.tir.truncdiv
     assert (x + 0).same_as(x)
     assert (0 + x).same_as(x)
     assert (x - 0).same_as(x)
@@ -60,48 +61,48 @@ def test_const_fold2():

 def test_const_fold3():
     # Test that using ints with logic operations is forbidden
-    x = tvm.var("x")
+    x = te.var("x")
     for val in [0, 1]:
-        for func in [tvm.all, tvm.any]:
-            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
-            check_throws(lambda: func(x, tvm.const(val, 'uint1')))
+        for func in [tvm.tir.all, tvm.tir.any]:
+            check_throws(lambda: func(tvm.tir.const(val, 'uint1'), x))
+            check_throws(lambda: func(x, tvm.tir.const(val, 'uint1')))

     # Test const folding when both arguments are const
-    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b), (tvm.any, lambda a, b: a or b)]:
+    for tvm_func, py_func in [(tvm.tir.all, lambda a, b: a and b), (tvm.tir.any, lambda a, b: a or b)]:
         for v1 in [0, 1]:
             for v2 in [0, 1]:
-                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
-                                         tvm.const(py_func(v1, v2), 'uint1'))
+                assert tvm.tir.ir_pass.Equal(tvm_func(tvm.tir.const(v1, 'uint1'), tvm.tir.const(v2, 'uint1')),
+                                             tvm.tir.const(py_func(v1, v2), 'uint1'))

-    x = tvm.var("x", 'uint1')
-    true = tvm.const(1, 'uint1')
-    false = tvm.const(0, 'uint1')
+    x = te.var("x", 'uint1')
+    true = tvm.tir.const(1, 'uint1')
+    false = tvm.tir.const(0, 'uint1')

-    assert tvm.all(x, true).same_as(x)
-    assert tvm.all(true, x).same_as(x)
-    assert tvm.any(x, false).same_as(x)
-    assert tvm.any(false, x).same_as(x)
+    assert tvm.tir.all(x, true).same_as(x)
+    assert tvm.tir.all(true, x).same_as(x)
+    assert tvm.tir.any(x, false).same_as(x)
+    assert tvm.tir.any(false, x).same_as(x)

-    assert tvm.all(x, false).same_as(false)
-    assert tvm.all(false, x).same_as(false)
-    assert tvm.any(x, true).same_as(true)
-    assert tvm.any(true, x).same_as(true)
+    assert tvm.tir.all(x, false).same_as(false)
+    assert tvm.tir.all(false, x).same_as(false)
+    assert tvm.tir.any(x, true).same_as(true)
+    assert tvm.tir.any(true, x).same_as(true)


 def test_const_fold4():
-    x1 = tvm.const(4, "int32")
+    x1 = tvm.tir.const(4, "int32")
     x2 = x1 + 5
-    tdiv = tvm.truncdiv
+    tdiv = tvm.tir.truncdiv
     assert isinstance(x2, tvm.tir.IntImm) and x2.value == 9
     x3 = tdiv(x2, 3)
     assert isinstance(x3,
tvm.tir.IntImm) and x3.value == 3 x4 = x3 + 0.55 assert isinstance(x4, tvm.tir.FloatImm) and abs(x4.value - 3.55) < 1e-6 - x5 = tvm.ceil(x4) + x5 = te.ceil(x4) assert isinstance(x5, tvm.tir.FloatImm) and x5.value == 4 x6 = x5.astype('int') assert isinstance(x6, tvm.tir.IntImm) and x6.value == 4, "x6={}".format(x6) - y = (tvm.round((tvm.const(6.5, 'float32') - 1) / 1.5) + 2).astype('int') + y = (te.round((tvm.tir.const(6.5, 'float32') - 1) / 1.5) + 2).astype('int') assert isinstance(y, tvm.tir.IntImm) and y.value == 6 @@ -112,8 +113,8 @@ def verify_general_dtype_support(f, is_conditional=False): [('int32', 'int64'), 'int64'], [('uint32', 'int32'), 'int32']] for (lhs_dtype, rhs_dtype), out_dtype in rules: - lhs = tvm.var('lhs', dtype=lhs_dtype) - rhs = tvm.var('rhs', dtype=rhs_dtype) + lhs = te.var('lhs', dtype=lhs_dtype) + rhs = te.var('rhs', dtype=rhs_dtype) out = f(lhs, rhs) if not is_conditional: assert out.dtype == out_dtype @@ -132,8 +133,8 @@ def verify_general_dtype_support(f, is_conditional=False): def verify_callop_float_only(f): for lhs_dtype in ['int32', 'float32', 'float64']: for rhs_dtype in ['int32', 'float32', 'float64']: - lhs = tvm.var('lhs', dtype=lhs_dtype) - rhs = tvm.var('rhs', dtype=rhs_dtype) + lhs = te.var('lhs', dtype=lhs_dtype) + rhs = te.var('rhs', dtype=rhs_dtype) if 'float' not in lhs_dtype and 'float' not in rhs_dtype: check_throws(lambda: f(lhs, rhs)) elif 'float' in lhs_dtype and 'float' in rhs_dtype and lhs_dtype != rhs_dtype: @@ -153,36 +154,36 @@ def verify_callop_float_only(f): verify_general_dtype_support(lambda a, b: a * b) verify_general_dtype_support(lambda a, b: a >= b, is_conditional=True) verify_general_dtype_support(lambda a, b: a <= b, is_conditional=True) - verify_callop_float_only(lambda a, b: tvm.power(a, b)) + verify_callop_float_only(lambda a, b: te.power(a, b)) def test_if_then_else(): - cases = [[(tvm.var('cond', dtype='bool'), 'bool', 'int32'), 'int32'], + cases = [[(te.var('cond', dtype='bool'), 'bool', 'int32'), 'int32'], [(True, 'int32', 'float32'), 'float32'], [(False, 'int32', 'int64'), 'int64'], - [(tvm.var('cond', dtype='bool'), 'uint32', 'int32'), 'int32'], - [(tvm.var('cond', dtype='int32'), 'uint32', 'int32'), 'int32']] + [(te.var('cond', dtype='bool'), 'uint32', 'int32'), 'int32'], + [(te.var('cond', dtype='int32'), 'uint32', 'int32'), 'int32']] for (cond, lhs_dtype, rhs_dtype), out_dtype in cases: - lhs = tvm.var('lhs', dtype=lhs_dtype) - rhs = tvm.var('rhs', dtype=rhs_dtype) + lhs = te.var('lhs', dtype=lhs_dtype) + rhs = te.var('rhs', dtype=rhs_dtype) if cond is True or cond is False: - out = tvm.if_then_else(cond, lhs, rhs) - out2 = tvm.if_then_else(not cond, rhs, lhs) - out3 = tvm.if_then_else(not cond, lhs, rhs) - assert tvm.ir_pass.Equal(out, out2) == 1 + out = tvm.tir.if_then_else(cond, lhs, rhs) + out2 = tvm.tir.if_then_else(not cond, rhs, lhs) + out3 = tvm.tir.if_then_else(not cond, lhs, rhs) + assert tvm.tir.ir_pass.Equal(out, out2) == 1 if cond: - assert tvm.ir_pass.Equal(out, lhs.astype(out_dtype)) == 1 - assert tvm.ir_pass.Equal(out3, rhs.astype(out_dtype)) == 1 + assert tvm.tir.ir_pass.Equal(out, lhs.astype(out_dtype)) == 1 + assert tvm.tir.ir_pass.Equal(out3, rhs.astype(out_dtype)) == 1 else: - assert tvm.ir_pass.Equal(out, rhs.astype(out_dtype)) == 1 - assert tvm.ir_pass.Equal(out3, lhs.astype(out_dtype)) == 1 + assert tvm.tir.ir_pass.Equal(out, rhs.astype(out_dtype)) == 1 + assert tvm.tir.ir_pass.Equal(out3, lhs.astype(out_dtype)) == 1 elif cond.dtype == 'bool': - out = tvm.if_then_else(cond, lhs, rhs) + out = 
tvm.tir.if_then_else(cond, lhs, rhs) assert out.dtype == out_dtype assert out.args[1].dtype == out_dtype assert out.args[2].dtype == out_dtype elif cond.dtype != 'bool': - check_throws(lambda: tvm.if_then_else(cond, lhs, rhs)) + check_throws(lambda: tvm.tir.if_then_else(cond, lhs, rhs)) else: raise ValueError('Unknown combinations') diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py index e97e73a1d1cc..1691d7d11a7a 100644 --- a/tests/python/unittest/test_lang_reflection.py +++ b/tests/python/unittest/test_lang_reflection.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_const_saveload_json(): # save load json - x = tvm.const(1, "int32") - y = tvm.const(10, "int32") + x = tvm.tir.const(1, "int32") + y = tvm.tir.const(10, "int32") z = x + y z = z + z json_str = tvm.ir.save_json(z) @@ -29,11 +30,11 @@ def test_const_saveload_json(): def test_make_smap(): # save load json - x = tvm.const(1, "int32") - y = tvm.const(10, "int32") + x = tvm.tir.const(1, "int32") + y = tvm.tir.const(10, "int32") z = tvm.tir.Add(x, y) - smap = tvm.convert({"z": z, "x": x}) - json_str = tvm.ir.save_json(tvm.convert([smap])) + smap = tvm.runtime.convert({"z": z, "x": x}) + json_str = tvm.ir.save_json(tvm.runtime.convert([smap])) arr = tvm.ir.load_json(json_str) assert len(arr) == 1 assert arr[0]["z"].a == arr[0]["x"] @@ -43,7 +44,7 @@ def test_make_node(): x = tvm.ir.make_node("IntImm", dtype="int32", value=10) assert isinstance(x, tvm.tir.IntImm) assert x.value == 10 - A = tvm.placeholder((10, ), name='A') + A = te.placeholder((10, ), name='A') AA = tvm.ir.make_node("Tensor", shape=A.shape, dtype=A.dtype, @@ -81,9 +82,9 @@ def test_make_attrs(): def test_make_sum(): - A = tvm.placeholder((2, 10), name='A') - k = tvm.reduce_axis((0,10), "k") - B = tvm.compute((2,), lambda i: tvm.sum(A[i, k], axis=k), name="B") + A = te.placeholder((2, 10), name='A') + k = te.reduce_axis((0,10), "k") + B = te.compute((2,), lambda i: te.sum(A[i, k], axis=k), name="B") json_str = tvm.ir.save_json(B) BB = tvm.ir.load_json(json_str) assert B.op.body[0].combiner is not None diff --git a/tests/python/unittest/test_lang_schedule.py b/tests/python/unittest/test_lang_schedule.py index 10843f993d06..dae43bb2bbf2 100644 --- a/tests/python/unittest/test_lang_schedule.py +++ b/tests/python/unittest/test_lang_schedule.py @@ -16,17 +16,18 @@ # under the License. 
import pytest import tvm +from tvm import te import pickle as pkl def test_schedule_create(): - m = tvm.size_var('m') - n = tvm.size_var('n') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') - B = tvm.placeholder((n, l), name='B') - AA = tvm.compute((m, l), lambda i, j: A[i, j]) - T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k)) - s = tvm.create_schedule(T.op) + m = te.size_var('m') + n = te.size_var('n') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') + B = te.placeholder((n, l), name='B') + AA = te.compute((m, l), lambda i, j: A[i, j]) + T = te.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k)) + s = te.create_schedule(T.op) s[AA].set_scope("shared") xo, xi = s[T].split(T.op.axis[0], factor=10) xi1, xi2 = s[T].split(xi, factor=2) @@ -38,22 +39,22 @@ def test_schedule_create(): # save load json json_str = tvm.ir.save_json(s) s_loaded = tvm.ir.load_json(json_str) - assert isinstance(s_loaded, tvm.schedule.Schedule) + assert isinstance(s_loaded, tvm.te.schedule.Schedule) assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body)) # pickle unpickle dump = pkl.dumps(s) s_loaded = pkl.loads(dump) - assert isinstance(s_loaded, tvm.schedule.Schedule) + assert isinstance(s_loaded, tvm.te.schedule.Schedule) assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body)) def test_reorder(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') - T = tvm.compute(m, lambda i: A[i+1]) + m = te.size_var('m') + A = te.placeholder((m,), name='A') + T = te.compute(m, lambda i: A[i+1]) - s = tvm.create_schedule(T.op) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=10) xi1, xi2 = s[T].split(xi, factor=2) order = (xi2, xi1, xo) @@ -69,107 +70,107 @@ def test_reorder(): pass def test_split(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') - T = tvm.compute((m,), lambda i: A[i]) + m = te.size_var('m') + A = te.placeholder((m,), name='A') + T = te.compute((m,), lambda i: A[i]) - s = tvm.create_schedule(T.op) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=10) assert tuple(s[T].leaf_iter_vars) == (xo, xi) def test_tile(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A = tvm.placeholder((m, n), name='A') - T = tvm.compute((m, n), lambda i, j: A[i, j]) + m = te.size_var('m') + n = te.size_var('n') + A = te.placeholder((m, n), name='A') + T = te.compute((m, n), lambda i, j: A[i, j]) - s = tvm.create_schedule(T.op) + s = te.create_schedule(T.op) xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5) assert tuple(s[T].leaf_iter_vars) == (xo, yo, xi, yi) def test_fuse(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A = tvm.placeholder((m, n), name='A') - T = tvm.compute((m, n), lambda i, j: A[i, j]) + m = te.size_var('m') + n = te.size_var('n') + A = te.placeholder((m, n), name='A') + T = te.compute((m, n), lambda i, j: A[i, j]) - s = tvm.create_schedule(T.op) + s = te.create_schedule(T.op) xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5) fused = s[T].fuse(xo, yo) - assert any(isinstance(x, tvm.schedule.Fuse) for x in s[T].relations) + assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations) assert tuple(s[T].leaf_iter_vars) == (fused, xi, yi) def test_singleton(): print("test singleton") - A = tvm.placeholder((), name='A') - T = tvm.compute((), lambda : A() + 1) - s = tvm.create_schedule(T.op) + A = te.placeholder((), name='A') + T = te.compute((), lambda : A() + 1) + s = te.create_schedule(T.op) print("test singleton 
fin1") fused = s[T].fuse() - assert any(isinstance(x, tvm.schedule.Singleton) for x in s[T].relations) + assert any(isinstance(x, tvm.te.schedule.Singleton) for x in s[T].relations) assert tuple(s[T].leaf_iter_vars) == (fused,) dump = pkl.dumps(s) print("test singleton fin3") s_loaded = pkl.loads(dump) print("test singleton fin2") - assert isinstance(s_loaded, tvm.schedule.Schedule) + assert isinstance(s_loaded, tvm.te.schedule.Schedule) print("test singleton fin") def test_vectorize(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A = tvm.placeholder((m, n), name='A') - T = tvm.compute((m, n), lambda i, j: A[i, j]) + m = te.size_var('m') + n = te.size_var('n') + A = te.placeholder((m, n), name='A') + T = te.compute((m, n), lambda i, j: A[i, j]) - s = tvm.create_schedule(T.op) + s = te.create_schedule(T.op) xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5) s[T].vectorize(yi) s[T].unroll(xi) - UNROLL = tvm.schedule.IterVar.Unrolled - VECTORIZE = tvm.schedule.IterVar.Vectorized + UNROLL = tvm.te.schedule.IterVar.Unrolled + VECTORIZE = tvm.te.schedule.IterVar.Vectorized assert s[T].iter_var_attrs[xi].iter_type == UNROLL assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE @pytest.mark.xfail def test_vectorize_commreduce(): - V = tvm.placeholder((128,), name='V') - ax = tvm.reduce_axis((0, 128), name='ax') - O = tvm.compute((1,), lambda _: tvm.sum(V[ax], axis=[ax])) - s = tvm.create_schedule(O.op) + V = te.placeholder((128,), name='V') + ax = te.reduce_axis((0, 128), name='ax') + O = te.compute((1,), lambda _: te.sum(V[ax], axis=[ax])) + s = te.create_schedule(O.op) s[O].vectorize(ax) # should throw here def test_pragma(): m = 100 - A = tvm.placeholder((m,), name='A') - T = tvm.compute((m,), lambda i: A[i]) + A = te.placeholder((m,), name='A') + T = te.compute((m,), lambda i: A[i]) - s = tvm.create_schedule(T.op) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=10) s[T].pragma(xo, "pragma1") s[T].pragma(xi, "vectorize") - VECTORIZE = tvm.schedule.IterVar.Vectorized + VECTORIZE = tvm.te.schedule.IterVar.Vectorized assert s[T].iter_var_attrs[xo].pragma_keys[0].value == "pragma1" assert s[T].iter_var_attrs[xi].iter_type == VECTORIZE def test_rfactor(): - n = tvm.size_var('n') - k1 = tvm.reduce_axis((0, n), name="k1") - k2 = tvm.reduce_axis((0, n), name="k2") - A = tvm.placeholder((n, n, n), name='A') - B = tvm.compute((n, ), lambda i: tvm.sum(A[i, k1, k2], axis=[k1, k2])) + n = te.size_var('n') + k1 = te.reduce_axis((0, n), name="k1") + k2 = te.reduce_axis((0, n), name="k2") + A = te.placeholder((n, n, n), name='A') + B = te.compute((n, ), lambda i: te.sum(A[i, k1, k2], axis=[k1, k2])) # normal schedule - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) BF = s.rfactor(B, k1) assert(tuple(BF.shape) == (n, n)) assert(set(BF.op.body[0].axis) == set([k2])) assert(s[B].op.body[0].axis[0].dom.extent == n) assert(len(s[B].all_iter_vars) == 2) # schedule with splot - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) ko, ki = s[B].split(k1, factor=4) xo, xi = s[B].split(B.op.axis[0], factor=8) BF = s.rfactor(B, ki) @@ -179,7 +180,7 @@ def test_rfactor(): assert(BF.op.body[0].axis[1].var == ko.var) assert(s[B].op.body[0].axis[0].dom.extent.value == 4) # schedule with factor_axis - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) ko, ki = s[B].split(k1, factor=4) xo, xi = s[B].split(B.op.axis[0], factor=8) BF = s.rfactor(B, ki, 1) @@ -191,54 +192,54 @@ def test_rfactor(): def test_tensor_intrin(): n = 16 - x = 
tvm.placeholder((n,), name='x')
-    y = tvm.placeholder((n,), name='y')
-    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    x = te.placeholder((n,), name='x')
+    y = te.placeholder((n,), name='y')
+    z = te.compute(x.shape, lambda i: x[i] + y[i], name='z')
 
     def intrin_func(ins, outs):
-        assert(isinstance(ins[0], tvm.schedule.Buffer))
+        assert(isinstance(ins[0], tvm.te.schedule.Buffer))
         assert(ins[0].shape[0].value == n)
-        return tvm.call_packed("vadd", ins[0].data, outs[0].data, ins[0].shape[0])
-    intrin = tvm.decl_tensor_intrin(z.op, intrin_func)
+        return tvm.tir.call_packed("vadd", ins[0].data, outs[0].data, ins[0].shape[0])
+    intrin = te.decl_tensor_intrin(z.op, intrin_func)
     assert intrin.op == z.op
     assert intrin.reduce_init is None
     assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
     assert(intrin.buffers[0].shape[0].value == n)
     m = 32
-    x = tvm.placeholder((m,), name='x')
-    y = tvm.placeholder((m,), name='y')
-    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
-    s = tvm.create_schedule(z.op)
+    x = te.placeholder((m,), name='x')
+    y = te.placeholder((m,), name='y')
+    z = te.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    s = te.create_schedule(z.op)
     xo, xi = s[z].split(z.op.axis[0], factor=n)
     s[z].tensorize(xi, intrin)
     assert(s[z].iter_var_attrs[xi].tensor_intrin == intrin)
-    assert(s[z].iter_var_attrs[xi].iter_type == tvm.schedule.IterVar.Tensorized)
+    assert(s[z].iter_var_attrs[xi].iter_type == tvm.te.schedule.IterVar.Tensorized)
 
 def test_tensor_intrin_scalar_params():
-    n = tvm.size_var("n")
-    x = tvm.placeholder((n,), name='x')
-    v = tvm.size_var("v")
-    w = tvm.size_var("w")
-    z = tvm.compute((n,), lambda i: x[i]*v + w, name='z')
+    n = te.size_var("n")
+    x = te.placeholder((n,), name='x')
+    v = te.size_var("v")
+    w = te.size_var("w")
+    z = te.compute((n,), lambda i: x[i]*v + w, name='z')
 
     def intrin_func(ins, outs, sp):
-        assert(isinstance(ins[0], tvm.schedule.Buffer))
+        assert(isinstance(ins[0], tvm.te.schedule.Buffer))
         assert(ins[0].shape[0] == n)
         assert(sp[0] == v)
         assert(sp[1] == w)
-        return tvm.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1])
+        return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1])
 
-    with tvm.build_config(offset_factor=1):
-        intrin = tvm.decl_tensor_intrin(z.op, intrin_func, scalar_params=[v, w])
+    with tvm.target.build_config(offset_factor=1):
+        intrin = te.decl_tensor_intrin(z.op, intrin_func, scalar_params=[v, w])
     assert intrin.op == z.op
     assert intrin.reduce_init is None
     assert tuple(intrin.inputs) == tuple(z.op.input_tensors)
     assert(intrin.buffers[0].shape[0] == n)
     assert tuple(intrin.scalar_params) == tuple((v, w))
 
-    A = tvm.placeholder((10,10), name='A')
+    A = te.placeholder((10,10), name='A')
     # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs
-    C = tvm.compute((10,10), lambda i, j: intrin(i*i, A[i, j], i+j), name="C")
-    s = tvm.create_schedule(C.op)
+    C = te.compute((10,10), lambda i, j: intrin(i*i, A[i, j], i+j), name="C")
+    s = te.create_schedule(C.op)
     stmt = tvm.lower(s, [A, C], simple_mode=True)
     assert isinstance(stmt.body.body.body, tvm.tir.Evaluate)
     assert len(stmt.body.body.body.value.args) == 5
diff --git a/tests/python/unittest/test_lang_tag.py b/tests/python/unittest/test_lang_tag.py
index 201abf193eb4..6cfc0b12464e 100644
--- a/tests/python/unittest/test_lang_tag.py
+++ b/tests/python/unittest/test_lang_tag.py
@@ -16,32 +16,33 @@
 # under the License.
 
 import json
 import tvm
+from tvm import te
 
-@tvm.tag_scope(tag="conv")
+@tvm.te.tag_scope(tag="conv")
 def compute_conv(data, weight):
     N, IC, H, W = data.shape
     OC, IC, KH, KW = weight.shape
     OH = H - KH + 1
     OW = W - KW + 1
-    ic = tvm.reduce_axis((0, IC), name='ic')
-    dh = tvm.reduce_axis((0, KH), name='dh')
-    dw = tvm.reduce_axis((0, KW), name='dw')
+    ic = te.reduce_axis((0, IC), name='ic')
+    dh = te.reduce_axis((0, KH), name='dh')
+    dw = te.reduce_axis((0, KW), name='dw')
 
-    return tvm.compute((N, OC, OH, OW), lambda i, oc, h, w: \
-        tvm.sum(data[i, ic, h+dh, w+dw] * weight[oc, ic, dh, dw],
+    return te.compute((N, OC, OH, OW), lambda i, oc, h, w: \
+        te.sum(data[i, ic, h+dh, w+dw] * weight[oc, ic, dh, dw],
                axis=[ic, dh, dw]))
 
 def test_with():
-    n = tvm.size_var('n')
-    m = tvm.size_var('m')
-    l = tvm.size_var('l')
+    n = te.size_var('n')
+    m = te.size_var('m')
+    l = te.size_var('l')
 
-    A = tvm.placeholder((n, l), name='A')
-    B = tvm.placeholder((m, l), name='B')
-    with tvm.tag_scope(tag="gemm"):
-        k = tvm.reduce_axis((0, l), name='k')
-        C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k] * B[j, k], axis=k),
+    A = te.placeholder((n, l), name='A')
+    B = te.placeholder((m, l), name='B')
+    with tvm.te.tag_scope(tag="gemm"):
+        k = te.reduce_axis((0, l), name='k')
+        C = te.compute((n, m), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k),
                        attrs={"hello" : 1, "arr": [10, 12]})
 
     assert C.op.tag == 'gemm'
@@ -56,31 +57,31 @@ def test_with():
 
 def test_decorator():
-    n = tvm.size_var('n')
-    c = tvm.size_var('c')
-    h = tvm.size_var('h')
-    w = tvm.size_var('w')
-    kh = tvm.size_var('kh')
-    kw = tvm.size_var('kw')
+    n = te.size_var('n')
+    c = te.size_var('c')
+    h = te.size_var('h')
+    w = te.size_var('w')
+    kh = te.size_var('kh')
+    kw = te.size_var('kw')
 
-    A = tvm.placeholder((n, c, h, w), name='A')
-    B = tvm.placeholder((c, c, kh, kw), name='B')
+    A = te.placeholder((n, c, h, w), name='A')
+    B = te.placeholder((c, c, kh, kw), name='B')
     C = compute_conv(A, B)
     assert C.op.tag == 'conv'
     assert len(C.op.attrs) == 0
 
 def test_nested():
-    n = tvm.size_var('n')
-    c = tvm.size_var('c')
-    h = tvm.size_var('h')
-    w = tvm.size_var('w')
-    kh = tvm.size_var('kh')
-    kw = tvm.size_var('kw')
+    n = te.size_var('n')
+    c = te.size_var('c')
+    h = te.size_var('h')
+    w = te.size_var('w')
+    kh = te.size_var('kh')
+    kw = te.size_var('kw')
 
-    A = tvm.placeholder((n, c, h, w), name='A')
-    B = tvm.placeholder((c, c, kh, kw), name='B')
+    A = te.placeholder((n, c, h, w), name='A')
+    B = te.placeholder((c, c, kh, kw), name='B')
     try:
-        with tvm.tag_scope(tag='conv'):
+        with te.tag_scope(tag='conv'):
             C = compute_conv(A, B)
             assert False
     except ValueError:
diff --git a/tests/python/unittest/test_lang_target.py b/tests/python/unittest/test_lang_target.py
index 6da99f827047..da7bcee016e4 100644
--- a/tests/python/unittest/test_lang_target.py
+++ b/tests/python/unittest/test_lang_target.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 
 @tvm.target.generic_func
 def mygeneric(data):
diff --git a/tests/python/unittest/test_lang_tensor.py b/tests/python/unittest/test_lang_tensor.py
index 2de5e19c9e36..762b3fe75180 100644
--- a/tests/python/unittest/test_lang_tensor.py
+++ b/tests/python/unittest/test_lang_tensor.py
@@ -15,19 +15,20 @@
 # specific language governing permissions and limitations
 # under the License.
import tvm +from tvm import te from topi.nn.pooling import pool def test_tensor(): - m = tvm.size_var('m') - n = tvm.size_var('n') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') - B = tvm.placeholder((n, l), name='B') - T = tvm.compute((m, n, l), lambda i, j, k: A[i, k] * B[j, k]) + m = te.size_var('m') + n = te.size_var('n') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') + B = te.placeholder((n, l), name='B') + T = te.compute((m, n, l), lambda i, j, k: A[i, k] * B[j, k]) print(T) print(T.op.body) assert(tuple(T.shape) == (m, n, l)) - assert(isinstance(A.op, tvm.tensor.PlaceholderOp)) + assert(isinstance(A.op, tvm.te.PlaceholderOp)) assert(A == A) assert(T.op.output(0) == T) assert(T.op.output(0).__hash__() == T.__hash__()) @@ -37,68 +38,68 @@ def test_tensor(): def test_rank_zero(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') - scale = tvm.placeholder((), name='s') - k = tvm.reduce_axis((0, m), name="k") - T = tvm.compute((), lambda : tvm.sum(A[k] * scale(), axis=k)) + m = te.size_var('m') + A = te.placeholder((m,), name='A') + scale = te.placeholder((), name='s') + k = te.reduce_axis((0, m), name="k") + T = te.compute((), lambda : te.sum(A[k] * scale(), axis=k)) print(T) print(T.op.body) assert(tuple(T.shape) == ()) def test_conv1d(): - n = tvm.size_var('n') - A = tvm.placeholder((n+2), name='A') + n = te.size_var('n') + A = te.placeholder((n+2), name='A') def computeB(ii): i = ii + 1 return A[i-1] + A[i] + A[i+1] - B = tvm.compute(n, computeB) + B = te.compute(n, computeB) def test_tensor_slice(): - n = tvm.size_var('n') - A = tvm.compute((n, n), lambda i, j: 1) - B = tvm.compute((n,), lambda i: A[0][i] + A[0][i]) + n = te.size_var('n') + A = te.compute((n, n), lambda i, j: 1) + B = te.compute((n,), lambda i: A[0][i] + A[0][i]) def test_tensor_reduce_multi_axis(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A = tvm.placeholder((m, n), name='A') - k1 = tvm.reduce_axis((0, n), "k") - k2 = tvm.reduce_axis((0, m), "k") - C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=(k1, k2))) - C = tvm.compute((1,), lambda _: tvm.sum(A[k1, k2], axis=[k1, k2])) + m = te.size_var('m') + n = te.size_var('n') + A = te.placeholder((m, n), name='A') + k1 = te.reduce_axis((0, n), "k") + k2 = te.reduce_axis((0, m), "k") + C = te.compute((1,), lambda _: te.sum(A[k1, k2], axis=(k1, k2))) + C = te.compute((1,), lambda _: te.sum(A[k1, k2], axis=[k1, k2])) def test_tensor_comm_reducer(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A = tvm.placeholder((m, n), name='A') - k = tvm.reduce_axis((0, n), "k") - mysum = tvm.comm_reducer(lambda x, y: x+y, lambda t: tvm.const(0, dtype=t)) - C = tvm.compute((m,), lambda i: mysum(A[i, k], axis=k)) + m = te.size_var('m') + n = te.size_var('n') + A = te.placeholder((m, n), name='A') + k = te.reduce_axis((0, n), "k") + mysum = te.comm_reducer(lambda x, y: x+y, lambda t: tvm.tir.const(0, dtype=t)) + C = te.compute((m,), lambda i: mysum(A[i, k], axis=k)) def test_tensor_comm_reducer_overload(): - m = tvm.size_var('m') - n = tvm.size_var('n') - mysum = tvm.comm_reducer(lambda x, y: x+y, lambda t: tvm.const(0, dtype=t)) + m = te.size_var('m') + n = te.size_var('n') + mysum = te.comm_reducer(lambda x, y: x+y, lambda t: tvm.tir.const(0, dtype=t)) sum_res = mysum(m, n) def test_tensor_reduce(): - m = tvm.size_var('m') - n = tvm.size_var('n') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') - B = tvm.placeholder((n, l), name='B') - T = tvm.compute((m, n, l), lambda i, j, k: A[i, k] * B[j, k]) - rv = 
tvm.reduce_axis((0, A.shape[1]), "k") - C = tvm.compute((m, n), lambda i, j: tvm.sum(T(i, j, rv+1), axis=rv)) + m = te.size_var('m') + n = te.size_var('n') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') + B = te.placeholder((n, l), name='B') + T = te.compute((m, n, l), lambda i, j, k: A[i, k] * B[j, k]) + rv = te.reduce_axis((0, A.shape[1]), "k") + C = te.compute((m, n), lambda i, j: te.sum(T(i, j, rv+1), axis=rv)) # json load save C_json = tvm.ir.save_json(C) C_loaded = tvm.ir.load_json(C_json) - assert(isinstance(C_loaded, tvm.tensor.Tensor)) + assert(isinstance(C_loaded, te.tensor.Tensor)) assert(str(C_loaded) == str(C)) def test_tensor_compute1(): @@ -107,26 +108,26 @@ def test_tensor_compute1(): dtype = 'float32' def intrin_vadd(n): - x = tvm.placeholder((n,)) - y = tvm.placeholder((n,)) - z = tvm.compute(x.shape, lambda i: x[i] + y[i]) + x = te.placeholder((n,)) + y = te.placeholder((n,)) + z = te.compute(x.shape, lambda i: x[i] + y[i]) def intrin_func(ins, outs): - ib = tvm.ir_builder.create() - ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr'))) + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr'))) return ib.get() - with tvm.build_config(offset_factor=n): - return tvm.decl_tensor_intrin(z.op, intrin_func) + with tvm.target.build_config(offset_factor=n): + return te.decl_tensor_intrin(z.op, intrin_func) vadd = intrin_vadd(factor) - A = tvm.placeholder((m//factor, factor), name="A", dtype=dtype) - B = tvm.placeholder((m//factor, factor), name="B", dtype=dtype) - C = tvm.compute((m//factor, factor), + A = te.placeholder((m//factor, factor), name="A", dtype=dtype) + B = te.placeholder((m//factor, factor), name="B", dtype=dtype) + C = te.compute((m//factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor])) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) stmt = tvm.lower(s, [A, B, C], simple_mode=True) assert isinstance(stmt.body.body, tvm.tir.Evaluate) @@ -140,102 +141,102 @@ def test_tensor_compute2(): dtype = 'float32' def intrin_gemm(m, n, l): - k = tvm.reduce_axis((0, l)) - x = tvm.placeholder((m, l)) - y = tvm.placeholder((n, l)) + k = te.reduce_axis((0, l)) + x = te.placeholder((m, l)) + y = te.placeholder((n, l)) # in theory, no relation - z = tvm.compute((m, n), lambda i, j: tvm.sum(x[i][k] * y[j][k], axis=k)) + z = te.compute((m, n), lambda i, j: te.sum(x[i][k] * y[j][k], axis=k)) def intrin_func(ins, outs): x_ptr = ins[0].access_ptr("r") y_ptr = ins[1].access_ptr("r") z_ptr = outs[0].access_ptr("w") - body = tvm.call_packed( + body = tvm.tir.call_packed( "gemv", x_ptr, y_ptr, z_ptr, m, n, l) - reset = tvm.call_packed( + reset = tvm.tir.call_packed( "fill_zero", z_ptr, m, n) - update = tvm.call_packed( + update = tvm.tir.call_packed( "gemv_add", x_ptr, y_ptr, z_ptr, m, n, l) return body, reset, update - with tvm.build_config(offset_factor=n): - return tvm.decl_tensor_intrin(z.op, intrin_func) + with tvm.target.build_config(offset_factor=n): + return te.decl_tensor_intrin(z.op, intrin_func) vgemm = intrin_gemm(factor1, factor2, factor) - A = tvm.placeholder((M//factor1, L//factor, factor1, factor), name="A", dtype=dtype) - B = tvm.placeholder((N//factor2, L//factor, factor2, factor), name="B", dtype=dtype) - k = tvm.reduce_axis((0, L//factor), name='k') - C = tvm.compute((M//factor1, N//factor2, factor1, factor2), + A = te.placeholder((M//factor1, L//factor, factor1, 
factor), name="A", dtype=dtype) + B = te.placeholder((N//factor2, L//factor, factor2, factor), name="B", dtype=dtype) + k = te.reduce_axis((0, L//factor), name='k') + C = te.compute((M//factor1, N//factor2, factor1, factor2), lambda i, j: vgemm(A[i, k, 0:factor1, 0:factor], B[j, k, 0:factor2, 0:factor], reduce_axis=k)) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) stmt = tvm.lower(s, [A, B, C], simple_mode=True) assert isinstance(stmt.body.body.body[0], tvm.tir.Evaluate) assert isinstance(stmt.body.body.body[1].body, tvm.tir.Evaluate) def test_tensor_scan(): - m = tvm.size_var("m") - n = tvm.size_var("n") - x = tvm.placeholder((m, n)) - s = tvm.placeholder((m, n)) - res = tvm.scan(tvm.compute((1, n), lambda _, i: x[0, i]), - tvm.compute((m, n), lambda t, i: s[t-1, i] + x[t, i]), + m = te.size_var("m") + n = te.size_var("n") + x = te.placeholder((m, n)) + s = te.placeholder((m, n)) + res = tvm.te.scan(te.compute((1, n), lambda _, i: x[0, i]), + te.compute((m, n), lambda t, i: s[t-1, i] + x[t, i]), s) assert tuple(res.shape) == (m, n) def test_scan_multi_out(): - m = tvm.size_var("m") - n = tvm.size_var("n") - x1 = tvm.placeholder((m, n)) - s1 = tvm.placeholder((m, n)) - x2 = tvm.placeholder((m, n)) - s2 = tvm.placeholder((m, n)) - s1_init = tvm.compute((1, n), lambda _, i: x1[0, i]) - s2_init = tvm.compute((1, n), lambda _, i: x2[0, i]) - s1_update = tvm.compute((m, n), lambda t, i: s1[t-1, i] + s2[t-1, i] + x1[t, i]) - s2_update = tvm.compute((m, n), lambda t, i: x2[t, i] + s2[t-1,i]) - - r0, r1 = tvm.scan([s1_init, s2_init], + m = te.size_var("m") + n = te.size_var("n") + x1 = te.placeholder((m, n)) + s1 = te.placeholder((m, n)) + x2 = te.placeholder((m, n)) + s2 = te.placeholder((m, n)) + s1_init = te.compute((1, n), lambda _, i: x1[0, i]) + s2_init = te.compute((1, n), lambda _, i: x2[0, i]) + s1_update = te.compute((m, n), lambda t, i: s1[t-1, i] + s2[t-1, i] + x1[t, i]) + s2_update = te.compute((m, n), lambda t, i: x2[t, i] + s2[t-1,i]) + + r0, r1 = tvm.te.scan([s1_init, s2_init], [s1_update, s2_update], [s1, s2]) assert(r0.value_index == 0) assert(r1.value_index == 1) json_str = tvm.ir.save_json(r0.op) zz = tvm.ir.load_json(json_str) - assert isinstance(zz, tvm.tensor.ScanOp) + assert isinstance(zz, tvm.te.ScanOp) def test_extern(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') + m = te.size_var('m') + A = te.placeholder((m,), name='A') def extern_func(ins, outs): - assert(isinstance(ins[0], tvm.schedule.Buffer)) - return tvm.call_packed("myadd", ins[0].data, outs[0].data, m) - B = tvm.extern((m,), [A], extern_func) + assert(isinstance(ins[0], tvm.te.schedule.Buffer)) + return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, m) + B = te.extern((m,), [A], extern_func) assert(tuple(B.shape) == (m,)) def test_extern_multi_out(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') - B = tvm.compute((m,), lambda i: A[i] * 10) + m = te.size_var('m') + A = te.placeholder((m,), name='A') + B = te.compute((m,), lambda i: A[i] * 10) def extern_func(ins, outs): - assert(isinstance(ins[0], tvm.schedule.Buffer)) - return tvm.call_packed( + assert(isinstance(ins[0], tvm.te.schedule.Buffer)) + return tvm.tir.call_packed( "myadd", ins[0].data, outs[0].data, outs[1].data, m) - res = tvm.extern([A.shape, A.shape], [A, B], extern_func) + res = te.extern([A.shape, A.shape], [A, B], extern_func) assert(len(res) == 2) assert(res[1].value_index == 1) def test_tuple_inputs(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A0 = tvm.placeholder((m, n), 
name='A0') - A1 = tvm.placeholder((m, n), name='A1') - T0, T1 = tvm.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name='T') - s = tvm.create_schedule(T0.op) + m = te.size_var('m') + n = te.size_var('n') + A0 = te.placeholder((m, n), name='A0') + A1 = te.placeholder((m, n), name='A1') + T0, T1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name='T') + s = te.create_schedule(T0.op) for i in range(len(T0.shape)): assert(T0.shape[i] == T1.shape[i]) @@ -244,58 +245,58 @@ def test_tuple_inputs(): assert(T1.value_index == 1) def test_tuple_with_different_deps(): - m = tvm.size_var('m') - n = tvm.size_var('n') - A0 = tvm.placeholder((m, n), name='A1') - A1 = tvm.placeholder((m, n), name='A2') - B0, B1 = tvm.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name='B') - C = tvm.compute((m, n), lambda i, j: B0[i, j] + 4, name='C') - - s = tvm.create_schedule(C.op) + m = te.size_var('m') + n = te.size_var('n') + A0 = te.placeholder((m, n), name='A1') + A1 = te.placeholder((m, n), name='A2') + B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name='B') + C = te.compute((m, n), lambda i, j: B0[i, j] + 4, name='C') + + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=10) s[B0.op].compute_at(s[C], xo) sch = s.normalize() - bounds = tvm.schedule.InferBound(sch) - stmt = tvm.schedule.ScheduleOps(sch, bounds) + bounds = tvm.te.schedule.InferBound(sch) + stmt = tvm.te.schedule.ScheduleOps(sch, bounds) def get_B1_realize(x): if isinstance(x, tvm.tir.Realize) and \ x.func == B1.op and x.value_index == 1: ret.append(x) ret = [] - tvm.ir_pass.PostOrderVisit(stmt, get_B1_realize) + tvm.tir.ir_pass.PostOrderVisit(stmt, get_B1_realize) assert stmt.node == C.op and len(ret) == 1 def test_tensor_inputs(): - x = tvm.placeholder((1,), name='x') - y = tvm.compute(x.shape, lambda i: x[i] + x[i]) + x = te.placeholder((1,), name='x') + y = te.compute(x.shape, lambda i: x[i] + x[i]) assert tuple(y.op.input_tensors) == (x,) def test_tensor_pool(): def intrin_pool(): - A = tvm.placeholder((64, 16, 16), name='A') - kh = tvm.reduce_axis((0, 3), name='kh') - kw = tvm.reduce_axis((0, 3), name='kw') - P = tvm.compute((64, 14, 14), - lambda c, oh, ow: tvm.max(A[c, oh + kh, ow + kw], + A = te.placeholder((64, 16, 16), name='A') + kh = te.reduce_axis((0, 3), name='kh') + kw = te.reduce_axis((0, 3), name='kw') + P = te.compute((64, 14, 14), + lambda c, oh, ow: tvm.te.max(A[c, oh + kh, ow + kw], axis=[kh, kw]), name='p') def intrin_func(ins, outs): dinp = ins[0] dout = outs[0] - return tvm.call_packed("op", dinp, dout) + return tvm.tir.call_packed("op", dinp, dout) - with tvm.build_config(offset_factor=1): - return tvm.decl_tensor_intrin(P.op, intrin_func) + with tvm.target.build_config(offset_factor=1): + return te.decl_tensor_intrin(P.op, intrin_func) - A = tvm.placeholder((1, 64, 16, 16), name='A') + A = te.placeholder((1, 64, 16, 16), name='A') P = pool(data=A, kernel=(3, 3), stride=(1, 1), padding=(0, 0, 0, 0), pool_type='max') - s = tvm.create_schedule(P.op) + s = te.create_schedule(P.op) _, oh, _, _ = P.op.axis intrin = intrin_pool() s[P].tensorize(oh, intrin) diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py index 01c0d26dfc9b..2e4696298919 100644 --- a/tests/python/unittest/test_lang_tensor_overload_op.py +++ b/tests/python/unittest/test_lang_tensor_overload_op.py @@ -16,6 +16,7 @@ # under the License. 
import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -23,27 +24,27 @@ def test_operator_type_and_tags(): k = 1 - n = tvm.var('n') - A = tvm.placeholder((), name='A') - B = tvm.placeholder((10, 5), name='B') + n = te.var('n') + A = te.placeholder((), name='A') + B = te.placeholder((10, 5), name='B') B1 = B[0] B2 = B[0,0] assert isinstance(k + n, tvm.tir.PrimExpr) assert isinstance(n + n, tvm.tir.PrimExpr) - assert isinstance(k + A, tvm.tensor.Tensor) - assert isinstance(A + k, tvm.tensor.Tensor) - assert isinstance(n + A, tvm.tensor.Tensor) - assert isinstance(A + n, tvm.tensor.Tensor) - assert isinstance(A + A, tvm.tensor.Tensor) - - assert isinstance(k + B, tvm.tensor.Tensor) - assert isinstance(B + k, tvm.tensor.Tensor) - assert isinstance(n + B, tvm.tensor.Tensor) - assert isinstance(B + n, tvm.tensor.Tensor) - assert isinstance(A + B, tvm.tensor.Tensor) - assert isinstance(B + A, tvm.tensor.Tensor) - assert isinstance(B + B, tvm.tensor.Tensor) + assert isinstance(k + A, te.tensor.Tensor) + assert isinstance(A + k, te.tensor.Tensor) + assert isinstance(n + A, te.tensor.Tensor) + assert isinstance(A + n, te.tensor.Tensor) + assert isinstance(A + A, te.tensor.Tensor) + + assert isinstance(k + B, te.tensor.Tensor) + assert isinstance(B + k, te.tensor.Tensor) + assert isinstance(n + B, te.tensor.Tensor) + assert isinstance(B + n, te.tensor.Tensor) + assert isinstance(A + B, te.tensor.Tensor) + assert isinstance(B + A, te.tensor.Tensor) + assert isinstance(B + B, te.tensor.Tensor) assert (k + B).op.tag == topi.tag.ELEMWISE assert (B + k).op.tag == topi.tag.ELEMWISE @@ -58,22 +59,22 @@ def test_operator_type_and_tags(): assert isinstance(n + B2, tvm.tir.PrimExpr) assert isinstance(B2 + n, tvm.tir.PrimExpr) assert isinstance(B2 + B2, tvm.tir.PrimExpr) - assert isinstance(B2 + A, tvm.tensor.Tensor) - assert isinstance(A + B2, tvm.tensor.Tensor) - assert isinstance(B2 + B, tvm.tensor.Tensor) - assert isinstance(B + B2, tvm.tensor.Tensor) + assert isinstance(B2 + A, te.tensor.Tensor) + assert isinstance(A + B2, te.tensor.Tensor) + assert isinstance(B2 + B, te.tensor.Tensor) + assert isinstance(B + B2, te.tensor.Tensor) def test_combination(): k = 3 n = 5 m = 10 - x = tvm.var('x') - A = tvm.placeholder((n, m), name='A') - B = tvm.placeholder((n, m), name='B') - C = tvm.placeholder((n, m), name='C') + x = te.var('x') + A = te.placeholder((n, m), name='A') + B = te.placeholder((n, m), name='B') + C = te.placeholder((n, m), name='C') D = k + A - B * C + x - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) foo = tvm.build(s, [x, A, B, C, D], "llvm") ctx = tvm.cpu(0) x = 2 @@ -87,9 +88,9 @@ def test_combination(): def verify_tensor_scalar_bop(shape, typ="add"): """Verify non-constant Tensor and scalar binary operations.""" - sh = [tvm.size_var('n%d' % i) for i in range(0, len(shape))] - k = tvm.var('k') - A = tvm.placeholder(sh, name='A') + sh = [te.size_var('n%d' % i) for i in range(0, len(shape))] + k = te.var('k') + A = te.placeholder(sh, name='A') if typ == "add": B = A + k elif typ == "sub": @@ -134,8 +135,8 @@ def check_device(device): def verify_broadcast_bop(lhs_shape, rhs_shape, typ="add"): - A = tvm.placeholder(shape=lhs_shape, name="A") - B = tvm.placeholder(shape=rhs_shape, name="B") + A = te.placeholder(shape=lhs_shape, name="A") + B = te.placeholder(shape=rhs_shape, name="B") if typ == "add": C = A + B elif typ == "sub": @@ -195,8 +196,8 @@ def check_device(device): k = 10.0 dilation = (1, 1) with 
tvm.target.create(device): - A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + A = te.placeholder((batch, in_channel, in_size, in_size), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = conv2d_nchw(A, W, stride, padding, dilation, A.dtype) if typ == "add": C = B + k diff --git a/tests/python/unittest/test_lang_verify_compute.py b/tests/python/unittest/test_lang_verify_compute.py index 6d17a0ce2372..4231f481d88d 100644 --- a/tests/python/unittest/test_lang_verify_compute.py +++ b/tests/python/unittest/test_lang_verify_compute.py @@ -15,38 +15,39 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_verify_compute(): - n = tvm.size_var("n") - m = tvm.size_var("m") - A = tvm.placeholder((n, m), name='A') - k = tvm.reduce_axis((0, m), "k") - k_ = tvm.reduce_axis((0, m-1), "k_") - f1 = lambda i: tvm.sum(A[i, k], axis=k) + n = te.size_var("n") + m = te.size_var("m") + A = te.placeholder((n, m), name='A') + k = te.reduce_axis((0, m), "k") + k_ = te.reduce_axis((0, m-1), "k_") + f1 = lambda i: te.sum(A[i, k], axis=k) f2 = lambda i: A[i,0] + 1 - f3 = lambda i: tvm.sum(A[i, k], axis=k) + 1 - f4 = lambda i: A[i,0] * (tvm.sum(A[i, k], axis=k) + 1) - f5 = lambda i: (tvm.sum(A[i, k], axis=k), A[i,0] + 1) - f6 = lambda i: (tvm.sum(A[i, k], axis=k), tvm.sum(A[i, k_], axis=k_)) + f3 = lambda i: te.sum(A[i, k], axis=k) + 1 + f4 = lambda i: A[i,0] * (te.sum(A[i, k], axis=k) + 1) + f5 = lambda i: (te.sum(A[i, k], axis=k), A[i,0] + 1) + f6 = lambda i: (te.sum(A[i, k], axis=k), te.sum(A[i, k_], axis=k_)) # # Valid compute try: - B = tvm.compute((n,), f1, name="B") + B = te.compute((n,), f1, name="B") except tvm._ffi.base.TVMError as ex: assert False # # Valid compute try: - B = tvm.compute((n,), f2, name="B") + B = te.compute((n,), f2, name="B") except tvm._ffi.base.TVMError as ex: assert False # # Invalid compute with non top level reduction try: - B = tvm.compute((n,), f3, name="B") + B = te.compute((n,), f3, name="B") assert False except tvm._ffi.base.TVMError as ex: pass @@ -54,7 +55,7 @@ def test_verify_compute(): # # Invalid compute with non top level reduction try: - B = tvm.compute((n,), f4, name="B") + B = te.compute((n,), f4, name="B") assert False except tvm._ffi.base.TVMError as ex: pass @@ -62,7 +63,7 @@ def test_verify_compute(): # # Invalid compute with reduction and non-reduction batch ops try: - B0, B1 = tvm.compute((n,), f5, name="B") + B0, B1 = te.compute((n,), f5, name="B") assert False except tvm._ffi.base.TVMError as ex: pass @@ -70,7 +71,7 @@ def test_verify_compute(): # # Invalid compute with unequal batch reduction ops try: - B0, B1 = tvm.compute((n,), f6, name="B") + B0, B1 = te.compute((n,), f6, name="B") assert False except tvm._ffi.base.TVMError as ex: pass diff --git a/tests/python/unittest/test_pass_attrs_hash_equal.py b/tests/python/unittest/test_pass_attrs_hash_equal.py index 2bd94e0d5cab..b3587cd7cb3d 100644 --- a/tests/python/unittest/test_pass_attrs_hash_equal.py +++ b/tests/python/unittest/test_pass_attrs_hash_equal.py @@ -15,33 +15,34 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te def test_attrs_equal(): x = tvm.ir.make_node("attrs.TestAttrs", name="xx", padding=(3, 4)) y = tvm.ir.make_node("attrs.TestAttrs", name="xx", padding=(3, 4)) z = tvm.ir.make_node("attrs.TestAttrs", name="xx", padding=(3,4,1)) - assert tvm.ir_pass.AttrsEqual(x, y) - assert not tvm.ir_pass.AttrsEqual(x, z) + assert tvm.tir.ir_pass.AttrsEqual(x, y) + assert not tvm.tir.ir_pass.AttrsEqual(x, z) dattr = tvm.ir.make_node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0)) - assert not tvm.ir_pass.AttrsEqual(dattr, x) + assert not tvm.tir.ir_pass.AttrsEqual(dattr, x) dattr2 = tvm.ir.make_node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0)) - assert tvm.ir_pass.AttrsEqual(dattr, dattr2) + assert tvm.tir.ir_pass.AttrsEqual(dattr, dattr2) - assert tvm.ir_pass.AttrsEqual({"x": x}, {"x": y}) + assert tvm.tir.ir_pass.AttrsEqual({"x": x}, {"x": y}) # array related checks - assert tvm.ir_pass.AttrsEqual({"x": [x, x]}, {"x": [y, x]}) - assert not tvm.ir_pass.AttrsEqual({"x": [x, 1]}, {"x": [y, 2]}) + assert tvm.tir.ir_pass.AttrsEqual({"x": [x, x]}, {"x": [y, x]}) + assert not tvm.tir.ir_pass.AttrsEqual({"x": [x, 1]}, {"x": [y, 2]}) - n = tvm.var("n") - assert tvm.ir_pass.AttrsEqual({"x": n+1}, {"x": n+1}) + n = te.var("n") + assert tvm.tir.ir_pass.AttrsEqual({"x": n+1}, {"x": n+1}) def test_attrs_hash(): - fhash = tvm.ir_pass.AttrsHash + fhash = tvm.tir.ir_pass.AttrsHash x = tvm.ir.make_node("attrs.TestAttrs", name="xx", padding=(3, 4)) y = tvm.ir.make_node("attrs.TestAttrs", name="xx", padding=(3, 4)) assert fhash({"x": x}) == fhash({"x": y}) diff --git a/tests/python/unittest/test_pass_basic.py b/tests/python/unittest/test_pass_basic.py index 93c815a4a21b..f7eaa217683b 100644 --- a/tests/python/unittest/test_pass_basic.py +++ b/tests/python/unittest/test_pass_basic.py @@ -15,41 +15,42 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te def test_simplify(): - tdiv = tvm.truncdiv - tmod = tvm.truncmod - x = tvm.var('x') - e1 = tvm.ir_pass.Simplify(x + 2 + 1) - assert(tvm.ir_pass.Equal(e1, x + 3)) - e2 = tvm.ir_pass.Simplify(x * 3 + 5 * x) - assert(tvm.ir_pass.Equal(e2, x * 8)) - e3 = tvm.ir_pass.Simplify(x - tdiv(x, 3) * 3) - assert(tvm.ir_pass.Equal(e3, tmod(x, 3))) + tdiv = tvm.tir.truncdiv + tmod = tvm.tir.truncmod + x = te.var('x') + e1 = tvm.tir.ir_pass.Simplify(x + 2 + 1) + assert(tvm.tir.ir_pass.Equal(e1, x + 3)) + e2 = tvm.tir.ir_pass.Simplify(x * 3 + 5 * x) + assert(tvm.tir.ir_pass.Equal(e2, x * 8)) + e3 = tvm.tir.ir_pass.Simplify(x - tdiv(x, 3) * 3) + assert(tvm.tir.ir_pass.Equal(e3, tmod(x, 3))) def test_verify_ssa(): - x = tvm.var('x') - y = tvm.var() + x = te.var('x') + y = te.var() z = tvm.tir.Evaluate(x + y) - assert(tvm.ir_pass.VerifySSA(z)) + assert(tvm.tir.ir_pass.VerifySSA(z)) def test_convert_ssa(): - x = tvm.var('x') - y = tvm.var() + x = te.var('x') + y = te.var() let1 = tvm.tir.Let(x, 1, x + 1) let2 = tvm.tir.Let(x, 1, x + y) z = tvm.tir.Evaluate(let1 + let2) - assert(not tvm.ir_pass.VerifySSA(z)) - z_ssa = tvm.ir_pass.ConvertSSA(z) - assert(tvm.ir_pass.VerifySSA(z_ssa)) + assert(not tvm.tir.ir_pass.VerifySSA(z)) + z_ssa = tvm.tir.ir_pass.ConvertSSA(z) + assert(tvm.tir.ir_pass.VerifySSA(z_ssa)) def test_expr_use_var(): - x = tvm.var('x') - assert(tvm.ir_pass.ExprUseVar(x+1, x)) - assert(not tvm.ir_pass.ExprUseVar(1+10, x)) + x = te.var('x') + assert(tvm.tir.ir_pass.ExprUseVar(x+1, x)) + assert(not tvm.tir.ir_pass.ExprUseVar(1+10, x)) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_bound_checkers.py b/tests/python/unittest/test_pass_bound_checkers.py index 6b959e0d8da7..b3390972ab00 100644 --- a/tests/python/unittest/test_pass_bound_checkers.py +++ b/tests/python/unittest/test_pass_bound_checkers.py @@ -16,41 +16,42 @@ # under the License. 
import pytest import tvm +from tvm import te import numpy as np def collect_visit(stmt, f): ret = [] - tvm.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x))) + tvm.tir.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x))) return ret def lower(sch, args): binds = {} arg_list = [] for x in args: - if isinstance(x, tvm.tensor.Tensor): - buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name) + if isinstance(x, te.tensor.Tensor): + buf = tvm.tir.decl_buffer(x.shape, dtype=x.dtype, name=x.name) assert x not in binds binds[x] = buf arg_list.append(buf) else: raise ValueError("args must be Tensor, Buffer or Var") sch = sch.normalize() - bounds = tvm.schedule.InferBound(sch) - stmt = tvm.schedule.ScheduleOps(sch, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, True) - stmt = tvm.ir_pass.RemoveNoOp(stmt) - stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64, True) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.VectorizeLoop(stmt) - stmt = tvm.ir_pass.Simplify(stmt) + bounds = tvm.te.schedule.InferBound(sch) + stmt = tvm.te.schedule.ScheduleOps(sch, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, True) + stmt = tvm.tir.ir_pass.RemoveNoOp(stmt) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, binds, 64, True) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) return stmt @pytest.mark.xfail def test_out_of_bounds_llvm(index_a, index_b): - n = tvm.size_var("n") - A = tvm.placeholder ((n,), name='A') - B = tvm.placeholder ((n,), name='B') - C = tvm.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name='C') - s = tvm.create_schedule (C.op) + n = te.size_var("n") + A = te.placeholder ((n,), name='A') + B = te.placeholder ((n,), name='B') + C = te.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name='C') + s = te.create_schedule (C.op) tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower (s, [A, B, C], simple_mode=True) @@ -63,11 +64,11 @@ def test_out_of_bounds_llvm(index_a, index_b): fadd (a, b, c) def test_in_bounds_llvm(): - n = tvm.size_var("n") - A = tvm.placeholder ((n,), name='A') - B = tvm.placeholder ((n,), name='B') - C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C') - s = tvm.create_schedule (C.op) + n = te.size_var("n") + A = te.placeholder ((n,), name='A') + B = te.placeholder ((n,), name='B') + C = te.compute(A.shape, lambda i: A[i] + B[i], name='C') + s = te.create_schedule (C.op) tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower (s, [A, B, C], simple_mode=True) @@ -81,11 +82,11 @@ def test_in_bounds_llvm(): @pytest.mark.xfail def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b): - n = tvm.convert(nn) - a = tvm.placeholder((n), name='a') - b = tvm.placeholder((n), name='b') - c = tvm.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name='c') - s = tvm.create_schedule(c.op) + n = tvm.runtime.convert(nn) + a = te.placeholder((n), name='a') + b = te.placeholder((n), name='b') + c = te.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name='c') + s = te.create_schedule(c.op) xo, xi = s[c].split(c.op.axis[0], factor=8) s[c].parallel(xo) s[c].vectorize(xi) @@ -104,10 +105,10 @@ def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b): def test_in_bounds_vectorize_llvm(): n = 512 lanes = 2 - A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes) - B = tvm.compute((n,), lambda i: A[i], name='B') - C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C') - s = tvm.create_schedule(C.op) + A = te.placeholder((n,), 
name='A', dtype="float32x%d" % lanes) + B = te.compute((n,), lambda i: A[i], name='B') + C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name='C') + s = te.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], nparts=2) _, xi = s[C].split(xi, factor=2) s[C].parallel(xo) @@ -128,12 +129,12 @@ def test_in_bounds_vectorize_llvm(): tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1) def test_in_bounds_loop_partition_basic_llvm(): - n = tvm.size_var('n') - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + n = te.size_var('n') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i]+B[i]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i]+B[i]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) @@ -147,12 +148,12 @@ def test_in_bounds_loop_partition_basic_llvm(): @pytest.mark.xfail def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b): - n = tvm.size_var('n') - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + n = te.size_var('n') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i + index_a]+B[i + index_b]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i + index_a]+B[i + index_b]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) @@ -187,20 +188,20 @@ def collect_branch_stmt (x): branch_collector.append(x) n = 21 - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i]+B[i]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i]+B[i]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) stmt = lower (s, [A, B, T]) # num_attributes = num_buffers * num_splits = 2 * 3 # before instrumentation assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3) assert_bound_instrumentation(stmt, check_branch_stmt, 0) - stmt = tvm.ir_pass.InstrumentBoundCheckers(stmt) + stmt = tvm.tir.ir_pass.InstrumentBoundCheckers(stmt) # after instrumentation assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3) assert_bound_instrumentation(stmt, check_branch_stmt, 2) @@ -212,13 +213,13 @@ def collect_branch_stmt (x): print (branch_collector[1].condition) def test_in_bounds_const_loop_partition_llvm(): - with tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True): + with tvm.target.build_config(instrument_bound_checkers=True, partition_const_loop=True): n = 21 - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i]+B[i]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i]+B[i]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) @@ -232,13 +233,13 @@ def test_in_bounds_const_loop_partition_llvm(): @pytest.mark.xfail def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b): - with 
tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True): + with tvm.target.build_config(instrument_bound_checkers=True, partition_const_loop=True): n = 21 - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i + index_a]+B[i + index_b]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i + index_a]+B[i + index_b]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) @@ -258,18 +259,18 @@ def test_in_bounds_conv_llvm(loop_tiling=False): batch_size = 1 in_height = in_width = 64 out_height = out_width = in_height - kernel_height + 1 - data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data') - kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, + data = te.placeholder((batch_size, in_channel, in_height, in_width), name='data') + kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel') - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - conv = tvm.compute((batch_size, out_channel, out_height, out_width), - lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * + ic = te.reduce_axis((0, in_channel), name='ic') + kh = te.reduce_axis((0, kernel_height), name='kh') + kw = te.reduce_axis((0, kernel_width), name='kw') + conv = te.compute((batch_size, out_channel, out_height, out_width), + lambda n, oc, oh, ow: te.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]), name="conv2d") - s = tvm.create_schedule(conv.op) + s = te.create_schedule(conv.op) n, oc, oh, ow = conv.op.axis if loop_tiling: @@ -280,10 +281,10 @@ def test_in_bounds_conv_llvm(loop_tiling=False): f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array(np.random.uniform( - size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx) + size=(batch_size, in_channel, in_height, in_width)).astype("float32"), ctx) kernel_input = tvm.nd.array(np.random.uniform( - size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx) - conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx) + size=(kernel_height, kernel_width, in_channel, out_channel)).astype("float32"), ctx) + conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), "float32", ctx) f(data_input, kernel_input, conv_out) @pytest.mark.xfail @@ -295,14 +296,14 @@ def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False batch_size = 1 in_height = in_width = 64 out_height = out_width = in_height - kernel_height + 1 - data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data') - kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, + data = te.placeholder((batch_size, in_channel, in_height, in_width), name='data') + kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel') - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - conv = tvm.compute((batch_size, out_channel, out_height, out_width), - lambda n, oc, oh, ow: tvm.sum(data[n + data_offsets[0], + ic = 
te.reduce_axis((0, in_channel), name='ic') + kh = te.reduce_axis((0, kernel_height), name='kh') + kw = te.reduce_axis((0, kernel_width), name='kw') + conv = te.compute((batch_size, out_channel, out_height, out_width), + lambda n, oc, oh, ow: te.sum(data[n + data_offsets[0], ic + data_offsets[1], oh*HSTR + kh + data_offsets[2], ow*WSTR + kw + data_offsets[3]] @@ -313,7 +314,7 @@ def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False oc + kernel_offsets[3]], axis=[ic, kh, kw]), name="conv2d") - s = tvm.create_schedule(conv.op) + s = te.create_schedule(conv.op) n, oc, oh, ow = conv.op.axis if loop_tiling: @@ -324,21 +325,21 @@ def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False f = tvm.build(s, [data, kernel, conv], "llvm") data_input = tvm.nd.array(np.random.uniform( - size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx) + size=(batch_size, in_channel, in_height, in_width)).astype("float32"), ctx) kernel_input = tvm.nd.array(np.random.uniform( - size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx) - conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx) + size=(kernel_height, kernel_width, in_channel, out_channel)).astype("float32"), ctx) + conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), "float32", ctx) f(data_input, kernel_input, conv_out) def test_in_bounds_tensors_with_same_shapes1D_llvm(): - n = tvm.size_var('n') - k = tvm.size_var('k') - m = tvm.size_var('m') - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((k, ), name='B') - - T = tvm.compute((m, ), lambda i: A[i]*B[i]) - s = tvm.create_schedule(T.op) + n = te.size_var('n') + k = te.size_var('k') + m = te.size_var('m') + A = te.placeholder((n, ), name='A') + B = te.placeholder((k, ), name='B') + + T = te.compute((m, ), lambda i: A[i]*B[i]) + s = te.create_schedule(T.op) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) ctx = tvm.cpu(0) @@ -351,14 +352,14 @@ def test_in_bounds_tensors_with_same_shapes1D_llvm(): @pytest.mark.xfail def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape): - n = tvm.size_var('n') - k = tvm.size_var('k') - m = tvm.size_var('m') - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((k, ), name='B') - - T = tvm.compute((m, ), lambda i: A[i]*B[i]) - s = tvm.create_schedule(T.op) + n = te.size_var('n') + k = te.size_var('k') + m = te.size_var('m') + A = te.placeholder((n, ), name='A') + B = te.placeholder((k, ), name='B') + + T = te.compute((m, ), lambda i: A[i]*B[i]) + s = te.create_schedule(T.op) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) ctx = tvm.cpu(0) @@ -370,14 +371,14 @@ def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape f(a, b, t) def test_in_bounds_tensors_with_same_shapes2D_llvm(): - n = tvm.size_var('n') - k = tvm.size_var('k') - m = tvm.size_var('m') - A = tvm.placeholder((n, n), name='A') - B = tvm.placeholder((k, k), name='B') - - T = tvm.compute((m, m), lambda i, j: A[i][j]*B[i][j]) - s = tvm.create_schedule(T.op) + n = te.size_var('n') + k = te.size_var('k') + m = te.size_var('m') + A = te.placeholder((n, n), name='A') + B = te.placeholder((k, k), name='B') + + T = te.compute((m, m), lambda i, j: A[i][j]*B[i][j]) + s = te.create_schedule(T.op) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) ctx = tvm.cpu(0) @@ 
-390,14 +391,14 @@ def test_in_bounds_tensors_with_same_shapes2D_llvm(): @pytest.mark.xfail def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape): - n = tvm.size_var('n') - k = tvm.size_var('k') - m = tvm.size_var('m') - A = tvm.placeholder((n, n), name='A') - B = tvm.placeholder((k, k), name='B') - - T = tvm.compute((m, m), lambda i, j: A[i][j]*B[i][j]) - s = tvm.create_schedule(T.op) + n = te.size_var('n') + k = te.size_var('k') + m = te.size_var('m') + A = te.placeholder((n, n), name='A') + B = te.placeholder((k, k), name='B') + + T = te.compute((m, m), lambda i, j: A[i][j]*B[i][j]) + s = te.create_schedule(T.op) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) ctx = tvm.cpu(0) @@ -409,14 +410,14 @@ def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape f(a, b, t) def test_in_bounds_tensors_with_same_shapes3D_llvm(): - n = tvm.size_var('n') - k = tvm.size_var('k') - m = tvm.size_var('m') - A = tvm.placeholder((n, n, n), name='A') - B = tvm.placeholder((k, k, k), name='B') - - T = tvm.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p]) - s = tvm.create_schedule(T.op) + n = te.size_var('n') + k = te.size_var('k') + m = te.size_var('m') + A = te.placeholder((n, n, n), name='A') + B = te.placeholder((k, k, k), name='B') + + T = te.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p]) + s = te.create_schedule(T.op) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) ctx = tvm.cpu(0) @@ -429,14 +430,14 @@ def test_in_bounds_tensors_with_same_shapes3D_llvm(): @pytest.mark.xfail def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape): - n = tvm.size_var('n') - k = tvm.size_var('k') - m = tvm.size_var('m') - A = tvm.placeholder((n, n, n), name='A') - B = tvm.placeholder((k, k, k), name='B') - - T = tvm.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p]) - s = tvm.create_schedule(T.op) + n = te.size_var('n') + k = te.size_var('k') + m = te.size_var('m') + A = te.placeholder((n, n, n), name='A') + B = te.placeholder((k, k, k), name='B') + + T = te.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p]) + s = te.create_schedule(T.op) lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False) print (lowered_func.body) ctx = tvm.cpu(0) @@ -452,12 +453,12 @@ def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm(): if not tvm.runtime.enabled("llvm"): return n = 64 - A = tvm.placeholder((n, ), name='A') - scale = tvm.placeholder((), name='scale') - k = tvm.reduce_axis((0, n), name="k") - C = tvm.compute((), lambda : tvm.sum(A[k + k + k] * scale, axis=k), name="C") - D = tvm.compute((), lambda : C + 1) - s = tvm.create_schedule(D.op) + A = te.placeholder((n, ), name='A') + scale = te.placeholder((), name='scale') + k = te.reduce_axis((0, n), name="k") + C = te.compute((), lambda : te.sum(A[k + k + k] * scale, axis=k), name="C") + D = te.compute((), lambda : C + 1) + s = te.create_schedule(D.op) stmt = tvm.lower (s, [A, scale, D], simple_mode=True) print (stmt) # build and invoke the kernel. 
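Every hunk in this bound-checker test file applies the same mechanical rename, so the migrated pattern can be summarized in one minimal sketch. Illustrative only, not part of the patch; it uses only APIs that appear in the hunks above (te.placeholder/compute/create_schedule, tvm.target.build_config, tvm.nd, tvm.testing):

    import numpy as np
    import tvm
    from tvm import te

    # Build a trivial elementwise kernel under the new te namespace and
    # compile it with bound-check instrumentation enabled.
    n = 21
    A = te.placeholder((n,), name='A')
    B = te.placeholder((n,), name='B')
    T = te.compute((n,), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    with tvm.target.build_config(instrument_bound_checkers=True):
        f = tvm.build(s, [A, B, T], "llvm")
    ctx = tvm.cpu(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
    t = tvm.nd.empty((n,), "float32", ctx)
    f(a, b, t)
    tvm.testing.assert_allclose(t.asnumpy(), a.asnumpy() + b.asnumpy())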
@@ -473,7 +474,7 @@ def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm(): tvm.testing.assert_allclose(d.asnumpy(), d_np) if __name__ == "__main__": - with tvm.build_config(instrument_bound_checkers=True): + with tvm.target.build_config(instrument_bound_checkers=True): # zero scale test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm() # in bound diff --git a/tests/python/unittest/test_pass_combine_context_call.py b/tests/python/unittest/test_pass_combine_context_call.py index ef741a4bff7b..e51d4d874ec9 100644 --- a/tests/python/unittest/test_pass_combine_context_call.py +++ b/tests/python/unittest/test_pass_combine_context_call.py @@ -15,28 +15,29 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_for(): - dev_type = tvm.var("dev_type") + dev_type = te.var("dev_type") def device_context(dev_id): - ctx = tvm.call_extern("handle", "device_context", dev_type, dev_id) + ctx = tvm.tir.call_extern("handle", "device_context", dev_type, dev_id) return tvm.tir.Call( "handle", "tvm_thread_context", [ctx], tvm.tir.Call.Intrinsic, None, 0) - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") A = ib.allocate("float32", n, name="A", scope="global") with ib.for_range(0, n, name="i") as i: - ib.emit(tvm.call_extern + ib.emit(tvm.tir.call_extern ("int32", "fadd", device_context(0), A)) with ib.for_range(0, 10, name="j") as j: - ib.emit(tvm.call_extern + ib.emit(tvm.tir.call_extern ("int32", "fadd", device_context(1), A)) - ib.emit(tvm.call_extern + ib.emit(tvm.tir.call_extern ("int32", "fadd", device_context(0), A)) body = ib.get() - f = tvm.ir_pass.MakeAPI(body, "func", [dev_type, n], 2, True) - f = tvm.ir_pass.CombineContextCall(f) + f = tvm.tir.ir_pass.MakeAPI(body, "func", [dev_type, n], 2, True) + f = tvm.tir.ir_pass.CombineContextCall(f) assert f.body.value.dtype == "handle" assert f.body.body.value.dtype == "handle" diff --git a/tests/python/unittest/test_pass_decorate_device_scope.py b/tests/python/unittest/test_pass_decorate_device_scope.py index b464354e008a..327cfd9ed548 100644 --- a/tests/python/unittest/test_pass_decorate_device_scope.py +++ b/tests/python/unittest/test_pass_decorate_device_scope.py @@ -15,24 +15,25 @@ # specific language governing permissions and limitations # under the License. 
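The context-call hunks above are the same two-step rename seen throughout this patch: builders move to tvm.tir.ir_builder and lowering passes to tvm.tir.ir_pass. A minimal sketch of the new spelling, reusing only names from the test_for hunk (the extern name "fadd" comes from that hunk):

    import tvm
    from tvm import te

    ib = tvm.tir.ir_builder.create()
    n = te.var("n")
    A = ib.allocate("float32", n, name="A", scope="global")
    with ib.for_range(0, n, name="i") as i:
        # extern calls now come from tvm.tir, not the tvm root namespace
        ib.emit(tvm.tir.call_extern("int32", "fadd", A))
    body = ib.get()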
import tvm +from tvm import te def test_decorate_device(): - m = tvm.size_var('m') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') + m = te.size_var('m') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + s = te.create_schedule(A2.op) xo, xi = s[A2].split(A2.op.axis[0], factor=8) s[A1].compute_at(s[A2], xo) s[A1].set_scope("shared") - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt1 = tvm.ir_pass.Simplify(stmt) - stmt2 = tvm.ir_pass.DecorateDeviceScope(stmt1) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + stmt1 = tvm.tir.ir_pass.Simplify(stmt) + stmt2 = tvm.tir.ir_pass.DecorateDeviceScope(stmt1) assert isinstance(stmt2, tvm.tir.AttrStmt) assert stmt2.attr_key == "device_scope" assert stmt1 == stmt2.body diff --git a/tests/python/unittest/test_pass_equal.py b/tests/python/unittest/test_pass_equal.py index 1f5bb9cba9a9..873cb7be447c 100644 --- a/tests/python/unittest/test_pass_equal.py +++ b/tests/python/unittest/test_pass_equal.py @@ -15,39 +15,40 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_equal_expr(): - x = tvm.var('x') - y = tvm.var('y') + x = te.var('x') + y = te.var('y') def func1(): return x + y + 1 def func2(): - return tvm.exp(tvm.truncdiv((x + y + 1) * y, 4)) + return te.exp(tvm.tir.truncdiv((x + y + 1) * y, 4)) - assert tvm.ir_pass.Equal(func1(), func1()) - assert tvm.ir_pass.Equal(func2(), func2()) - assert not tvm.ir_pass.Equal(func2(), func1()) + assert tvm.tir.ir_pass.Equal(func1(), func1()) + assert tvm.tir.ir_pass.Equal(func2(), func2()) + assert not tvm.tir.ir_pass.Equal(func2(), func1()) def test_equal_compute(): - x = tvm.var('x') - y = tvm.var('y') + x = te.var('x') + y = te.var('y') n = 128 - A = tvm.placeholder((n, n), name='A') - B = tvm.placeholder((n, n), name='B') - ii = tvm.var('i') - jj = tvm.var('j') + A = te.placeholder((n, n), name='A') + B = te.placeholder((n, n), name='B') + ii = te.var('i') + jj = te.var('j') def func1(): - k = tvm.reduce_axis((0, n), name='k') - return tvm.sum(A[ii, k] * B[jj, k], axis=k) + k = te.reduce_axis((0, n), name='k') + return te.sum(A[ii, k] * B[jj, k], axis=k) - Ab = tvm.decl_buffer((n,), name='A') - n = tvm.var("n") + Ab = tvm.tir.decl_buffer((n,), name='A') + n = te.var("n") def func2(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() A = ib.buffer_ptr(Ab) with ib.for_range(0, n, name="i") as i: A[i] = A[i] + 1 @@ -56,8 +57,8 @@ def func2(): A[j] = A[j] + 2 return ib.get() - assert tvm.ir_pass.Equal(func1(), func1()) - assert tvm.ir_pass.Equal(func2(), func2()) + assert tvm.tir.ir_pass.Equal(func1(), func1()) + assert tvm.tir.ir_pass.Equal(func2(), func2()) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_hoist_if.py b/tests/python/unittest/test_pass_hoist_if.py index 2eb641b0cd90..f6bdbd6130f4 100644 --- a/tests/python/unittest/test_pass_hoist_if.py +++ b/tests/python/unittest/test_pass_hoist_if.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
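The equality tests above reduce to the same pattern; as a sketch of the new namespacing (constructs taken only from the hunks above):

    import tvm
    from tvm import te

    x = te.var('x')
    y = te.var('y')
    # structural equality of expressions, now under tvm.tir.ir_pass
    assert tvm.tir.ir_pass.Equal(x + y + 1, x + y + 1)
    assert not tvm.tir.ir_pass.Equal(x + y + 1, te.exp(tvm.tir.truncdiv(x, 4)))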
import tvm +from tvm import te var_list = [] @@ -31,7 +32,7 @@ def _visit(op): key = op if isinstance(op, tvm.tir.IfThenElse): global var_list - tvm.ir_pass.PostOrderVisit(op.condition, _extract_vars) + tvm.tir.ir_pass.PostOrderVisit(op.condition, _extract_vars) val = [(op.then_case, op.else_case), ("IfThenElse", tuple(var_list))] var_list.clear() elif isinstance(op, tvm.tir.For): @@ -42,7 +43,7 @@ def _visit(op): return node_dict[key] = val - tvm.ir_pass.PostOrderVisit(stmt, _visit) + tvm.tir.ir_pass.PostOrderVisit(stmt, _visit) for key, val in node_dict.items(): struct[val[1]] = tuple(node_dict[child][1] if child in node_dict else None for child in val[0]) @@ -52,10 +53,10 @@ def _visit(op): var_list.clear() def test_basic(): - ib = tvm.ir_builder.create() - l = tvm.var('l') - m = tvm.var('m') - n = tvm.var('n') + ib = tvm.tir.ir_builder.create() + l = te.var('l') + m = te.var('m') + n = te.var('n') with ib.for_range(0, l, "i") as i: with ib.for_range(0, m, "j") as j: @@ -66,17 +67,17 @@ def test_basic(): ib.emit(tvm.tir.Evaluate(n)) stmt = ib.get() - new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + new_stmt = tvm.tir.ir_pass.HoistIfThenElse(stmt) expected_struct = {('For', 'k'): (None,), ('For', 'j'): (('For', 'k'),), ('IfThenElse', ('i',)): (('For', 'j'), ('For', 'j')), ('For', 'i'): (('IfThenElse', ('i',)),)} verify_structure(new_stmt, expected_struct) def test_no_else(): - ib = tvm.ir_builder.create() - l = tvm.var('l') - m = tvm.var('m') - n = tvm.var('n') + ib = tvm.tir.ir_builder.create() + l = te.var('l') + m = te.var('m') + n = te.var('n') with ib.for_range(0, l, "i") as i: with ib.for_range(0, m, "j") as j: @@ -85,34 +86,34 @@ def test_no_else(): ib.emit(tvm.tir.Evaluate(m)) stmt = ib.get() - new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + new_stmt = tvm.tir.ir_pass.HoistIfThenElse(stmt) expected_struct = {('For', 'k'): (None,), ('For', 'j'): (('For', 'k'),), ('IfThenElse', ('i',)): (('For', 'j'), None), ('For', 'i'): (('IfThenElse', ('i',)),)} verify_structure(new_stmt, expected_struct) def test_attr_stmt(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() dshape = (32, 64) data = ib.pointer("float32", name="data") - l = tvm.var('l') - m = tvm.var('m') - n = tvm.var('n') + l = te.var('l') + m = te.var('m') + n = te.var('n') - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", dshape[0]) ib.scope_attr(bx, "thread_extent", dshape[1]) with ib.for_range(0, l, "i") as i: with ib.for_range(0, m, "j") as j: with ib.for_range(0, n, "k") as k: - with ib.if_scope(tvm.any(i < 4, j >= 8)): + with ib.if_scope(tvm.tir.any(i < 4, j >= 8)): data[bx * j + tx * j * k] = data[bx * j + tx * j * k] + 0.5 with ib.else_scope(): data[bx * j + tx * j * k] = data[bx * j + tx * j * k] + 1.0 stmt = ib.get() - new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + new_stmt = tvm.tir.ir_pass.HoistIfThenElse(stmt) expected_struct = {('For', 'k'): (None,), ('IfThenElse', ('i', 'j')): (('For', 'k'), ('For', 'k')), ('For', 'j'): (('IfThenElse', ('i', 'j')),), ('For', 'i'): (('For', 'j'),), ('AttrStmt', 'thread_extent', 64): (('For', 'i'),), @@ -120,7 +121,7 @@ def test_attr_stmt(): verify_structure(new_stmt, expected_struct) def test_nested_for(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.pointer("float32", name="data") @@ -130,22 +131,22 @@ def test_nested_for(): data[i * 3 + j] = data[i * 3 + j] + 0.5 with ib.for_range(0, 15, "k") as k: with 
ib.for_range(0, 20, "l") as l: - with ib.if_scope(tvm.any(i < 4, j >= 8)): + with ib.if_scope(tvm.tir.any(i < 4, j >= 8)): data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 2 with ib.else_scope(): data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 1.5 stmt = ib.get() - new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + new_stmt = tvm.tir.ir_pass.HoistIfThenElse(stmt) expected_struct = {('IfThenElse', ('i', 'j')): (None, None), ('For', 'l'): (('IfThenElse', ('i', 'j')),), ('For', 'k'): (('For', 'l'),), ('For', 'j'): (None,), ('IfThenElse', ('i',)): (('For', 'j'), None), ('For', 'i'): (('IfThenElse', ('i',)),)} verify_structure(new_stmt, expected_struct) def test_if_block(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.pointer("float32", name="data") - n = tvm.var("n") + n = te.var("n") with ib.for_range(0, 5, "i") as i: @@ -154,7 +155,7 @@ def test_if_block(): data[i * 3 + j] = data[i * 3 + j] + 0.5 with ib.for_range(0, 15, "k") as k: with ib.for_range(0, 20, "l") as l: - with ib.if_scope(tvm.any(i < 4, j >= 8)): + with ib.if_scope(tvm.tir.any(i < 4, j >= 8)): data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 2 with ib.else_scope(): data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 1.5 @@ -169,7 +170,7 @@ def test_if_block(): data[i * 3 + j + k] = data[i * 3 + j + k] + 0.6 stmt = ib.get() - new_stmt = tvm.ir_pass.HoistIfThenElse(stmt) + new_stmt = tvm.tir.ir_pass.HoistIfThenElse(stmt) expected_struct = {('IfThenElse', ('i', 'j')): (None, None), ('IfThenElse', ('j',)): (None, None), ('For', 'l'): (None,), ('For', 'k'): (None,), ('For', 'j'): (('For', 'j'),), ('IfThenElse', ('i',)): (('For', 'j'), None), ('For', 'i'): (('IfThenElse', ('i',)),), diff --git a/tests/python/unittest/test_pass_inject_copy_intrin.py b/tests/python/unittest/test_pass_inject_copy_intrin.py index f49388db3eb2..8c34e344d73e 100644 --- a/tests/python/unittest/test_pass_inject_copy_intrin.py +++ b/tests/python/unittest/test_pass_inject_copy_intrin.py @@ -15,102 +15,103 @@ # specific language governing permissions and limitations # under the License. 
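For the hoisting tests above, the essential shape of a case the pass acts on, spelled with the new namespaces (a minimal sketch; the constant 4 mirrors the conditions used in those tests):

    import tvm
    from tvm import te

    ib = tvm.tir.ir_builder.create()
    l = te.var('l')
    m = te.var('m')
    with ib.for_range(0, l, "i") as i:
        with ib.for_range(0, m, "j") as j:
            # the condition only involves i, so the If can be hoisted out of j
            with ib.if_scope(i < 4):
                ib.emit(tvm.tir.Evaluate(m))
    new_stmt = tvm.tir.ir_pass.HoistIfThenElse(ib.get())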
import tvm +from tvm import te def test_copy2d(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - B = tvm.compute((m, l), lambda i, j: A[i, j], name='B') - s = tvm.create_schedule(B.op) + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + B = te.compute((m, l), lambda i, j: A[i, j], name='B') + s = te.create_schedule(B.op) s[B].pragma(B.op.axis[0], "memcpy") - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) def cb(src, dst, pad_before, pad_after, pad_value): assert dst.strides[0] == l assert dst.strides[1].value == 1 assert src.strides[0] == l assert tuple(src.shape) == (m, l) return tvm.tir.Evaluate(0) - stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) + stmt = tvm.tir.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) def test_copy_pad(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - B = tvm.compute((m + 2, l), lambda i, j: - tvm.if_then_else(tvm.all(i >= 1, i < m + 1), + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + B = te.compute((m + 2, l), lambda i, j: + tvm.tir.if_then_else(tvm.tir.all(i >= 1, i < m + 1), A[i - 1, j], 1.0), name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s[B].pragma(B.op.axis[0], "memcpy") - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) def cb(src, dst, pad_before, pad_after, pad_value): - assert tvm.ir_pass.Simplify(src.elem_offset).value == 0 + assert tvm.tir.ir_pass.Simplify(src.elem_offset).value == 0 assert pad_before[0].value == 1 assert pad_before[1].value == 0 assert pad_after[0].value == 1 assert pad_after[1].value == 0 assert pad_value.value == 1.0 return tvm.tir.Evaluate(0) - stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) + stmt = tvm.tir.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) def test_single_point_test(): - A = tvm.placeholder((1,), name='A') - B = tvm.compute((1,), lambda i: + A = te.placeholder((1,), name='A') + B = te.compute((1,), lambda i: A[i], name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s[B].pragma(B.op.axis[0], "memcpy") - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) def cb(src, dst, pad_before, 
pad_after, pad_value): - assert tvm.ir_pass.Simplify(src.elem_offset).value == 0 - assert tvm.ir_pass.Simplify(dst.elem_offset).value == 0 - assert tvm.ir_pass.Simplify(src.strides[0]).value == 1 - assert tvm.ir_pass.Simplify(dst.strides[0]).value == 1 + assert tvm.tir.ir_pass.Simplify(src.elem_offset).value == 0 + assert tvm.tir.ir_pass.Simplify(dst.elem_offset).value == 0 + assert tvm.tir.ir_pass.Simplify(src.strides[0]).value == 1 + assert tvm.tir.ir_pass.Simplify(dst.strides[0]).value == 1 return tvm.tir.Evaluate(0) - stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) + stmt = tvm.tir.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) def assert_expr_equal(a, b): - assert tvm.ir_pass.Simplify(a - b).value == 0 + assert tvm.tir.ir_pass.Simplify(a - b).value == 0 def test_copy_pad_split(): m = 4 * 3 - A = tvm.placeholder((m, ), name="A") - Apad = tvm.compute((m + 2,), lambda i: - tvm.if_then_else(tvm.all(i >= 1, i <= m), + A = te.placeholder((m, ), name="A") + Apad = te.compute((m + 2,), lambda i: + tvm.tir.if_then_else(tvm.tir.all(i >= 1, i <= m), A[i - 1], 0.0), "Apad") - B = tvm.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2]) - s = tvm.create_schedule(B.op) + B = te.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2]) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=4) s[Apad].compute_at(s[B], xo) s[Apad].pragma(s[Apad].op.axis[0], "memcpy") - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) def cb(src, dst, pad_before, pad_after, pad_value): assert(dst.elem_offset.value == 0) - assert_expr_equal(src.elem_offset, tvm.max(xo * 4, 1) - 1) + assert_expr_equal(src.elem_offset, tvm.te.max(xo * 4, 1) - 1) - rpad_before = tvm.max(1 - xo * 4, 0) - rpad_after = tvm.max(xo * 4 - 7, 0) + rpad_before = tvm.te.max(1 - xo * 4, 0) + rpad_after = tvm.te.max(xo * 4 - 7, 0) assert_expr_equal(pad_before[0], rpad_before) assert_expr_equal(pad_after[0], rpad_after) assert_expr_equal(src.shape[0], 6 - rpad_before - rpad_after) return tvm.tir.Evaluate(0) - stmt = tvm.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) + stmt = tvm.tir.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_inject_double_buffer.py b/tests/python/unittest/test_pass_inject_double_buffer.py index cf8f78c8090d..0fe3f614796b 100644 --- a/tests/python/unittest/test_pass_inject_double_buffer.py +++ b/tests/python/unittest/test_pass_inject_double_buffer.py @@ -15,13 +15,14 @@ # specific language governing permissions and limitations # under the License. 
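The copy-intrin hunks all share one skeleton; restated as a compact sketch under the new names (this mirrors test_copy2d above):

    import tvm
    from tvm import te

    m, l = te.var('m'), te.var('l')
    A = te.placeholder((m, l), name='A')
    B = te.compute((m, l), lambda i, j: A[i, j], name='B')
    s = te.create_schedule(B.op)
    s[B].pragma(B.op.axis[0], "memcpy")
    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
    Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)

    def cb(src, dst, pad_before, pad_after, pad_value):
        # the callback receives buffer views of the copy; it returns the
        # lowered statement (a no-op Evaluate here, for illustration)
        return tvm.tir.Evaluate(0)

    stmt = tvm.tir.ir_pass.InjectCopyIntrin(stmt, "memcpy", cb)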
import tvm +from tvm import te def test_double_buffer(): dtype = 'int64' n = 100 m = 4 - tx = tvm.thread_axis("threadIdx.x") - ib = tvm.ir_builder.create() + tx = te.thread_axis("threadIdx.x") + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") C = ib.pointer("float32", name="C") ib.scope_attr(tx, "thread_extent", 1) @@ -35,17 +36,17 @@ def test_double_buffer(): C[j] = B[j] + 1 stmt = ib.get() - stmt = tvm.ir_pass.InjectDoubleBuffer(stmt, 2) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.InjectDoubleBuffer(stmt, 2) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert isinstance(stmt.body.body, tvm.tir.Allocate) assert stmt.body.body.extents[0].value == 2 - f = tvm.ir_pass.MakeAPI(stmt, "db", [A.asobject(), C.asobject()], 2, True) - f = tvm.ir_pass.ThreadSync(f, "shared") + f = tvm.tir.ir_pass.MakeAPI(stmt, "db", [A.asobject(), C.asobject()], 2, True) + f = tvm.tir.ir_pass.ThreadSync(f, "shared") count = [0] def count_sync(op): if isinstance(op, tvm.tir.Call) and op.name == "tvm_storage_sync": count[0] += 1 - tvm.ir_pass.PostOrderVisit(f.body, count_sync) + tvm.tir.ir_pass.PostOrderVisit(f.body, count_sync) assert count[0] == 4 diff --git a/tests/python/unittest/test_pass_inject_vthread.py b/tests/python/unittest/test_pass_inject_vthread.py index 08e261b68f6d..8fbd8295d238 100644 --- a/tests/python/unittest/test_pass_inject_vthread.py +++ b/tests/python/unittest/test_pass_inject_vthread.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_vthread(): dtype = 'int64' @@ -22,9 +23,9 @@ def test_vthread(): m = 4 nthread = 2 def get_vthread(name): - tx = tvm.thread_axis(name) - ty = tvm.thread_axis(name) - ib = tvm.ir_builder.create() + tx = te.thread_axis(name) + ty = te.thread_axis(name) + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") C = ib.pointer("float32", name="C") with ib.for_range(0, n) as i: @@ -32,16 +33,16 @@ def get_vthread(name): ib.scope_attr(ty, "virtual_thread", nthread) B = ib.allocate("float32", m, name="B", scope="shared") B[i] = A[i * nthread + tx] - bbuffer = tvm.decl_buffer((m,), dtype=B.dtype, data=B.asobject()) - ib.emit(tvm.call_extern("int32", "Run", + bbuffer = tvm.tir.decl_buffer((m,), dtype=B.dtype, data=B.asobject()) + ib.emit(tvm.tir.call_extern("int32", "Run", bbuffer.access_ptr("r"), - tvm.call_pure_intrin("int32", "tvm_context_id"))) + tvm.tir.call_pure_intrin("int32", "tvm_context_id"))) C[i * nthread + tx] = B[i] + 1 return ib.get() - stmt = tvm.ir_pass.InjectVirtualThread(get_vthread("vthread")) + stmt = tvm.tir.ir_pass.InjectVirtualThread(get_vthread("vthread")) assert stmt.body.body.extents[0].value == 2 - stmt = tvm.ir_pass.InjectVirtualThread(get_vthread("cthread")) + stmt = tvm.tir.ir_pass.InjectVirtualThread(get_vthread("cthread")) assert len(stmt.body.body.extents) == 3 @@ -51,35 +52,35 @@ def test_vthread_extern(): m = 4 nthread = 2 def get_vthread(name): - tx = tvm.thread_axis(name) - ty = tvm.thread_axis(name) - ib = tvm.ir_builder.create() + tx = te.thread_axis(name) + ty = te.thread_axis(name) + ib = tvm.tir.ir_builder.create() with ib.for_range(0, n) as i: ib.scope_attr(tx, "virtual_thread", nthread) ib.scope_attr(ty, "virtual_thread", nthread) A = ib.allocate("float32", m, name="A", scope="shared") B = ib.allocate("float32", m, name="B", scope="shared") C = ib.allocate("float32", m, name="C", scope="shared") - cbuffer = tvm.decl_buffer((m,), dtype=C.dtype, data=C.asobject()) - abuffer = tvm.decl_buffer((m,), 
dtype=A.dtype, data=A.asobject()) - bbuffer = tvm.decl_buffer((m,), dtype=B.dtype, data=B.asobject()) + cbuffer = tvm.tir.decl_buffer((m,), dtype=C.dtype, data=C.asobject()) + abuffer = tvm.tir.decl_buffer((m,), dtype=A.dtype, data=A.asobject()) + bbuffer = tvm.tir.decl_buffer((m,), dtype=B.dtype, data=B.asobject()) A[tx] = tx + 1.0 B[ty] = ty + 1.0 - ib.emit(tvm.call_extern("int32", "Run", + ib.emit(tvm.tir.call_extern("int32", "Run", abuffer.access_ptr("r"), bbuffer.access_ptr("r"), cbuffer.access_ptr("rw"))) return ib.get() - stmt = tvm.ir_pass.InjectVirtualThread(get_vthread("vthread")) + stmt = tvm.tir.ir_pass.InjectVirtualThread(get_vthread("vthread")) assert stmt.body.body.extents[0].value == 2 assert stmt.body.body.body.body.body.body.extents[0].value == 2 assert len(stmt.body.body.body.body.body.body.extents) == 3 def test_vthread_if_then_else(): nthread = 2 - tx = tvm.thread_axis("vthread") - ib = tvm.ir_builder.create() + tx = te.thread_axis("vthread") + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 100) as i: ib.scope_attr(tx, "virtual_thread", nthread) @@ -91,7 +92,7 @@ def test_vthread_if_then_else(): with ib.if_scope(i == 0): B[i] = A[i * nthread + tx] + 2 stmt = ib.get() - stmt = tvm.ir_pass.InjectVirtualThread(stmt) + stmt = tvm.tir.ir_pass.InjectVirtualThread(stmt) assert stmt.body.body.body[0].else_case != None assert stmt.body.body.body[1].else_case == None diff --git a/tests/python/unittest/test_pass_inline.py b/tests/python/unittest/test_pass_inline.py index 521a6f99e026..ad0591d3a7c1 100644 --- a/tests/python/unittest/test_pass_inline.py +++ b/tests/python/unittest/test_pass_inline.py @@ -15,37 +15,38 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_inline(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') - T = tvm.compute((m,), lambda i,: A[i] + 10, name='T') + m = te.size_var('m') + A = te.placeholder((m,), name='A') + T = te.compute((m,), lambda i,: A[i] + 10, name='T') stmt = tvm.tir.Evaluate(T[10] + 11 * T[100]) - stmt = tvm.ir_pass.Inline( + stmt = tvm.tir.ir_pass.Inline( stmt, T.op, [x.var for x in T.op.axis], T.op.body[0]) print(stmt) - assert(tvm.ir_pass.VerifySSA(stmt)) + assert(tvm.tir.ir_pass.VerifySSA(stmt)) try: # pass in int array(wrong argument type) # must raise an error - stmt = tvm.ir_pass.Inline( + stmt = tvm.tir.ir_pass.Inline( T.op, [1,2,3], T.op.body, stmt) assert False except tvm.error.TVMError: pass def test_inline2(): - m = tvm.size_var('m') - A = tvm.placeholder((m,), name='A') - T = tvm.compute((m,), lambda i,: A[i] + 10, name='T') - stmt = tvm.tir.Evaluate(tvm.exp(T[10]) + 11 * T[100]) - stmt = tvm.ir_pass.Inline( + m = te.size_var('m') + A = te.placeholder((m,), name='A') + T = te.compute((m,), lambda i,: A[i] + 10, name='T') + stmt = tvm.tir.Evaluate(te.exp(T[10]) + 11 * T[100]) + stmt = tvm.tir.ir_pass.Inline( stmt, T.op, [x.var for x in T.op.axis], T.op.body[0]) def check(op): if isinstance(op, tvm.tir.Call): assert op.func != T.op - tvm.ir_pass.PostOrderVisit(stmt, check) + tvm.tir.ir_pass.PostOrderVisit(stmt, check) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_ir_transform.py b/tests/python/unittest/test_pass_ir_transform.py index b024a3c8d5b9..cb7417a7a54f 100644 --- a/tests/python/unittest/test_pass_ir_transform.py +++ b/tests/python/unittest/test_pass_ir_transform.py @@ -15,28 +15,29 @@ # specific language governing permissions and limitations # under the License. 
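test_inline above likewise only changes spellings; the migrated call sequence, as a sketch drawn directly from that hunk:

    import tvm
    from tvm import te

    m = te.size_var('m')
    A = te.placeholder((m,), name='A')
    T = te.compute((m,), lambda i: A[i] + 10, name='T')
    stmt = tvm.tir.Evaluate(T[10] + 11 * T[100])
    # inline T's body into the consuming statement, then check SSA form
    stmt = tvm.tir.ir_pass.Inline(stmt, T.op, [x.var for x in T.op.axis], T.op.body[0])
    assert tvm.tir.ir_pass.VerifySSA(stmt)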
import tvm +from tvm import te def test_ir_transform(): - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: - x = tvm.call_extern("int32", "TestA", i * 3 + j * 1) - ib.emit(tvm.call_extern("int32", "TestB", x)) - ib.emit(tvm.call_extern("int32", "TestC", x)) + x = tvm.tir.call_extern("int32", "TestA", i * 3 + j * 1) + ib.emit(tvm.tir.call_extern("int32", "TestB", x)) + ib.emit(tvm.tir.call_extern("int32", "TestC", x)) body = ib.get() def preorder(op): if op.name == "TestC": - return tvm.const(0, "int32") + return tvm.tir.const(0, "int32") return None def postorder(op): assert isinstance(op, tvm.tir.Call) if op.name == "TestA": - return tvm.call_extern("int32", "TestB", op.args[0] + 1) + return tvm.tir.call_extern("int32", "TestB", op.args[0] + 1) return op - body = tvm.ir_pass.IRTransform(body, preorder, postorder, ["Call"]) + body = tvm.tir.ir_pass.IRTransform(body, preorder, postorder, ["Call"]) stmt_list = tvm.tir.stmt_list(body.body.body) assert stmt_list[0].value.args[0].name == "TestB" assert stmt_list[1].value.value == 0 diff --git a/tests/python/unittest/test_pass_lift_attr_scope.py b/tests/python/unittest/test_pass_lift_attr_scope.py index 181f4ef57a4f..0831565dc155 100644 --- a/tests/python/unittest/test_pass_lift_attr_scope.py +++ b/tests/python/unittest/test_pass_lift_attr_scope.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_coproc_lift(): - ib = tvm.ir_builder.create() - n = tvm.var("n") - cp = tvm.thread_axis((0, 1), "cop") + ib = tvm.tir.ir_builder.create() + n = te.var("n") + cp = te.thread_axis((0, 1), "cop") value = tvm.tir.StringImm("xxx") A = ib.allocate("float32", n, name="A", scope="global") @@ -34,11 +35,11 @@ def test_coproc_lift(): A[j] = A[j] + 3 A[j] = A[j] + 3 body = ib.get() - body = tvm.ir_pass.LiftAttrScope(body, "coproc_uop_scope") + body = tvm.tir.ir_pass.LiftAttrScope(body, "coproc_uop_scope") assert body.body.body.node == cp # only able to lift to the common pattern of the last two fors. - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() A = ib.allocate("float32", n, name="A", scope="global") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: @@ -51,7 +52,7 @@ def test_coproc_lift(): A[i] = A[i] + 2 body = ib.get() - body = tvm.ir_pass.LiftAttrScope(body, "coproc_uop_scope") + body = tvm.tir.ir_pass.LiftAttrScope(body, "coproc_uop_scope") assert body.body.body.body[1].node == cp assert len(body.body.body.body) == 2 diff --git a/tests/python/unittest/test_pass_loop_partition.py b/tests/python/unittest/test_pass_loop_partition.py index e9df98e43d79..7ec35e618aa3 100644 --- a/tests/python/unittest/test_pass_loop_partition.py +++ b/tests/python/unittest/test_pass_loop_partition.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. 
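The IRTransform hunk keeps the visitor contract intact; only the module paths move. A minimal sketch of the rewritten form (TestA/TestB are the extern names from that hunk):

    import tvm
    from tvm import te

    ib = tvm.tir.ir_builder.create()
    n = te.var("n")
    with ib.for_range(0, n, name="i") as i:
        ib.emit(tvm.tir.call_extern("int32", "TestA", i))
    body = ib.get()

    def preorder(op):
        return None  # no rewrite on the way down

    def postorder(op):
        # rewrite TestA calls into TestB calls on the way back up
        if op.name == "TestA":
            return tvm.tir.call_extern("int32", "TestB", op.args[0] + 1)
        return op

    body = tvm.tir.ir_pass.IRTransform(body, preorder, postorder, ["Call"])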
import tvm +from tvm import te import numpy def collect_visit(stmt, f): ret = [] - tvm.ir_pass.PostOrderVisit(stmt, lambda x : ret.append(f(x))) + tvm.tir.ir_pass.PostOrderVisit(stmt, lambda x : ret.append(f(x))) return ret def find_top_produce(stmt): @@ -27,65 +28,65 @@ def f(x, ret): if isinstance(x, tvm.tir.ProducerConsumer): ret.append(x) ret = [] - tvm.ir_pass.PostOrderVisit(stmt, lambda x : f(x, ret)) + tvm.tir.ir_pass.PostOrderVisit(stmt, lambda x : f(x, ret)) return ret[-1] def lower(sch, args): binds = {} arg_list = [] for x in args: - if isinstance(x, tvm.tensor.Tensor): - buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name) + if isinstance(x, te.tensor.Tensor): + buf = tvm.tir.decl_buffer(x.shape, dtype=x.dtype, name=x.name) assert x not in binds binds[x] = buf arg_list.append(buf) else: raise ValueError("args must be Tensor, Buffer or Var") sch = sch.normalize() - bounds = tvm.schedule.InferBound(sch) - stmt = tvm.schedule.ScheduleOps(sch, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.VectorizeLoop(stmt) - stmt = tvm.ir_pass.Simplify(stmt) + bounds = tvm.te.schedule.InferBound(sch) + stmt = tvm.te.schedule.ScheduleOps(sch, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, binds, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) return stmt def test_basic(): - n = tvm.size_var('n') - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + n = te.size_var('n') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i]+B[i]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i]+B[i]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.Simplify(stmt) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body.body.body[0])) assert('if' in str(stmt.body.body.body[1])) def test_const_loop(): n = 21 - A = tvm.placeholder((n, ), name='A') - B = tvm.placeholder((n, ), name='B') + A = te.placeholder((n, ), name='A') + B = te.placeholder((n, ), name='B') - T = tvm.compute((n, ), lambda i: A[i]+B[i]) - s = tvm.create_schedule(T.op) + T = te.compute((n, ), lambda i: A[i]+B[i]) + s = te.create_schedule(T.op) xo, xi = s[T].split(T.op.axis[0], factor=4) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, True) - stmt = tvm.ir_pass.Simplify(stmt) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, True) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body.body.body[0])) def test_multi_loop(): - ib = tvm.ir_builder.create() - m = tvm.size_var('m') - n = tvm.size_var('n') + ib = tvm.tir.ir_builder.create() + m = te.size_var('m') + n = te.size_var('n') with ib.for_range(0, 4, "i") as i: with ib.for_range(0, n, "j") as j: with ib.for_range(0, m, "k") as k: @@ -94,14 +95,14 @@ def test_multi_loop(): with ib.else_scope(): 
ib.emit(tvm.tir.Evaluate(n)) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse)))) def test_multi_if(): - ib = tvm.ir_builder.create() - m = tvm.size_var('m') - n = tvm.size_var('n') + ib = tvm.tir.ir_builder.create() + m = te.size_var('m') + n = te.size_var('n') with ib.for_range(0, 4, 'i') as i: with ib.for_range(0, n, 'j') as j: with ib.for_range(0, m, 'k') as k: @@ -114,45 +115,45 @@ def test_multi_if(): with ib.else_scope(): ib.emit(tvm.tir.Evaluate(n)) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body[0])) def test_thread_axis(): - m = tvm.size_var('m') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') - B = tvm.compute((m, l), lambda i, j: A[i, j] + 3, name='B') - s = tvm.create_schedule(B.op) + m = te.size_var('m') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') + B = te.compute((m, l), lambda i, j: A[i, j] + 3, name='B') + s = te.create_schedule(B.op) s[B].set_scope("shared") num_thread = 16 xo, xi = s[B].split(B.op.axis[0], 32) xi0, xi1 = s[B].split(xi, nparts=num_thread) - s[B].bind(xi0, tvm.thread_axis("threadIdx.x")) + s[B].bind(xi0, te.thread_axis("threadIdx.x")) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.Simplify(stmt) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert('if' not in str(stmt.body.body.body[0])) def test_vectorize(): - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - bias = tvm.size_var("bias", dtype="float32") - scale = tvm.size_var("scale", dtype="float32") - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name='C') + n = te.size_var('n') + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + bias = te.size_var("bias", dtype="float32") + scale = te.size_var("scale", dtype="float32") + C = te.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name='C') # schedule - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) # create iter var and assign them tags. 
num_thread = 32 bx, x = s[C].split(C.op.axis[0], factor=num_thread*4) tx, x = s[C].split(x, nparts=num_thread) _, x = s[C].split(x, factor=4) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) s[C].vectorize(x) stmt = lower(s, [A, B]) body = stmt.body.body.body.body.body @@ -160,135 +161,135 @@ def test_vectorize(): assert(any(collect_visit(body.then_case, lambda x: isinstance(x, tvm.tir.Ramp)))) def test_condition(): - ib = tvm.ir_builder.create() - m = tvm.size_var('m') - n = tvm.size_var('n') - with ib.for_range(0, tvm.truncdiv(n+3,4), 'i') as i: + ib = tvm.tir.ir_builder.create() + m = te.size_var('m') + n = te.size_var('n') + with ib.for_range(0, tvm.tir.truncdiv(n+3,4), 'i') as i: with ib.for_range(0, 4, 'j') as j: ib.emit(tvm.tir.Evaluate( tvm.tir.Select(ib.likely(i*4+j < n), m, n))) with ib.for_range(0, 16, 'ow') as ow: with ib.for_range(0, 3, 'kw') as kw: with ib.if_scope(ib.likely(ow > 0)): with ib.if_scope(ib.likely(ow < 15)): - out[ow] = tvm.max(out[ow], data[ow + kw - 1]) + out[ow] = tvm.te.max(out[ow], data[ow + kw - 1]) with ib.for_range(0, 16, 'ow') as ow: with ib.for_range(0, 3, 'kw') as kw: with ib.if_scope(ib.likely(ow < 1)): with ib.if_scope(ib.likely(kw > 0)): - out[ow] = tvm.max(out[ow], data[ow + kw - 1]) + out[ow] = tvm.te.max(out[ow], data[ow + kw - 1]) with ib.for_range(0, 16, 'ow') as ow: with ib.for_range(0, 3, 'kw') as kw: with ib.if_scope(ib.likely(ow > 14)): with ib.if_scope(ib.likely(kw < 2)): - out[ow] = tvm.max(out[ow], data[ow + kw - 1]) + out[ow] = tvm.te.max(out[ow], data[ow + kw - 1]) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt, True) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, True) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))) def test_cce_loop_1(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() dtype = 'float16' n = 514 m = 514 - _A = tvm.placeholder((n*m,), name = 'A') - Ab = tvm.decl_buffer((n*m,), dtype, name="A") + _A = te.placeholder((n*m,), name = 'A') + Ab = tvm.tir.decl_buffer((n*m,), dtype, name="A") A = ib.buffer_ptr(Ab) - _B = tvm.placeholder((n*m,), name = 'B') - Bb = tvm.decl_buffer((n*m,), dtype, name="B") + _B = te.placeholder((n*m,), name = 'B') + Bb = tvm.tir.decl_buffer((n*m,), dtype, name="B") B = ib.buffer_ptr(Bb) #for i in 0 to n-1: with ib.for_range(0, 11, name="i") as i: @@ -296,12 +297,12 @@ def test_cce_loop_1(): with ib.if_scope(ib.likely(((i*160) + j) < 1600)): A[(i+1)*m+j+1] = B[(i)*m+j+1] + B[(i+1)*m+j+1] + B[(i+2)*m+j+1] stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt, True) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, True) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))) def test_cce_loop_2(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() len = 112 tile = 32 loop = (len + tile - 1) // tile @@ -309,19 +310,19 @@ def test_cce_loop_2(): head = i * tile with ib.if_scope(ib.likely(head + tile > len)): tail = len - ib.emit(tvm.call_extern('float32', "cce_intrisic", head, tail)) + ib.emit(tvm.tir.call_extern('float32', "cce_intrisic", head, tail)) with ib.else_scope(): tail = head + tile - ib.emit(tvm.call_extern('float32', "cce_intrisic", head, tail)) + ib.emit(tvm.tir.call_extern('float32', "cce_intrisic", head, tail)) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt, True) - stmt = tvm.ir_pass.Simplify(stmt) + stmt =
tvm.tir.ir_pass.LoopPartition(stmt, True) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))) def test_cce_loop_3(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() loop1 = 4 loop2 = 9998 tile = 39991 @@ -330,11 +331,11 @@ def test_cce_loop_3(): head1 = i head2 = j with ib.if_scope(ib.likely(head1*loop1 + head2 < tile)): - ib.emit(tvm.call_extern('float16',"cce_intrisic",head1)) + ib.emit(tvm.tir.call_extern('float16',"cce_intrisic",head1)) stmt = ib.get() - stmt = tvm.ir_pass.LoopPartition(stmt,True) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.LoopPartition(stmt,True) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))) def test_conv_tiling(): @@ -345,33 +346,33 @@ def test_conv_tiling(): batch_size = 1 in_height = in_width = 64 out_height = out_width = in_height - kernel_height + 1 - data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data') - kernel = tvm.placeholder((kernel_height, kernel_width, in_channel, + data = te.placeholder((batch_size, in_channel, in_height, in_width), name='data') + kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name='kernel') - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - conv = tvm.compute((batch_size, out_channel, out_height, out_width), - lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * + ic = te.reduce_axis((0, in_channel), name='ic') + kh = te.reduce_axis((0, kernel_height), name='kh') + kw = te.reduce_axis((0, kernel_width), name='kw') + conv = te.compute((batch_size, out_channel, out_height, out_width), + lambda n, oc, oh, ow: te.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw]), name="conv2d") - s = tvm.create_schedule(conv.op) + s = te.create_schedule(conv.op) n, oc, oh, ow = conv.op.axis oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, True) - stmt = tvm.ir_pass.Simplify(stmt) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, True) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse)))) def test_multilevel_splitting_with_indivisble_factors(): import topi - A = tvm.placeholder((130,), dtype="float32") + A = te.placeholder((130,), dtype="float32") B = topi.nn.relu(A) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) (y,) = s[B].op.axis (yo, yi) = s[B].split(y, factor=8) (yoo, yoi) = s[B].split(yo, factor=16) @@ -379,7 +380,7 @@ def test_multilevel_splitting_with_indivisble_factors(): s[B].unroll(yi) ## But this does the right thing. 
- with tvm.build_config(partition_const_loop=True): + with tvm.target.build_config(partition_const_loop=True): lowered_body = tvm.lower(s, [A, B]).body def visit_stmt(op): return(isinstance(op, tvm.tir.Max)) @@ -390,17 +391,17 @@ def visit_stmt(op): def test_double_splitting_with_indivisible_factors(): m = 48 dtype="float32" - A = tvm.placeholder((m,), name='A', dtype=dtype) - C = tvm.compute((m,), lambda i: A[i], name='C') - D = tvm.compute((m,), lambda i: C[i], name='D') + A = te.placeholder((m,), name='A', dtype=dtype) + C = te.compute((m,), lambda i: A[i], name='C') + D = te.compute((m,), lambda i: C[i], name='D') - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) co, ci = s[C].split(C.op.axis[0], factor=10) do, di = s[D].split(D.op.axis[0], 32) s[C].compute_at(s[D], do) target = 'llvm' - with tvm.build_config(partition_const_loop=True): + with tvm.target.build_config(partition_const_loop=True): f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) func = tvm.build(f, target=target) @@ -420,30 +421,30 @@ def test_double_splitting_with_indivisible_factors(): def test_simple_rfactor(): K = 16*4+4 - k = tvm.reduce_axis((0, K), 'k') + k = te.reduce_axis((0, K), 'k') - A = tvm.placeholder((1, K), name='A') + A = te.placeholder((1, K), name='A') - B = tvm.compute( (1,), lambda b: - tvm.sum(A[b, k], axis=k), + B = te.compute( (1,), lambda b: + te.sum(A[b, k], axis=k), name='B' ) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) ko, _ = s[B].split(s[B].op.reduce_axis[0], 16) BF = s.rfactor(B, ko, 0) s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) - stmt1 = tvm.schedule.ScheduleOps(s, bounds) - stmt1 = tvm.ir_pass.Simplify(stmt1) + stmt1 = tvm.te.schedule.ScheduleOps(s, bounds) + stmt1 = tvm.tir.ir_pass.Simplify(stmt1) - stmt2 = tvm.ir_pass.LoopPartition(stmt1, True) - stmt2 = tvm.ir_pass.Simplify(stmt2) + stmt2 = tvm.tir.ir_pass.LoopPartition(stmt1, True) + stmt2 = tvm.tir.ir_pass.Simplify(stmt2) #make sure loop partition actually did something - assert not tvm.ir_pass.Equal(stmt1.body, stmt2.body) + assert not tvm.tir.ir_pass.Equal(stmt1.body, stmt2.body) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_lower_intrin.py b/tests/python/unittest/test_pass_lower_intrin.py index 1e54f38b87d0..f36b4a5d95fe 100644 --- a/tests/python/unittest/test_pass_lower_intrin.py +++ b/tests/python/unittest/test_pass_lower_intrin.py @@ -15,21 +15,22 @@ # specific language governing permissions and limitations # under the License. 
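All the loop-partition tests above instantiate one template; the const-loop variant, restated under the new namespaces (this mirrors test_const_loop earlier in the file):

    import tvm
    from tvm import te

    n = 21
    A = te.placeholder((n,), name='A')
    B = te.placeholder((n,), name='B')
    T = te.compute((n,), lambda i: A[i] + B[i])
    s = te.create_schedule(T.op)
    xo, xi = s[T].split(T.op.axis[0], factor=4)
    bounds = tvm.te.schedule.InferBound(s)
    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
    # True enables partitioning of constant-extent loops
    stmt = tvm.tir.ir_pass.LoopPartition(stmt, True)
    stmt = tvm.tir.ir_pass.Simplify(stmt)
    assert 'if' not in str(stmt.body.body.body[0])  # main loop is condition-free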
import tvm +from tvm import te import numpy as np def lower_intrin(stmt): """wrapper to call transformation in stmt""" lower_expr = isinstance(stmt, tvm.tir.PrimExpr) stmt = tvm.tir.Evaluate(stmt) if lower_expr else stmt - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass._LowerIntrinStmt(stmt, "llvm") + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass._LowerIntrinStmt(stmt, "llvm") return stmt.value if lower_expr else stmt.body def check_value(expr, vx, vy, data, fref): n = len(data) - A = tvm.placeholder((n,), name="A", dtype=expr.dtype) - B = tvm.placeholder((n,), name="B", dtype=expr.dtype) + A = te.placeholder((n,), name="A", dtype=expr.dtype) + B = te.placeholder((n,), name="B", dtype=expr.dtype) def make_binds(i): x = expr @@ -37,8 +38,8 @@ def make_binds(i): x = tvm.tir.Let(vy, B[i], x) return x - C = tvm.compute((n,), make_binds) - s = tvm.create_schedule([C.op]) + C = te.compute((n,), make_binds) + s = te.create_schedule([C.op]) if not tvm.runtime.enabled("llvm"): return @@ -65,43 +66,43 @@ def get_ref_data(): def test_lower_floordiv(): data = get_ref_data() for dtype in ["int32", "int64", "int16"]: - x = tvm.var("x", dtype=dtype) - y = tvm.var("y", dtype=dtype) - zero = tvm.const(0, dtype) + x = te.var("x", dtype=dtype) + y = te.var("y", dtype=dtype) + zero = tvm.tir.const(0, dtype) # no constraints - res = lower_intrin(tvm.floordiv(x, y)) + res = lower_intrin(tvm.te.floordiv(x, y)) check_value(res, x, y, data, lambda a, b: a // b) # rhs >= 0 - res = lower_intrin(tvm.tir.Select(y >= 0, tvm.floordiv(x, y), zero)) + res = lower_intrin(tvm.tir.Select(y >= 0, tvm.te.floordiv(x, y), zero)) check_value(res, x, y, data, lambda a, b: a // b if b > 0 else 0) # involves max - res = lower_intrin(tvm.tir.Select(y >= 0, tvm.max(tvm.floordiv(x, y), zero), zero)) + res = lower_intrin(tvm.tir.Select(y >= 0, tvm.te.max(tvm.te.floordiv(x, y), zero), zero)) check_value(res, x, y, data, lambda a, b: max(a // b, 0) if b > 0 else 0) # lhs >= 0 - res = lower_intrin(tvm.tir.Select(tvm.all(y >= 0, x >= 0), tvm.floordiv(x, y), zero)) + res = lower_intrin(tvm.tir.Select(tvm.tir.all(y >= 0, x >= 0), tvm.te.floordiv(x, y), zero)) check_value(res, x, y, data, lambda a, b: a // b if b > 0 and a >= 0 else 0) # const power of two - res = lower_intrin(tvm.floordiv(x, tvm.const(8, dtype=dtype))) + res = lower_intrin(tvm.te.floordiv(x, tvm.tir.const(8, dtype=dtype))) check_value(res, x, y, [(a, b) for a, b in data if b == 8], lambda a, b: a // b) def test_lower_floormod(): data = get_ref_data() for dtype in ["int32", "int64", "int16"]: - x = tvm.var("x", dtype=dtype) - y = tvm.var("y", dtype=dtype) - zero = tvm.const(0, dtype) + x = te.var("x", dtype=dtype) + y = te.var("y", dtype=dtype) + zero = tvm.tir.const(0, dtype) # no constraints - res = lower_intrin(tvm.floormod(x, y)) + res = lower_intrin(tvm.te.floormod(x, y)) check_value(res, x, y, data, lambda a, b: a % b) # rhs >= 0 - res = lower_intrin(tvm.tir.Select(y >= 0, tvm.floormod(x, y), zero)) + res = lower_intrin(tvm.tir.Select(y >= 0, tvm.te.floormod(x, y), zero)) check_value(res, x, y, data, lambda a, b: a % b if b > 0 else 0) # lhs >= 0 - res = lower_intrin(tvm.tir.Select(tvm.all(y >= 0, x >= 0), tvm.floormod(x, y), zero)) + res = lower_intrin(tvm.tir.Select(tvm.tir.all(y >= 0, x >= 0), tvm.te.floormod(x, y), zero)) check_value(res, x, y, data, lambda a, b: a % b if b > 0 and a >= 0 else 0) # const power of two - res = lower_intrin(tvm.floormod(x, tvm.const(8, dtype=dtype))) + res = lower_intrin(tvm.te.floormod(x, 
tvm.tir.const(8, dtype=dtype))) check_value(res, x, y, [(a, b) for a, b in data if b == 8], lambda a, b: a % b) diff --git a/tests/python/unittest/test_pass_lower_warp_memory.py b/tests/python/unittest/test_pass_lower_warp_memory.py index 4f0927137444..266ca7eac99f 100644 --- a/tests/python/unittest/test_pass_lower_warp_memory.py +++ b/tests/python/unittest/test_pass_lower_warp_memory.py @@ -15,26 +15,27 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_lower_warp_mem(): m = 128 - A = tvm.placeholder((m,), name='A') - B = tvm.compute((m,), lambda i: A[i] + 3, name='B') + A = te.placeholder((m,), name='A') + B = te.compute((m,), lambda i: A[i] + 3, name='B') - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) AA = s.cache_read(A, "warp", [B]) xo, xi = s[B].split(B.op.axis[0], 32) xi0, xi1 = s[B].split(xi, factor=16) - tx = tvm.thread_axis("threadIdx.x") + tx = te.thread_axis("threadIdx.x") s[B].bind(xi1, tx) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) s[AA].compute_at(s[B], xo) xo, xi = s[AA].split(s[AA].op.axis[0], 16) s[AA].bind(xi, tx) f = tvm.lower(s, [A, B]) - fhost, fdevice = tvm.ir_pass.SplitHostDevice(f) - fdevice = tvm.ir_pass.LowerWarpMemory(fdevice, 16) + fhost, fdevice = tvm.tir.ir_pass.SplitHostDevice(f) + fdevice = tvm.tir.ir_pass.LowerWarpMemory(fdevice, 16) assert(fdevice.body.body.value.value == "local") assert(fdevice.body.body.body.extents[0].value == 2) diff --git a/tests/python/unittest/test_pass_makeapi.py b/tests/python/unittest/test_pass_makeapi.py index 34f32ef01c7c..6b28ef6cee18 100644 --- a/tests/python/unittest/test_pass_makeapi.py +++ b/tests/python/unittest/test_pass_makeapi.py @@ -15,26 +15,27 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy def test_makeapi(): """Not yet working, mock design""" - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) + n = te.size_var('n') + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + s = te.create_schedule(C.op) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - Cb = tvm.decl_buffer(C.shape, C.dtype, name='C') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb}, 64) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + Cb = tvm.tir.decl_buffer(C.shape, C.dtype, name='C') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B:Bb, C:Cb}, 64) num_unpacked_args = 2 - f = tvm.ir_pass.MakeAPI( + f = tvm.tir.ir_pass.MakeAPI( stmt, "myadd", [n, Ab, Bb, Cb], num_unpacked_args, True) assert(f.handle_data_type[Ab.data].dtype == Ab.dtype) assert(len(f.args) == 7) diff --git a/tests/python/unittest/test_pass_remove_no_op.py b/tests/python/unittest/test_pass_remove_no_op.py index a3927f7db49d..c9ecfbe21871 100644 --- a/tests/python/unittest/test_pass_remove_no_op.py +++ b/tests/python/unittest/test_pass_remove_no_op.py @@ -15,18 +15,19 @@ # specific language governing permissions and limitations # under the License. 
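The intrinsic-lowering helper above now routes entirely through tvm.tir.ir_pass; a condensed sketch of lowering a floordiv expression for the llvm target (names taken from the lower_intrin hunks):

    import tvm
    from tvm import te

    x = te.var("x", dtype="int32")
    y = te.var("y", dtype="int32")
    stmt = tvm.tir.Evaluate(tvm.te.floordiv(x, y))
    stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt)
    # lower floordiv/floormod into target-legal truncating arithmetic
    stmt = tvm.tir.ir_pass._LowerIntrinStmt(stmt, "llvm")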
import tvm +from tvm import te def nop(): return tvm.tir.Evaluate(0) def test_remove_no_op(): - i = tvm.var('i') - j = tvm.var('j') - k = tvm.var('k') - m = tvm.var('m') - n = tvm.var('n') + i = te.var('i') + j = te.var('j') + k = te.var('k') + m = te.var('m') + n = te.var('n') dtype = 'int64' - Ab = tvm.decl_buffer((n, ), dtype) + Ab = tvm.tir.decl_buffer((n, ), dtype) stmt = tvm.tir.For( i, 0, 4, 0, 0, tvm.tir.For( @@ -35,16 +36,16 @@ def test_remove_no_op(): k, 0, m, 0, 0, tvm.tir.IfThenElse( (i*m+j+k < n), tvm.tir.Evaluate(m), tvm.tir.Evaluate(n))))) - ret = tvm.ir_pass.RemoveNoOp(stmt) + ret = tvm.tir.ir_pass.RemoveNoOp(stmt) assert(isinstance(ret, tvm.tir.Evaluate)) store = tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1) stmt2 = tvm.tir.SeqStmt([nop(), tvm.tir.SeqStmt([store, nop()])]) - assert(tvm.ir_pass.RemoveNoOp(stmt2) == store) + assert(tvm.tir.ir_pass.RemoveNoOp(stmt2) == store) # remove zero extent loop stmt3 = tvm.tir.For(i, 0, 0, 0, 0, store) - ret = tvm.ir_pass.RemoveNoOp(stmt3) + ret = tvm.tir.ir_pass.RemoveNoOp(stmt3) assert(isinstance(ret, tvm.tir.Evaluate)) diff --git a/tests/python/unittest/test_pass_rewrite_for_tensor_core.py b/tests/python/unittest/test_pass_rewrite_for_tensor_core.py index cc99a25d81e9..977dfc3d6b26 100644 --- a/tests/python/unittest/test_pass_rewrite_for_tensor_core.py +++ b/tests/python/unittest/test_pass_rewrite_for_tensor_core.py @@ -15,16 +15,17 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import topi import numpy as np from tvm.contrib import nvcc def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): - A = tvm.placeholder((n, l), name='A', dtype='float16') - B = tvm.placeholder((l, m), name='B', dtype='float16') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k)) - s = tvm.create_schedule(C.op) + A = te.placeholder((n, l), name='A', dtype='float16') + B = te.placeholder((l, m), name='B', dtype='float16') + k = te.reduce_axis((0, l), name='k') + C = te.compute((n, m), lambda i, j: te.sum(A[i, k].astype('float32') * B[k, j].astype('float32'), axis=k)) + s = te.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] @@ -57,12 +58,12 @@ def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): kl, ki = s[CL].split(ki, tile_k) s[C].reorder(yo, xo, tz, ty, tx, yi, xi) - s[C].bind(yo, tvm.thread_axis("blockIdx.y")) - s[C].bind(xo, tvm.thread_axis("blockIdx.x")) - s[C].bind(ty, tvm.thread_axis("threadIdx.y")) - s[C].bind(tz, tvm.thread_axis("threadIdx.z")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) - s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy")) + s[C].bind(yo, te.thread_axis("blockIdx.y")) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tz, te.thread_axis("threadIdx.z")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) + s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy")) s[CL].compute_at(s[C], tx) yo, xo = CL.op.axis s[CL].reorder(ko, kl, ki, yo, xo) @@ -73,9 +74,9 @@ def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): tx, vec = s[AA].split(tx, factor=v) fused = s[AA].fuse(s[AA].op.axis[0], xo) _, ty = s[AA].split(fused, factor=by) - s[AA].bind(ty, tvm.thread_axis("threadIdx.y")) - s[AA].bind(tz, tvm.thread_axis("threadIdx.z")) - s[AA].bind(tx, tvm.thread_axis("threadIdx.x")) + s[AA].bind(ty, te.thread_axis("threadIdx.y")) + s[AA].bind(tz, 
te.thread_axis("threadIdx.z")) + s[AA].bind(tx, te.thread_axis("threadIdx.x")) s[AA].vectorize(vec) s[BB].compute_at(s[CL], ko) @@ -84,9 +85,9 @@ def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): tx, vec = s[BB].split(tx, factor=v) fused = s[BB].fuse(s[BB].op.axis[0], xo) _, ty = s[BB].split(fused, factor=by) - s[BB].bind(ty, tvm.thread_axis("threadIdx.y")) - s[BB].bind(tz, tvm.thread_axis("threadIdx.z")) - s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) + s[BB].bind(ty, te.thread_axis("threadIdx.y")) + s[BB].bind(tz, te.thread_axis("threadIdx.z")) + s[BB].bind(tx, te.thread_axis("threadIdx.x")) s[BB].vectorize(vec) s[AL].compute_at(s[CL], kl) @@ -111,11 +112,11 @@ def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): np.testing.assert_allclose(c_np, c.asnumpy(), rtol=1e-3) def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): - A = tvm.placeholder((batch, n, l), name='A', dtype='float16') - B = tvm.placeholder((batch, l, m), name='B', dtype='float16') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute((batch, n, m), lambda b, i, j: tvm.sum((A[b, i, k] * B[b, k, j]).astype('float32'), axis=k)) - s = tvm.create_schedule(C.op) + A = te.placeholder((batch, n, l), name='A', dtype='float16') + B = te.placeholder((batch, l, m), name='B', dtype='float16') + k = te.reduce_axis((0, l), name='k') + C = te.compute((batch, n, m), lambda b, i, j: te.sum((A[b, i, k] * B[b, k, j]).astype('float32'), axis=k)) + s = te.create_schedule(C.op) z, y, x = s[C].op.axis k = s[C].op.reduce_axis[0] @@ -148,13 +149,13 @@ def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): kl, ki = s[CL].split(ki, tile_k) s[C].reorder(z, yo, xo, tz, ty, tx, yi, xi) - s[C].bind(z, tvm.thread_axis("blockIdx.z")) - s[C].bind(yo, tvm.thread_axis("blockIdx.y")) - s[C].bind(xo, tvm.thread_axis("blockIdx.x")) - s[C].bind(ty, tvm.thread_axis("threadIdx.y")) - s[C].bind(tz, tvm.thread_axis("threadIdx.z")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) - s[C].bind(vy, tvm.thread_axis((0, vthread), "vthread", name="vy")) + s[C].bind(z, te.thread_axis("blockIdx.z")) + s[C].bind(yo, te.thread_axis("blockIdx.y")) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tz, te.thread_axis("threadIdx.z")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) + s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy")) s[CL].compute_at(s[C], tx) zo, yo, xo = CL.op.axis s[CL].reorder(ko, kl, ki, zo, yo, xo) @@ -165,9 +166,9 @@ def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): tx, vec = s[AA].split(tx, factor=v) fused = s[AA].fuse(s[AA].op.axis[1], xo) _, ty = s[AA].split(fused, factor=by) - s[AA].bind(ty, tvm.thread_axis("threadIdx.y")) - s[AA].bind(tz, tvm.thread_axis("threadIdx.z")) - s[AA].bind(tx, tvm.thread_axis("threadIdx.x")) + s[AA].bind(ty, te.thread_axis("threadIdx.y")) + s[AA].bind(tz, te.thread_axis("threadIdx.z")) + s[AA].bind(tx, te.thread_axis("threadIdx.x")) s[AA].vectorize(vec) s[BB].compute_at(s[CL], ko) @@ -176,9 +177,9 @@ def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): tx, vec = s[BB].split(tx, factor=v) fused = s[BB].fuse(s[BB].op.axis[1], xo) _, ty = s[BB].split(fused, factor=by) - s[BB].bind(ty, tvm.thread_axis("threadIdx.y")) - s[BB].bind(tz, tvm.thread_axis("threadIdx.z")) - s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) + s[BB].bind(ty, te.thread_axis("threadIdx.y")) + s[BB].bind(tz, te.thread_axis("threadIdx.z")) + s[BB].bind(tx, te.thread_axis("threadIdx.x")) 
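# te.thread_axis is a direct rename of tvm.thread_axis: it still returns an
# IterVar, so the bind() calls above keep mapping the shared-memory copy
# loops onto threadIdx.{x,y,z}, and vectorize(vec) below still widens the
# innermost copy loop into vector accesses.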
s[BB].vectorize(vec) s[AL].compute_at(s[CL], kl) diff --git a/tests/python/unittest/test_pass_rewrite_unsafe_select.py b/tests/python/unittest/test_pass_rewrite_unsafe_select.py index dc6ae8286213..f1e411eab9d5 100644 --- a/tests/python/unittest/test_pass_rewrite_unsafe_select.py +++ b/tests/python/unittest/test_pass_rewrite_unsafe_select.py @@ -15,21 +15,22 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_rewrite_Select(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() A = ib.allocate("float32", 100, name="A", scope="global") - i = tvm.var("i") + i = te.var("i") y = tvm.tir.Select(i > 1, A[i-1], 1.0) - yy = tvm.ir_pass.RewriteUnsafeSelect(tvm.tir.Evaluate(y)).value + yy = tvm.tir.ir_pass.RewriteUnsafeSelect(tvm.tir.Evaluate(y)).value z = tvm.tir.Select( tvm.tir.Select(i > 1, A[i-1], 1.0) > 0.0, A[i], 0.1) - zz = tvm.ir_pass.RewriteUnsafeSelect(tvm.tir.Evaluate(z)).value + zz = tvm.tir.ir_pass.RewriteUnsafeSelect(tvm.tir.Evaluate(z)).value - a = tvm.tir.Select(tvm.floordiv(i, 4) > 10, y, z) - aa = tvm.ir_pass.RewriteUnsafeSelect(tvm.tir.Evaluate(a)).value + a = tvm.tir.Select(tvm.te.floordiv(i, 4) > 10, y, z) + aa = tvm.tir.ir_pass.RewriteUnsafeSelect(tvm.tir.Evaluate(a)).value assert yy.name == "tvm_if_then_else" assert zz.name == "tvm_if_then_else" assert isinstance(aa, tvm.tir.Select) diff --git a/tests/python/unittest/test_pass_split_host_device.py b/tests/python/unittest/test_pass_split_host_device.py index e8858b8aa41e..09f7740df9c9 100644 --- a/tests/python/unittest/test_pass_split_host_device.py +++ b/tests/python/unittest/test_pass_split_host_device.py @@ -16,13 +16,14 @@ # under the License. import pytest import tvm +from tvm import te @pytest.mark.xfail def test_loop_dependent_allocate(): - N = tvm.size_var("N") - A = tvm.placeholder((2*N,), "float32", "A") - C = tvm.compute((N, ), lambda i: A[2*i] + A[i+1], name='C') - s = tvm.create_schedule(C.op) + N = te.size_var("N") + A = te.placeholder((2*N,), "float32", "A") + C = te.compute((N, ), lambda i: A[2*i] + A[i+1], name='C') + s = te.create_schedule(C.op) AA = s.cache_read(A, "local", [C]) s[AA].compute_at(s[C], s[C].op.axis[0]) # this line should fail due to IRUseDefAnalysis sees an allocate statement diff --git a/tests/python/unittest/test_pass_storage_flatten.py b/tests/python/unittest/test_pass_storage_flatten.py index 47a43c7ac2a0..e8a78cbc5209 100644 --- a/tests/python/unittest/test_pass_storage_flatten.py +++ b/tests/python/unittest/test_pass_storage_flatten.py @@ -15,35 +15,36 @@ # specific language governing permissions and limitations # under the License. 
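# A consolidated post-patch sketch of the flatten pipeline the next file
# exercises; the names and calls mirror test_flatten2 itself, minus the
# split/compute_at step:
import tvm
from tvm import te

m = te.size_var("m")
l = te.size_var("l")
A = te.placeholder((m, l), name="A")
A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1")
A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2")
s = te.create_schedule(A2.op)
bounds = tvm.te.schedule.InferBound(s)
stmt = tvm.te.schedule.ScheduleOps(s, bounds)
Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="A")
A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name="A2")
stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
stmt = tvm.tir.ir_pass.Simplify(stmt)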
import tvm +from tvm import te def test_flatten2(): - m = tvm.size_var('m') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + m = te.size_var('m') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + s = te.create_schedule(A2.op) xo, xi = s[A2].split(A2.op.axis[0], 8) s[A1].compute_at(s[A2], xo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) - stmt = tvm.ir_pass.Simplify(stmt) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name='A2') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) + stmt = tvm.tir.ir_pass.Simplify(stmt) def test_flatten_prefetch(): - A = tvm.placeholder((25, 100, 4), name = 'A') - _A= tvm.decl_buffer(A.shape, A.dtype, name = 'A'); - i = tvm.size_var('i') - j = tvm.size_var('j') + A = te.placeholder((25, 100, 4), name = 'A') + _A= tvm.tir.decl_buffer(A.shape, A.dtype, name = 'A'); + i = te.size_var('i') + j = te.size_var('j') region = [tvm.ir.Range.make_by_min_extent(i[0], i[1]) for i in [(i, 2), (j, 8), (0, 4)]] stmt = tvm.tir.Prefetch(A.op, 0, A.dtype, region) - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: _A}, 64) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: _A}, 64) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert stmt.extent.value == 2 assert isinstance(stmt.body, tvm.tir.For) assert stmt.body.extent.value == 2 @@ -52,27 +53,27 @@ def test_flatten_prefetch(): def test_flatten_storage_align(): m = 8 l = 16 - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + s = te.create_schedule(A2.op) s[A1].storage_align(A1.op.axis[0], 2, 1) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name='A2') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert(stmt.body.extents[0].value == 17 * 8) def test_flatten_double_buffer(): dtype = 'int64' n = 100 m = 4 - tx = tvm.thread_axis("threadIdx.x") - ib = tvm.ir_builder.create() + tx = te.thread_axis("threadIdx.x") + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") C = ib.pointer("float32", name="C") ib.scope_attr(tx, "thread_extent", 1) @@ 
-86,18 +87,18 @@ def test_flatten_double_buffer(): C[j] = B[j] + 1 stmt = ib.get() - stmt = tvm.ir_pass.StorageFlatten(stmt, {}, 64) - stmt = tvm.ir_pass.InjectDoubleBuffer(stmt, 2) - stmt = tvm.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {}, 64) + stmt = tvm.tir.ir_pass.InjectDoubleBuffer(stmt, 2) + stmt = tvm.tir.ir_pass.Simplify(stmt) assert isinstance(stmt.body.body, tvm.tir.Allocate) assert stmt.body.body.extents[0].value == 2 - f = tvm.ir_pass.MakeAPI(stmt, "db", [A.asobject(), C.asobject()], 2, True) - f = tvm.ir_pass.ThreadSync(f, "shared") + f = tvm.tir.ir_pass.MakeAPI(stmt, "db", [A.asobject(), C.asobject()], 2, True) + f = tvm.tir.ir_pass.ThreadSync(f, "shared") count = [0] def count_sync(op): if isinstance(op, tvm.tir.Call) and op.name == "tvm_storage_sync": count[0] += 1 - tvm.ir_pass.PostOrderVisit(f.body, count_sync) + tvm.tir.ir_pass.PostOrderVisit(f.body, count_sync) assert count[0] == 4 if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index d4125d093198..c74225d3a4be 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -15,33 +15,34 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_storage_share(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') num_stage = 5 B = A for t in range(num_stage): - B = tvm.compute((m, l), lambda i, j: B[i, j] + (t+1), name='A%d' % t) + B = te.compute((m, l), lambda i, j: B[i, j] + (t+1), name='A%d' % t) - s = tvm.create_schedule(B.op) - bounds = tvm.schedule.InferBound(s) + s = te.create_schedule(B.op) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.StorageRewrite(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageRewrite(stmt) # verify only have one allocations. 
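# (StorageRewrite can collapse the five chained elementwise stages into a
#  single backing buffer because each intermediate is dead as soon as the
#  next stage has consumed it; the Allocate count below asserts exactly
#  that.)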
# verify inplace folding works num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 1 def register_mem(scope_tb, max_bits): @@ -60,8 +61,8 @@ def test_alloc_seq(): register_mem(scope_tb, max_bits) - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", 200, name="A", scope=scope_tb) @@ -71,37 +72,37 @@ def test_alloc_seq(): A[j] = 1.3 body = ib.get() - body = tvm.ir_pass.StorageRewrite(body) + body = tvm.tir.ir_pass.StorageRewrite(body) num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 200 - tvm.ir_pass.PostOrderVisit(body, verify) + tvm.tir.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1 def test_alloc_different_dtypes(): def stmt_generater(dtype_list, length): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() base_dtype = dtype_list[0] - global_a = tvm.placeholder((length,), name = "global_a", dtype = base_dtype) + global_a = te.placeholder((length,), name = "global_a", dtype = base_dtype) assert len(dtype_list) == 4 with ib.for_range(0, length, name="j") as j: dtype = dtype_list[0] A = ib.allocate(dtype, length, name="A", scope="local.L0A") - A[j] = tvm.const(1, dtype = dtype) + A[j] = tvm.tir.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = dtype_list[1] B = ib.allocate(dtype, length, name="B", scope="local.L0A") - B[j] = tvm.const(1, dtype = dtype) + B[j] = tvm.tir.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = dtype_list[2] C = ib.allocate(dtype, length, name="C", scope="local.L0A") - C[j] = tvm.const(1, dtype = dtype) + C[j] = tvm.tir.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = dtype_list[3] D = ib.allocate(dtype, length, name="D", scope="local.L0A") - D[j] = tvm.const(1, dtype = dtype) + D[j] = tvm.tir.const(1, dtype = dtype) with ib.for_range(0, length, name="j") as j: dtype = "int8" E = ib.allocate(dtype, length, name="E", scope="local.L0A") @@ -128,8 +129,8 @@ def verify(n): body = stmt_generater(dtype_list, length) offset = offset_generater(dtype_list, length) - body = tvm.ir_pass.StorageRewrite(body) - tvm.ir_pass.PostOrderVisit(body, verify) + body = tvm.tir.ir_pass.StorageRewrite(body) + tvm.tir.ir_pass.PostOrderVisit(body, verify) length = 1024 dtype_list = ["float16", "int32", "uint16", "int8"] @@ -147,121 +148,121 @@ def verify(n): def test_inplace_rule(): m = 10 - A = tvm.placeholder((m,), name='A') - A0 = tvm.compute((m,), lambda i: A[i], name='A0') - A1 = tvm.compute((m,), lambda i: A[i] + 1, name='A1') - AA = tvm.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name='AA') - B = tvm.compute((m,), lambda i: AA[i] + 1, name='B') - s = tvm.create_schedule(B.op) - bounds = tvm.schedule.InferBound(s) + A = te.placeholder((m,), name='A') + A0 = te.compute((m,), lambda i: A[i], name='A0') + A1 = te.compute((m,), lambda i: A[i] + 1, name='A1') + AA = te.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name='AA') + B = te.compute((m,), lambda i: AA[i] + 1, name='B') + s = te.create_schedule(B.op) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, 
B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.StorageRewrite(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageRewrite(stmt) # verify only have one allocations. # verify inplace folding works num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2 def test_storage_combine(): n = 8 - A = tvm.placeholder((4,), name='A') + A = te.placeholder((4,), name='A') num_stage = 5 B = A stages = [] for t in range(num_stage): - B = tvm.compute((n, ), lambda i: B[i] + B[0] + (t+1), name='A%d' % t) + B = te.compute((n, ), lambda i: B[i] + B[0] + (t+1), name='A%d' % t) stages.append(B) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) for S in stages[:-1]: s[S].set_scope("global:tag") - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.StorageRewrite(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageRewrite(stmt) num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 assert (n.extents[0].value == 16) - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 1 def test_storage_share_gpu(): - m = tvm.var('m') - A = [tvm.placeholder((m), name='A')] + m = te.var('m') + A = [te.placeholder((m), name='A')] num_stage = 5 for t in range(num_stage): - A.append(tvm.compute((m,), lambda i: A[-1][i] + (t+1), name='A%d_s' % t)) - A.append(tvm.compute((m,), lambda i: A[-1][i], name='A%d' % t)) - s = tvm.create_schedule(A[-1].op) + A.append(te.compute((m,), lambda i: A[-1][i] + (t+1), name='A%d_s' % t)) + A.append(te.compute((m,), lambda i: A[-1][i], name='A%d' % t)) + s = te.create_schedule(A[-1].op) for t in range(num_stage): x = A[2*t+2].op.axis[0] bx, tx = s[A[2*t+2]].split(x, factor=32) - s[A[2*t+2]].bind(bx, tvm.thread_axis("blockIdx.x")) - s[A[2*t+2]].bind(tx, tvm.thread_axis("threadIdx.x")) + s[A[2*t+2]].bind(bx, te.thread_axis("blockIdx.x")) + s[A[2*t+2]].bind(tx, te.thread_axis("threadIdx.x")) s[A[2*t+1]].compute_at(s[A[2*t+2]], tx) s[A[2*t+1]].set_scope("shared") - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A[0].shape, A[0].dtype, name='A') - Bb = tvm.decl_buffer(A[0].shape, A[0].dtype, name='B') - stmt = 
tvm.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.StorageRewrite(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A[0].shape, A[0].dtype, name='A') + Bb = tvm.tir.decl_buffer(A[0].shape, A[0].dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A[0]: Ab, A[-1]: Bb}, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageRewrite(stmt) alloc_stats = {"global": 0, "shared": 0} def verify(n): if isinstance(n, tvm.tir.AttrStmt): if n.attr_key == "storage_scope": alloc_stats[n.value.value] += 1 - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) assert alloc_stats["global"] == 2 assert alloc_stats["shared"] == num_stage def test_parallel_alloc(): - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="i", for_type="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 body = ib.get() - body = tvm.ir_pass.StorageRewrite(body) + body = tvm.tir.ir_pass.StorageRewrite(body) assert (isinstance(body.body.body, tvm.tir.Allocate)) - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="t") as i: ib.scope_attr( - tvm.const(1, "int32") , "pragma_scope", + tvm.tir.const(1, "int32") , "pragma_scope", tvm.tir.StringImm("parallel_launch_point")) with ib.for_range(0, n, name="i", for_type="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 body = ib.get() - body = tvm.ir_pass.StorageRewrite(body) + body = tvm.tir.ir_pass.StorageRewrite(body) assert(isinstance(body.body.body.body.body, tvm.tir.Allocate)) @@ -269,35 +270,35 @@ def test_inplace_rule2(scope_tb = "local_TB2", max_bits = 1024 * 1024 * 1024): #Test Buffer register_mem(scope_tb, max_bits) m = 10 - A = tvm.placeholder((m,), name='A') - C = tvm.placeholder((m,), name='C') - D = tvm.placeholder((m,), name='D') - A0 = tvm.compute((m,), lambda i: A[i] + C[i], name='A0') - A1 = tvm.compute((m,), lambda i: D[i] * D[i], name='A1') - A2 = tvm.compute((m,), lambda i: A0[i] + A1[i], name='A2') - B = tvm.compute((m,), lambda i: A2[i], name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((m,), name='A') + C = te.placeholder((m,), name='C') + D = te.placeholder((m,), name='D') + A0 = te.compute((m,), lambda i: A[i] + C[i], name='A0') + A1 = te.compute((m,), lambda i: D[i] * D[i], name='A1') + A2 = te.compute((m,), lambda i: A0[i] + A1[i], name='A2') + B = te.compute((m,), lambda i: A2[i], name='B') + s = te.create_schedule(B.op) A0L = s.cache_read(A0, scope_tb, [A2]) A1L = s.cache_read(A1, scope_tb, [A2]) A2L = s.cache_read(A2, scope_tb, [B]) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - Cc = tvm.decl_buffer(C.shape, B.dtype, name='C') - Dd = tvm.decl_buffer(D.shape, B.dtype, name='D') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D:Dd}, 64) - stmt = tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = 
tvm.ir_pass.StorageRewrite(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + Cc = tvm.tir.decl_buffer(C.shape, B.dtype, name='C') + Dd = tvm.tir.decl_buffer(D.shape, B.dtype, name='D') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, C: Cc, D:Dd}, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageRewrite(stmt) # verify only have one allocations. # verify inplace folding works num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2 def test_exceed_mem(): @@ -318,27 +319,27 @@ def test_inplace_rule3(): register_mem(scope_tb, max_bits) m = 10 - B0 = tvm.placeholder((m,), name='B0') - B1 = tvm.placeholder((m,), name='B1') - B2 = tvm.placeholder((m,), name='B2') - B3 = tvm.placeholder((m,), name='B3') - B4 = tvm.placeholder((m,), name='B4') - B5 = tvm.placeholder((m,), name='B5') + B0 = te.placeholder((m,), name='B0') + B1 = te.placeholder((m,), name='B1') + B2 = te.placeholder((m,), name='B2') + B3 = te.placeholder((m,), name='B3') + B4 = te.placeholder((m,), name='B4') + B5 = te.placeholder((m,), name='B5') - B6 = tvm.compute((m,), lambda i: B1[i] * B5[i], name='B6') - B7 = tvm.compute((m,), lambda i: B2[i] * B4[i], name='B7') - B8 = tvm.compute((m,), lambda i: B6[i] - B7[i], name='B8') + B6 = te.compute((m,), lambda i: B1[i] * B5[i], name='B6') + B7 = te.compute((m,), lambda i: B2[i] * B4[i], name='B7') + B8 = te.compute((m,), lambda i: B6[i] - B7[i], name='B8') - B9 = tvm.compute((m,), lambda i: B2[i] * B3[i], name='B9') - B10 = tvm.compute((m,), lambda i: B0[i] * B5[i], name='B10') - B11 = tvm.compute((m,), lambda i: B9[i] - B10[i], name='B11') + B9 = te.compute((m,), lambda i: B2[i] * B3[i], name='B9') + B10 = te.compute((m,), lambda i: B0[i] * B5[i], name='B10') + B11 = te.compute((m,), lambda i: B9[i] - B10[i], name='B11') - B12 = tvm.compute((m,), lambda i: B0[i] * B4[i], name='B12') - B13 = tvm.compute((m,), lambda i: B1[i] * B3[i], name='B13') - B14 = tvm.compute((m,), lambda i: B12[i] - B13[i], name='B14') + B12 = te.compute((m,), lambda i: B0[i] * B4[i], name='B12') + B13 = te.compute((m,), lambda i: B1[i] * B3[i], name='B13') + B14 = te.compute((m,), lambda i: B12[i] - B13[i], name='B14') - B = tvm.compute((m,), lambda i: B8[i] * B11[i] + B14[i], name='B') - s = tvm.create_schedule(B.op) + B = te.compute((m,), lambda i: B8[i] * B11[i] + B14[i], name='B') + s = te.create_schedule(B.op) B1L = s.cache_read(B1, scope_tb, [B6, B13]) B5L = s.cache_read(B5, scope_tb, [B6, B10]) @@ -368,32 +369,32 @@ def test_inplace_rule3(): s[B10].compute_inline() s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - - B0a = tvm.decl_buffer(B0.shape, B0.dtype, name='B0') - B1a = tvm.decl_buffer(B1.shape, B1.dtype, name='B1') - B2a = tvm.decl_buffer(B2.shape, B2.dtype, name='B2') - B3a = tvm.decl_buffer(B3.shape, B3.dtype, name='B3') - B4a = tvm.decl_buffer(B4.shape, B4.dtype, name='B4') - B5a = tvm.decl_buffer(B5.shape, B5.dtype, name='B5') - - Bb = tvm.decl_buffer(B.shape, B.dtype, name='B') - stmt = tvm.ir_pass.StorageFlatten(stmt, {B0: B0a, B1: B1a, B2: B2a, B3: B2a, B4: B4a, B5: B5a, B: Bb}, 64) - stmt = 
tvm.ir_pass.CanonicalSimplify(stmt) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.StorageRewrite(stmt) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + + B0a = tvm.tir.decl_buffer(B0.shape, B0.dtype, name='B0') + B1a = tvm.tir.decl_buffer(B1.shape, B1.dtype, name='B1') + B2a = tvm.tir.decl_buffer(B2.shape, B2.dtype, name='B2') + B3a = tvm.tir.decl_buffer(B3.shape, B3.dtype, name='B3') + B4a = tvm.tir.decl_buffer(B4.shape, B4.dtype, name='B4') + B5a = tvm.tir.decl_buffer(B5.shape, B5.dtype, name='B5') + + Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name='B') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {B0: B0a, B1: B1a, B2: B2a, B3: B2a, B4: B4a, B5: B5a, B: Bb}, 64) + stmt = tvm.tir.ir_pass.CanonicalSimplify(stmt) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.StorageRewrite(stmt) # verify only have one allocations. # verify inplace folding works def verify(n): if isinstance(n, tvm.tir.Allocate): assert n.extents[0].value == 70 - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) def test_alloc_seq_type(): - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", 200, name="A", scope="local.L0A") @@ -401,22 +402,22 @@ def test_alloc_seq_type(): A[j] = 1.2 A1[j] = 1.3 B = ib.allocate("int16", 200, name="B", scope="local.L0A") - B[j] = tvm.const(1, "int16") + B[j] = tvm.tir.const(1, "int16") C = ib.allocate("int16", 200, name="C", scope="local.L0A") - C[j] = tvm.const(1, "int16") + C[j] = tvm.tir.const(1, "int16") D = ib.allocate("int16", 200, name="D", scope="local.L0A") D[j] = B[j] + C[j] A2 = ib.allocate("float32", 200, name="A2", scope="local.L0A") A2[j] = A[j] body = ib.get() - body = tvm.ir_pass.StorageRewrite(body) + body = tvm.tir.ir_pass.StorageRewrite(body) num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 500 - tvm.ir_pass.PostOrderVisit(body, verify) + tvm.tir.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1 def test_alloc_seq_type2(): @@ -425,50 +426,50 @@ def test_alloc_seq_type2(): register_mem(scope_tb, max_bits) - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", 200, name="A", scope=scope_tb) A[j] = 1.2 with ib.for_range(0, 20, name="j") as j: B = ib.allocate("int16", 400, name="B", scope=scope_tb) - B[j] = tvm.const(1, "int16") + B[j] = tvm.tir.const(1, "int16") with ib.for_range(0, 10, name="j") as j: C = ib.allocate("float32", 200, name="C", scope=scope_tb) C[j] = 1.2 body = ib.get() - body = tvm.ir_pass.StorageRewrite(body) + body = tvm.tir.ir_pass.StorageRewrite(body) num_alloc = [0] def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 200 - tvm.ir_pass.PostOrderVisit(body, verify) + tvm.tir.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1 def test_reuse_small_buffer(): - ib = tvm.ir_builder.create() - n = tvm.var("n") + ib = tvm.tir.ir_builder.create() + n = te.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("int16", 200, name="A", scope="local.L0A") - A[j] = tvm.const(1, "int16") + A[j] = tvm.tir.const(1, "int16") B = ib.allocate("int16", 200, name="B", scope="local.L0A") - B[j] = tvm.const(1, "int16") + 
B[j] = tvm.tir.const(1, "int16") B1 = ib.allocate("int16", 200, name="B1", scope="local.L0A") B1[j] = A[j] + B[j] C = ib.allocate("int16", 400, name="C", scope="local.L0A") - C[j] = tvm.const(1, "int16") + C[j] = tvm.tir.const(1, "int16") D = ib.allocate("int16", 400, name="D", scope="local.L0A") - D[j] = tvm.const(1, "int16") + D[j] = tvm.tir.const(1, "int16") E = ib.allocate("int16", 400, name="E", scope="local.L0A") E[j] = C[j] body = ib.get() - body = tvm.ir_pass.StorageRewrite(body) + body = tvm.tir.ir_pass.StorageRewrite(body) num_alloc = [0] @@ -476,20 +477,20 @@ def verify(n): if isinstance(n, tvm.tir.Allocate): num_alloc[0] += 1 assert n.extents[0].value == 800 - tvm.ir_pass.PostOrderVisit(body, verify) + tvm.tir.ir_pass.PostOrderVisit(body, verify) assert num_alloc[0] == 1 def test_replace_dataflow(): shape = (255,) - A = tvm.placeholder(shape, name = "A") - B = tvm.compute(shape, lambda i: A[i] + A[i], name = "B") - C = tvm.compute(shape, lambda i: A[i] + B[i], name = "C") - D = tvm.compute(shape, lambda i: A[i] + C[i], name = "D") - E = tvm.compute(shape, lambda i: A[i] + D[i], name = "E") + A = te.placeholder(shape, name = "A") + B = te.compute(shape, lambda i: A[i] + A[i], name = "B") + C = te.compute(shape, lambda i: A[i] + B[i], name = "C") + D = te.compute(shape, lambda i: A[i] + C[i], name = "D") + E = te.compute(shape, lambda i: A[i] + D[i], name = "E") - s = tvm.create_schedule(E.op) + s = te.create_schedule(E.op) s.cache_read(A, "local", [B, C, D, E]) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) @@ -505,16 +506,16 @@ def compute(a, b): n = 16384 shape = (n, n) - a = tvm.placeholder(shape, name='a', dtype='int32') - b = tvm.placeholder(shape, name='b', dtype='int32') - c = tvm.compute(shape, lambda i, j: compute(a, b)[i, j]) - c = tvm.compute(shape, lambda i, j: 1 + c[i, j]) - s = tvm.create_schedule(c.op) + a = te.placeholder(shape, name='a', dtype='int32') + b = te.placeholder(shape, name='b', dtype='int32') + c = te.compute(shape, lambda i, j: compute(a, b)[i, j]) + c = te.compute(shape, lambda i, j: 1 + c[i, j]) + s = te.create_schedule(c.op) stmt = tvm.lower(s, [a, b, c], simple_mode=True) def verify(n): if isinstance(n, tvm.tir.Allocate): assert n.extents[0].value == 268435456 - tvm.ir_pass.PostOrderVisit(stmt, verify) + tvm.tir.ir_pass.PostOrderVisit(stmt, verify) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_storage_sync.py b/tests/python/unittest/test_pass_storage_sync.py index 0ed0c993ac55..9edfa9575435 100644 --- a/tests/python/unittest/test_pass_storage_sync.py +++ b/tests/python/unittest/test_pass_storage_sync.py @@ -15,31 +15,32 @@ # specific language governing permissions and limitations # under the License. 
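# In the sync test below, SplitHostDevice separates the host wrapper from
# the device kernel, and ThreadSync(f, "shared") then inserts a
# tvm_storage_sync("shared") barrier between the shared-memory write of A1
# and its read by A2; the body_list[1] assertion checks for that inserted
# call.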
import tvm +from tvm import te def test_storage_sync(): - m = tvm.size_var('m') - l = tvm.size_var('l') - A = tvm.placeholder((m, l), name='A') + m = te.size_var('m') + l = te.size_var('l') + A = te.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + s = te.create_schedule(A2.op) xo, xi = s[A2].split(A2.op.axis[0], factor=8) - s[A2].bind(xo, tvm.thread_axis("blockIdx.x")) + s[A2].bind(xo, te.thread_axis("blockIdx.x")) s[A1].compute_at(s[A2], xo) s[A1].set_scope("shared") - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) - f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True) - flist = tvm.ir_pass.SplitHostDevice(f) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name='A2') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) + f = tvm.tir.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True) + flist = tvm.tir.ir_pass.SplitHostDevice(f) f = flist[1] - f = tvm.ir_pass.ThreadSync(f, "shared") + f = tvm.tir.ir_pass.ThreadSync(f, "shared") body_list = tvm.tir.stmt_list(f.body.body.body.body) assert(body_list[1].value.name == "tvm_storage_sync") @@ -52,10 +53,10 @@ def meminfo_cache(): unit_bits=8, max_simd_bits=32, max_num_bits=128, - head_address=tvm.call_extern("handle", "global_cache")) - ib = tvm.ir_builder.create() - n = tvm.size_var("n") - cp = tvm.thread_axis((0, 1), "cop") + head_address=tvm.tir.call_extern("handle", "global_cache")) + ib = tvm.tir.ir_builder.create() + n = te.size_var("n") + cp = te.thread_axis((0, 1), "cop") A = ib.allocate("float32", 128, name="A", scope="global.cache") with ib.for_range(0, n, name="i") as i: A[i] = A[i] + 1 @@ -64,7 +65,7 @@ def meminfo_cache(): ib.scope_attr(cp, "coproc_scope", 1) A[j] = A[j + k * 10] + 2 stmt = ib.get() - stmt = tvm.ir_pass.CoProcSync(stmt) + stmt = tvm.tir.ir_pass.CoProcSync(stmt) body = stmt.body.body.body blist = tvm.tir.stmt_list(body) assert(blist[1].value.name == "cop.coproc_read_barrier") @@ -75,10 +76,10 @@ def meminfo_cache(): def test_coproc_sync2(): - ib = tvm.ir_builder.create() - n = tvm.size_var("n") - cp = tvm.thread_axis((0, 1), "cop") - ty = tvm.thread_axis("cthread") + ib = tvm.tir.ir_builder.create() + n = te.size_var("n") + cp = te.thread_axis((0, 1), "cop") + ty = te.thread_axis("cthread") A = ib.allocate("float32", 128, name="A") ib.scope_attr(ty, "virtual_thread", 2) with ib.new_scope(): @@ -92,7 +93,7 @@ def test_coproc_sync2(): ib.scope_attr(cp, "coproc_scope", 2) A[ty] = 1.0 stmt = ib.get() - stmt = tvm.ir_pass.CoProcSync(stmt) + stmt = tvm.tir.ir_pass.CoProcSync(stmt) def test_coproc_sync3(): def __check_list(tvm_array, py_list): @@ -101,9 +102,9 @@ def __check_list(tvm_array, py_list): return False return True - ib = tvm.ir_builder.create() - n = tvm.size_var("n") - cp = tvm.thread_axis((0, 1), "cop") + ib = tvm.tir.ir_builder.create() + n = te.size_var("n") + cp = te.thread_axis((0, 1), "cop") A = ib.allocate("float32", 128, name="A", 
scope="global.cache") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, n, name="i") as j: @@ -118,7 +119,7 @@ def __check_list(tvm_array, py_list): A[0] = 0.0 stmt = ib.get() - stmt = tvm.ir_pass.CoProcSync(stmt) + stmt = tvm.tir.ir_pass.CoProcSync(stmt) slist = tvm.tir.stmt_list(stmt[0].body.body) push_st = slist[2] slist = tvm.tir.stmt_list(slist[-1]) diff --git a/tests/python/unittest/test_pass_unroll.py b/tests/python/unittest/test_pass_unroll.py index c6b536bf970e..165edab55f4e 100644 --- a/tests/python/unittest/test_pass_unroll.py +++ b/tests/python/unittest/test_pass_unroll.py @@ -15,14 +15,15 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import os def test_unroll_loop(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() dtype = 'int64' - n = tvm.size_var('n') - Ab = tvm.decl_buffer((n, ), dtype) + n = te.size_var('n') + Ab = tvm.tir.decl_buffer((n, ), dtype) Aptr = ib.buffer_ptr(Ab) # for i in 0 to n-1: with ib.for_range(n, n + 2, name="i") as i: @@ -31,31 +32,31 @@ def test_unroll_loop(): stmt = ib.get() assert isinstance(stmt, tvm.tir.For) - ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True) + ret = tvm.tir.ir_pass.UnrollLoop(stmt, 16, 8, 0, True) assert not isinstance(ret, tvm.tir.For) - ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True) + ret = tvm.tir.ir_pass.UnrollLoop(stmt, 15, 8, 0, True) assert isinstance(ret, tvm.tir.For) - ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False) + ret = tvm.tir.ir_pass.UnrollLoop(stmt, 16, 8, 0, False) assert isinstance(ret, tvm.tir.For) assert ret.for_type == tvm.tir.For.Unrolled - ib = tvm.ir_builder.create() - ib.scope_attr(tvm.const(0, "int32"), "pragma_auto_unroll_max_step", 16) + ib = tvm.tir.ir_builder.create() + ib.scope_attr(tvm.tir.const(0, "int32"), "pragma_auto_unroll_max_step", 16) ib.emit(stmt) wrapped = ib.get() wrapped = tvm.tir.SeqStmt([wrapped, stmt]) assert isinstance(ret, tvm.tir.For) - ret = tvm.ir_pass.UnrollLoop(wrapped, 0, 8, 0, False) + ret = tvm.tir.ir_pass.UnrollLoop(wrapped, 0, 8, 0, False) assert isinstance(ret[0], tvm.tir.For) assert ret[0].for_type == tvm.tir.For.Unrolled assert isinstance(ret[1], tvm.tir.For) assert ret[1].for_type != tvm.tir.For.Unrolled def test_unroll_fake_loop(): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() dtype = 'int32' - n = tvm.size_var('n') - Ab = tvm.decl_buffer((n, ), dtype) + n = te.size_var('n') + Ab = tvm.tir.decl_buffer((n, ), dtype) Aptr = ib.buffer_ptr(Ab) # for i in 0 to n-1: with ib.for_range(0, 1, name="i") as i: @@ -64,20 +65,20 @@ def test_unroll_fake_loop(): Aptr[j + 1] = Aptr[i] + 1 stmt = ib.get() - ret = tvm.ir_pass.UnrollLoop(stmt, 8, 0, 1, True) + ret = tvm.tir.ir_pass.UnrollLoop(stmt, 8, 0, 1, True) assert isinstance(ret[0], tvm.tir.Store) def test_unroll_single_count_loops(): - n = tvm.size_var('n') - A = tvm.placeholder((n,), name='A') - B = tvm.compute((n,), lambda *i: A(*i), name='B') - s = tvm.create_schedule(B.op) + n = te.size_var('n') + A = te.placeholder((n,), name='A') + B = te.compute((n,), lambda *i: A(*i), name='B') + s = te.create_schedule(B.op) s = s.normalize() - dom_map = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, dom_map) + dom_map = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, dom_map) # all parameters to UnrolLoops are default values except for # auto_unroll_max_extent which has been set to 1 (default:0) - after_unroll_stmt = tvm.ir_pass.UnrollLoop(stmt, 0, 8, 1, True) + 
after_unroll_stmt = tvm.tir.ir_pass.UnrollLoop(stmt, 0, 8, 1, True) assert after_unroll_stmt == stmt if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_vectorize.py b/tests/python/unittest/test_pass_vectorize.py index d1cd2d46074a..2ade843361c0 100644 --- a/tests/python/unittest/test_pass_vectorize.py +++ b/tests/python/unittest/test_pass_vectorize.py @@ -15,19 +15,20 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_vectorize_loop(): dtype = 'int64' - n = tvm.var('n') - ib = tvm.ir_builder.create() + n = te.var('n') + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as i: with ib.for_range(0, 4, for_type="vectorize") as j: - A[j] = tvm.const(1, A.dtype) + A[j] = tvm.tir.const(1, A.dtype) stmt = ib.get() assert isinstance(stmt.body, tvm.tir.For) - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.tir.For) assert not isinstance(stmt.body, tvm.tir.For) assert isinstance(stmt.body.index, tvm.tir.Ramp) @@ -35,15 +36,15 @@ def test_vectorize_loop(): def test_vectorize_vector(): dtype = 'int64' - n = tvm.var('n') - ib = tvm.ir_builder.create() + n = te.var('n') + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32x4", name="A") with ib.for_range(0, n) as i: with ib.for_range(0, 4, for_type="vectorize") as j: - A[j] = tvm.const(1, A.dtype) + A[j] = tvm.tir.const(1, A.dtype) stmt = ib.get() assert isinstance(stmt.body, tvm.tir.For) - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.tir.For) assert not isinstance(stmt.body, tvm.tir.For) assert isinstance(stmt.body.index, tvm.tir.Ramp) @@ -51,9 +52,9 @@ def test_vectorize_vector(): def test_vectorize_with_if(): - n = tvm.var('n') - x = tvm.var('x') - ib = tvm.ir_builder.create() + n = te.var('n') + x = te.var('x') + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 4, for_type="vectorize") as i: with ib.if_scope(x < n): @@ -62,7 +63,7 @@ def test_vectorize_with_if(): with ib.if_scope(i < n): A[i] = 2.0 stmt = ib.get() - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.tir.IfThenElse) assert isinstance(stmt.then_case.index, tvm.tir.Ramp) assert isinstance(stmt.then_case.value, tvm.tir.Add) @@ -70,51 +71,51 @@ def test_vectorize_with_if(): assert isinstance(stmt.else_case, tvm.tir.For) def test_vectorize_with_le_cond(): - n = tvm.var('n') - ib = tvm.ir_builder.create() + n = te.var('n') + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 4, for_type="vectorize") as i: with ib.if_scope(i <= n): A[i] = A[i] + 1 stmt = ib.get() - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.tir.For) def test_vectorize_with_ge_cond(): - n = tvm.var('n') - ib = tvm.ir_builder.create() + n = te.var('n') + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, 4, for_type="vectorize") as i: with ib.if_scope(i >= n): A[i] = A[i] + 1 stmt = ib.get() - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.tir.For) def test_vectorize_if_then_else(): - n = tvm.var('n') - x = tvm.var('x') - ib = tvm.ir_builder.create() + n = te.var('n') + x = te.var('x') + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", 
name="A") with ib.for_range(0, 4, for_type="vectorize") as i: - A[i] = tvm.call_intrin("float32", "tvm_if_then_else", + A[i] = tvm.tir.call_intrin("float32", "tvm_if_then_else", i > 0, A[i] + 1, A[i]) stmt = ib.get() - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert isinstance(stmt, tvm.tir.For) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as k: with ib.for_range(0, 4, for_type="vectorize") as i: - A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else", + A[k * 4 + i] = tvm.tir.call_intrin("float32", "tvm_if_then_else", k > 0, A[k * 4 + i], 0) stmt = ib.get() assert isinstance(stmt.body, tvm.tir.For) - stmt = tvm.ir_pass.VectorizeLoop(stmt) + stmt = tvm.tir.ir_pass.VectorizeLoop(stmt) assert not isinstance(stmt.body, tvm.tir.For) assert isinstance(stmt.body.value.args[2], tvm.tir.Broadcast) diff --git a/tests/python/unittest/test_pass_verify_gpu_code.py b/tests/python/unittest/test_pass_verify_gpu_code.py index 76e5f0d38c3c..6e138a29b3e9 100644 --- a/tests/python/unittest/test_pass_verify_gpu_code.py +++ b/tests/python/unittest/test_pass_verify_gpu_code.py @@ -16,10 +16,11 @@ # under the License. """Test gpu code verifier""" import tvm +from tvm import te def get_verify_pass(valid, **kwargs): def verify_pass(stmt): - valid[0] = tvm.ir_pass.VerifyGPUCode(stmt, kwargs) + valid[0] = tvm.tir.ir_pass.VerifyGPUCode(stmt, kwargs) return stmt return verify_pass @@ -31,15 +32,15 @@ def check_shared_memory(dtype): tvm_type = tvm.runtime.DataType(dtype) type_size = tvm_type.bits // 8 * tvm_type.lanes - A = tvm.placeholder((N,), name='A', dtype=dtype) - B = tvm.compute((N, ), lambda i: A[i], name='B') + A = te.placeholder((N,), name='A', dtype=dtype) + B = te.compute((N, ), lambda i: A[i], name='B') - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) AA = s.cache_read(A, "shared", [B]) o, i = s[B].split(s[B].op.axis[0], M) s[AA].compute_at(s[B], o) - s[B].bind(o, tvm.thread_axis("blockIdx.x")) - s[B].bind(i, tvm.thread_axis("threadIdx.x")) + s[B].bind(o, te.thread_axis("blockIdx.x")) + s[B].bind(i, te.thread_axis("threadIdx.x")) # shared memory usage: M * sizeof(dtype) Bytes # thread usage: M @@ -48,14 +49,14 @@ def check_shared_memory(dtype): if not tvm.context(target).exist: continue valid = [None] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=type_size * M - 1, max_threads_per_block=M))]}): tvm.build(s, [A, B], target) assert not valid[0] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=type_size * M, max_threads_per_block=M))]}): @@ -68,14 +69,14 @@ def test_local_memory(): N = 1024 M = 128 - A = tvm.placeholder((N,), name='A', dtype='float32') - B = tvm.compute((N, ), lambda i: A[i], name='B') + A = te.placeholder((N,), name='A', dtype='float32') + B = te.compute((N, ), lambda i: A[i], name='B') - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) AA = s.cache_read(A, "local", [B]) o, i = s[B].split(s[B].op.axis[0], M) s[AA].compute_at(s[B], o) - s[B].bind(o, tvm.thread_axis("blockIdx.x")) + s[B].bind(o, te.thread_axis("blockIdx.x")) # local memory usage: M * 4B # thread usage: M @@ -85,14 +86,14 @@ def test_local_memory(): continue valid = [None] - with tvm.build_config(**{"add_lower_pass": [ + with 
tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_local_memory_per_block=4 * M - 1, max_threads_per_block=1))]}): tvm.build(s, [A, B], target) assert not valid[0] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_local_memory_per_block=4 * M, max_threads_per_block=1))]}): @@ -103,14 +104,14 @@ def test_num_thread(): N = 1024 M = 128 - A = tvm.placeholder((N,), name='A', dtype='float32') - B = tvm.compute((N, ), lambda i: A[i], name='B') + A = te.placeholder((N,), name='A', dtype='float32') + B = te.compute((N, ), lambda i: A[i], name='B') - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) o, i = s[B].split(s[B].op.axis[0], M) - s[B].bind(o, tvm.thread_axis('threadIdx.x')) - s[B].bind(i, tvm.thread_axis("threadIdx.y")) + s[B].bind(o, te.thread_axis('threadIdx.x')) + s[B].bind(i, te.thread_axis("threadIdx.y")) # shared memory usage: 0 # thread usage: N @@ -120,21 +121,21 @@ def test_num_thread(): continue valid = [None] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1))]}): tvm.build(s, [A, B], target) assert not valid[0] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N))]}): tvm.build(s, [A, B], target) assert valid[0] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N, @@ -142,7 +143,7 @@ def test_num_thread(): tvm.build(s, [A, B], target) assert not valid[0] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N, @@ -153,14 +154,14 @@ def test_num_thread(): def test_multiple_kernels(): N = 1024 - A = tvm.placeholder((N, N), name='A') - B = tvm.compute((N, N), lambda i, j: A[i, j]) - C = tvm.compute((N, N), lambda i, j: B[i, j]) + A = te.placeholder((N, N), name='A') + B = te.compute((N, N), lambda i, j: A[i, j]) + C = te.compute((N, N), lambda i, j: B[i, j]) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) - s[C].bind(s[C].op.axis[1], tvm.thread_axis("threadIdx.x")) - s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x")) + s[C].bind(s[C].op.axis[1], te.thread_axis("threadIdx.x")) + s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x")) # shared memory usage: 0 # thread usage: N @@ -170,14 +171,14 @@ def test_multiple_kernels(): continue valid = [None] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1))]}): tvm.build(s, [A, C], target) assert not valid[0] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_shared_memory_per_block=0, max_threads_per_block=N))]}): @@ -187,21 +188,21 @@ def test_multiple_kernels(): def test_wrong_bind(): N = 1024 - A = tvm.placeholder((N, N-1), name='A') - B = tvm.compute((N, N-1), lambda i, j: A[i, j]) + A = te.placeholder((N, N-1), name='A') + B = te.compute((N, N-1), lambda i, j: A[i, j]) - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) # bind a thread 
axis to two loop axes with different lengths - s[B].bind(s[B].op.axis[0], tvm.thread_axis("threadIdx.x")) - s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x")) + s[B].bind(s[B].op.axis[0], te.thread_axis("threadIdx.x")) + s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x")) for target in ['opencl', 'cuda']: if not tvm.context(target).exist: continue valid = [None] - with tvm.build_config(**{"add_lower_pass": [ + with tvm.target.build_config(**{"add_lower_pass": [ (2, get_verify_pass(valid, max_threads_per_block=N*N))]}): tvm.build(s, [A, B], target) assert not valid[0] diff --git a/tests/python/unittest/test_pass_verify_memory.py b/tests/python/unittest/test_pass_verify_memory.py index e76b6e55144f..3747caed1586 100644 --- a/tests/python/unittest/test_pass_verify_memory.py +++ b/tests/python/unittest/test_pass_verify_memory.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te # The following DLDeviceType/TVMDeviceExtType values # are originally defined in dlpack.h and c_runtime_api.h. @@ -26,19 +27,19 @@ def lower(sch, args): binds = {} arg_list = [] for x in args: - if isinstance(x, tvm.tensor.Tensor): - buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name) + if isinstance(x, te.tensor.Tensor): + buf = tvm.tir.decl_buffer(x.shape, dtype=x.dtype, name=x.name) assert x not in binds binds[x] = buf arg_list.append(buf) else: raise ValueError("args must be Tensor, Buffer or Var") sch = sch.normalize() - bounds = tvm.schedule.InferBound(sch) - stmt = tvm.schedule.ScheduleOps(sch, bounds) - stmt = tvm.ir_pass.LoopPartition(stmt, False) - stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64) - func = tvm.ir_pass.MakeAPI(stmt, "myadd", arg_list, 0, True) + bounds = tvm.te.schedule.InferBound(sch) + stmt = tvm.te.schedule.ScheduleOps(sch, bounds) + stmt = tvm.tir.ir_pass.LoopPartition(stmt, False) + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, binds, 64) + func = tvm.tir.ir_pass.MakeAPI(stmt, "myadd", arg_list, 0, True) return func @@ -46,63 +47,63 @@ def lower(sch, args): # So VerifyMemory pass is expected to succeed. # def test_verify_memory_all_bind(): - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B") + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B") # B is bound to threads. - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=64) - s[B].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(tx, te.thread_axis("threadIdx.x")) func = lower(s, [A, B]) for dev_type in gpu_devices + other_devices: - assert tvm.ir_pass.VerifyMemory(func, dev_type) + assert tvm.tir.ir_pass.VerifyMemory(func, dev_type) # Computations are not bound. # So VerifyMemory pass fails when device type is GPU. # def test_verify_memory_not_bind(): - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B") + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B") # B is not bound to threads. 
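# (Without a thread binding the compute loop stays in host scope, so for
#  the gpu_devices entries VerifyMemory flags the host-side access to the
#  device-resident A and B; the CPU-like other_devices still pass.)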
- s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) func = lower(s, [A, B]) for dev_type in gpu_devices: - assert not tvm.ir_pass.VerifyMemory(func, dev_type) + assert not tvm.tir.ir_pass.VerifyMemory(func, dev_type) for dev_type in other_devices: - assert tvm.ir_pass.VerifyMemory(func, dev_type) + assert tvm.tir.ir_pass.VerifyMemory(func, dev_type) # Computations are partially bound. # So VerifyMemory pass fails when device type is GPU. # def test_verify_memory_partially_bind(): - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda i: A[i] + 1.0, name="B") - C = tvm.compute(B.shape, lambda i: B[i] + 2.0, name="C") - D = tvm.compute(C.shape, lambda i: C[i] + 2.0, name="D") + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B") + C = te.compute(B.shape, lambda i: B[i] + 2.0, name="C") + D = te.compute(C.shape, lambda i: C[i] + 2.0, name="D") # C is bound to threads, but B and D are not. - s = tvm.create_schedule([B.op, C.op, D.op]) + s = te.create_schedule([B.op, C.op, D.op]) bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) func = lower(s, [A, B, C, D]) for dev_type in gpu_devices: - assert not tvm.ir_pass.VerifyMemory(func, dev_type) + assert not tvm.tir.ir_pass.VerifyMemory(func, dev_type) for dev_type in other_devices: - assert tvm.ir_pass.VerifyMemory(func, dev_type) + assert tvm.tir.ir_pass.VerifyMemory(func, dev_type) if __name__ == "__main__": diff --git a/tests/python/unittest/test_pass_virtual_thread.py b/tests/python/unittest/test_pass_virtual_thread.py index 48a769faed31..2d96696eed88 100644 --- a/tests/python/unittest/test_pass_virtual_thread.py +++ b/tests/python/unittest/test_pass_virtual_thread.py @@ -15,29 +15,30 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te def test_virtual_thread(): - m = tvm.var('m') - A = tvm.placeholder((m, ), name='A') - A1 = tvm.compute((m,), lambda i: A[i], name='A1') - A2 = tvm.compute((m,), lambda i: A1[i] + 3, name='A2') + m = te.var('m') + A = te.placeholder((m, ), name='A') + A1 = te.compute((m,), lambda i: A[i], name='A1') + A2 = te.compute((m,), lambda i: A1[i] + 3, name='A2') - s = tvm.create_schedule(A2.op) - vx = tvm.thread_axis("vthread", name="vx") + s = te.create_schedule(A2.op) + vx = te.thread_axis("vthread", name="vx") xo, xi = s[A2].split(A2.op.axis[0], nparts=2) s[A2].bind(xo, vx) xo, xi = s[A2].split(xi, 8) s[A1].compute_at(s[A2], xo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - stmt = tvm.schedule.ScheduleOps(s, bounds) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) - Ab = tvm.decl_buffer(A.shape, A.dtype, name='A') - A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2') - stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) - stmt = tvm.ir_pass.Simplify(stmt) - stmt = tvm.ir_pass.InjectVirtualThread(stmt) + Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name='A') + A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name='A2') + stmt = tvm.tir.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64) + stmt = tvm.tir.ir_pass.Simplify(stmt) + stmt = tvm.tir.ir_pass.InjectVirtualThread(stmt) print(stmt) if __name__ == "__main__": diff --git a/tests/python/unittest/test_runtime_error.py b/tests/python/unittest/test_runtime_error.py index ac019a0aab40..70166b327cb6 100644 --- a/tests/python/unittest/test_runtime_error.py +++ b/tests/python/unittest/test_runtime_error.py @@ -16,6 +16,7 @@ # under the License. """Test runtime error handling""" import tvm +from tvm import te import tvm.testing def test_op_translation(): diff --git a/tests/python/unittest/test_runtime_extension.py b/tests/python/unittest/test_runtime_extension.py index 5207b0956941..375b99b0ad31 100644 --- a/tests/python/unittest/test_runtime_extension.py +++ b/tests/python/unittest/test_runtime_extension.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np @tvm.register_extension @@ -29,16 +30,16 @@ def _tvm_handle(self): def test_dltensor_compatible(): dtype = 'int64' - n = tvm.var('n') - Ab = tvm.decl_buffer((n,), dtype) - i = tvm.var('i') - ib = tvm.ir_builder.create() + n = te.var('n') + Ab = tvm.tir.decl_buffer((n,), dtype) + i = te.var('i') + ib = tvm.tir.ir_builder.create() A = ib.buffer_ptr(Ab) with ib.for_range(0, n - 1, "i") as i: A[i + 1] = A[i] + 1 stmt = ib.get() - fapi = tvm.ir_pass.MakeAPI(stmt, "arange", [Ab], 0, True) - fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) + fapi = tvm.tir.ir_pass.MakeAPI(stmt, "arange", [Ab], 0, True) + fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi) f = tvm.target.codegen.build_module(fapi, "stackvm") a = tvm.nd.array(np.zeros(10, dtype=dtype)) aview = MyTensorView(a) diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index da5bea1f19ff..ee2cd718e45f 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te import numpy as np import json from tvm import rpc @@ -22,9 +23,9 @@ def test_graph_simple(): n = 4 - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) node0 = {"op": "null", "name": "x", "inputs": []} node1 = {"op": "tvm_op", "name": "add", diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index aeb4809e6c86..658d9eb95ef9 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -16,6 +16,7 @@ # under the License. import os import tvm +from tvm import te import numpy as np import json from tvm import rpc @@ -24,9 +25,9 @@ def test_graph_simple(): n = 4 - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) node0 = {"op": "null", "name": "x", "inputs": []} node1 = {"op": "tvm_op", "name": "add", diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py index a718ed8342e6..8ca61c1920ba 100644 --- a/tests/python/unittest/test_runtime_heterogeneous.py +++ b/tests/python/unittest/test_runtime_heterogeneous.py @@ -20,6 +20,7 @@ import numpy as np import tvm +from tvm import te from tvm.contrib import graph_runtime, util import topi @@ -132,9 +133,9 @@ def check_device(device, target_device): shape = (4,) # Create module for add whose target is the device. - tensor_a = tvm.placeholder(shape, name="A") - tensor_b = tvm.placeholder(shape, name="B") - elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i) + tensor_a = te.placeholder(shape, name="A") + tensor_b = te.placeholder(shape, name="B") + elemwise_add = te.compute(shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add") target = topi.cpp.TEST_create_target(device) schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add]) @@ -144,13 +145,13 @@ def check_device(device, target_device): # Insert copy. Neither compute nor schedule is required for the copy # node. The compute will be performed at runtime which is just data # copy from the input to the output. - tensor_copy = tvm.placeholder(shape, name="__copy") + tensor_copy = te.placeholder(shape, name="__copy") # Create module for sub whose target is the host. - tensor_c = tvm.placeholder(shape, name="C") - elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i) + tensor_c = te.placeholder(shape, name="C") + elemwise_sub = te.compute(shape, lambda *i: tensor_copy(*i) - tensor_c(*i), name="elemwise_sub") - schedule_sub = tvm.create_schedule(elemwise_sub.op) + schedule_sub = te.create_schedule(elemwise_sub.op) lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c, elemwise_sub], name="elemwise_sub") @@ -321,17 +322,17 @@ def check_device(device, target_device): # Insert copy nodes for data transferring between add and sub nodes. # Transfers data from gpu to cpu. - copy_add_sub = tvm.placeholder(shape, name="__copy0") + copy_add_sub = te.placeholder(shape, name="__copy0") # Transfers data from cpu to gpu. 
- copy_sub_add = tvm.placeholder(shape, name="__copy1") + copy_sub_add = te.placeholder(shape, name="__copy1") # Create a module containing adds on the device. - tensor_a = tvm.placeholder(shape, name="A") - tensor_b = tvm.placeholder(shape, name="B") - tensor_d = tvm.placeholder(shape, name="D") - elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i) + tensor_a = te.placeholder(shape, name="A") + tensor_b = te.placeholder(shape, name="B") + tensor_d = te.placeholder(shape, name="D") + elemwise_add0 = te.compute(shape, lambda *i: tensor_a(*i) + tensor_b(*i), name="elemwise_add0") - elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i) + elemwise_add1 = te.compute(shape, lambda *i: copy_sub_add(*i) + tensor_d(*i), name="elemwise_add1") target = topi.cpp.TEST_create_target(device) add_schedule0 = topi.cpp.cuda.schedule_injective( @@ -345,10 +346,10 @@ def check_device(device, target_device): add_schedule1, [tensor_d, copy_sub_add, elemwise_add1], name="elemwise_add1") # Create module for sub whose target is the host. - tensor_c = tvm.placeholder(shape, name="C") - elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i) + tensor_c = te.placeholder(shape, name="C") + elemwise_sub = te.compute(shape, lambda *i: copy_add_sub(*i) - tensor_c(*i), name="elemwise_sub") - sub_schedule = tvm.create_schedule(elemwise_sub.op) + sub_schedule = te.create_schedule(elemwise_sub.op) lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c, elemwise_sub], name="elemwise_sub") diff --git a/tests/python/unittest/test_runtime_measure.py b/tests/python/unittest/test_runtime_measure.py index 7413a3732086..25361a191263 100644 --- a/tests/python/unittest/test_runtime_measure.py +++ b/tests/python/unittest/test_runtime_measure.py @@ -18,6 +18,7 @@ import ctypes import tvm +from tvm import te from tvm.contrib.util import tempdir @@ -32,8 +33,8 @@ def my_debug(filename): with open(filename, "a") as fout: fout.write("c") - X = tvm.compute((), lambda : tvm.call_packed("my_debug", filename)) - s = tvm.create_schedule(X.op) + X = te.compute((), lambda : tvm.tir.call_packed("my_debug", filename)) + s = te.create_schedule(X.op) func = tvm.build(s, [X]) x = tvm.nd.empty((), dtype="int32") diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index f6114dae6f1b..28fdb11c3de4 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm.contrib import graph_runtime, util from tvm import relay import tvm.micro as micro @@ -46,7 +47,7 @@ def relay_micro_build(func, dev_config, params=None): mod : tvm.runtime.Module graph runtime module for the target device """ - with tvm.build_config(disable_vectorize=True): + with tvm.target.build_config(disable_vectorize=True): graph, c_mod, params = relay.build(func, target="c", params=params) micro_mod = create_micro_mod(c_mod, dev_config) ctx = tvm.micro_dev(0) @@ -76,11 +77,11 @@ def test_add(): dtype = "float32" # Construct TVM expression. 
- tvm_shape = tvm.convert(shape) - A = tvm.placeholder(tvm_shape, name="A", dtype=dtype) - B = tvm.placeholder(tvm_shape, name="B", dtype=dtype) - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = tvm.create_schedule(C.op) + tvm_shape = tvm.runtime.convert(shape) + A = te.placeholder(tvm_shape, name="A", dtype=dtype) + B = te.placeholder(tvm_shape, name="B", dtype=dtype) + C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + s = te.create_schedule(C.op) func_name = "fadd" c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) @@ -105,12 +106,12 @@ def test_workspace_add(): dtype = "float32" # Construct TVM expression. - tvm_shape = tvm.convert(shape) - A = tvm.placeholder(tvm_shape, name="A", dtype=dtype) - B = tvm.placeholder(tvm_shape, name="B", dtype=dtype) - B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name="B") - C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name="C") - s = tvm.create_schedule(C.op) + tvm_shape = tvm.runtime.convert(shape) + A = te.placeholder(tvm_shape, name="A", dtype=dtype) + B = te.placeholder(tvm_shape, name="B", dtype=dtype) + B = te.compute(A.shape, lambda *i: A(*i) + 1, name="B") + C = te.compute(A.shape, lambda *i: B(*i) + 1, name="C") + s = te.create_schedule(C.op) func_name = "fadd_two_workspace" c_mod = tvm.build(s, [A, C], target="c", name=func_name) diff --git a/tests/python/unittest/test_runtime_module_export.py b/tests/python/unittest/test_runtime_module_export.py index ee82da65c8f4..35bafb4ba3c7 100644 --- a/tests/python/unittest/test_runtime_module_export.py +++ b/tests/python/unittest/test_runtime_module_export.py @@ -17,6 +17,7 @@ from tvm import relay from tvm.relay import testing import tvm +from tvm import te from tvm.contrib import util header_file_dir_path = util.tempdir() @@ -95,9 +96,9 @@ def verify_multi_dso_mod_export(obj_format): with relay.build_config(opt_level=3): _, resnet18_cpu_lib, _ = relay.build_module.build(resnet18_mod, "llvm", params=resnet18_params) - A = tvm.placeholder((1024,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((1024,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm", name="myadd") from tvm.contrib import util temp = util.tempdir() @@ -144,9 +145,9 @@ def verify_json_import_dso(obj_format): f.write(subgraph_json) # Get Json and module. 
- A = tvm.placeholder((1024,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((1024,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm", name="myadd") try: ext_lib = tvm.runtime.load_module(subgraph_path, "examplejson") @@ -179,9 +180,9 @@ def verify_multi_c_mod_export(): with relay.build_config(opt_level=3): _, resnet18_cpu_lib, _ = relay.build_module.build(resnet18_mod, "llvm", params=resnet18_params) - A = tvm.placeholder((1024,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder((1024,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "c", name="myadd") engine_module = generate_engine_module() from tvm.contrib import util diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py index 1cbc157a154c..e7771e3c6721 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm.contrib import cc, util import ctypes import os @@ -29,6 +30,7 @@ os.environ["TVM_USE_RUNTIME_LIB"] = "1" os.environ["TVM_FFI"] = "ctypes" import tvm +from tvm import te import numpy as np path_dso = sys.argv[1] dtype = sys.argv[2] @@ -46,17 +48,17 @@ def test_dso_module_load(): temp = util.tempdir() def save_object(names): - n = tvm.size_var('n') - Ab = tvm.decl_buffer((n, ), dtype) - i = tvm.var('i') + n = te.size_var('n') + Ab = tvm.tir.decl_buffer((n, ), dtype) + i = te.var('i') # for i in 0 to n-1: stmt = tvm.tir.For( i, 0, n - 1, 0, 0, tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1)) - fapi = tvm.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True) - fapi = tvm.ir_pass.LowerTVMBuiltin(fapi) + fapi = tvm.tir.ir_pass.MakeAPI(stmt, "ramp", [Ab], 0, True) + fapi = tvm.tir.ir_pass.LowerTVMBuiltin(fapi) m = tvm.target.codegen.build_module(fapi, "llvm") for name in names: m.save(name) @@ -88,15 +90,15 @@ def save_object(names): def test_device_module_dump(): # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) # create iter var and assign them tags. 
num_thread = 8 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(bx, tvm.thread_axis("blockIdx.x")) - s[B].bind(tx, tvm.thread_axis("threadIdx.x")) + s[B].bind(bx, te.thread_axis("blockIdx.x")) + s[B].bind(tx, te.thread_axis("threadIdx.x")) def check_device(device): ctx = tvm.context(device, 0) @@ -150,10 +152,10 @@ def test_combine_module_llvm(): """Test combine multiple module into one shared lib.""" # graph nn = 12 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) def check_llvm(): ctx = tvm.cpu(0) diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py index ed23a0bc9d9d..e3143794cc34 100644 --- a/tests/python/unittest/test_runtime_ndarray.py +++ b/tests/python/unittest/test_runtime_ndarray.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np def enabled_ctx_list(): @@ -55,10 +56,10 @@ def test_fp16_conversion(): n = 100 for (src, dst) in [('float32', 'float16'), ('float16', 'float32')]: - A = tvm.placeholder((n,), dtype=src) - B = tvm.compute((n,), lambda i: A[i].astype(dst)) + A = te.placeholder((n,), dtype=src) + B = te.compute((n,), lambda i: A[i].astype(dst)) - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) func = tvm.build(s, [A, B], 'llvm') x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50) diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py index 4f7377008c76..3570fe149608 100644 --- a/tests/python/unittest/test_runtime_packed_func.py +++ b/tests/python/unittest/test_runtime_packed_func.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te import tvm.testing import numpy as np @@ -32,12 +33,12 @@ def my_packed_func(*args): assert y == 10 def test_get_callback_with_node(): - x = tvm.convert(10) + x = tvm.runtime.convert(10) def test(y): assert y.handle != x.handle return y - f2 = tvm.convert(test) + f2 = tvm.runtime.convert(test) # register into global function table @tvm.register_func def my_callback_with_node(y, f): @@ -54,9 +55,9 @@ def my_callback_with_node(y, f): def test_return_func(): def addy(y): def add(x): - return tvm.convert(x + y) + return tvm.runtime.convert(x + y) return add - myf = tvm.convert(addy) + myf = tvm.runtime.convert(addy) f = myf(10) assert f(11).value == 21 @@ -67,7 +68,7 @@ def test_convert(): def myfunc(*args): assert(tuple(args) == targs) - f = tvm.convert(myfunc) + f = tvm.runtime.convert(myfunc) assert isinstance(f, tvm.runtime.PackedFunc) def test_byte_array(): @@ -76,15 +77,15 @@ def test_byte_array(): def myfunc(ss): assert ss == a - f = tvm.convert(myfunc) + f = tvm.runtime.convert(myfunc) f(a) def test_empty_array(): def myfunc(ss): assert tuple(ss) == () - x = tvm.convert(()) - tvm.convert(myfunc)(x) + x = tvm.runtime.convert(()) + tvm.runtime.convert(myfunc)(x) def test_ctx(): @@ -99,25 +100,25 @@ def test_ctx_func(ctx): def test_trace_default_action(): n = 2 - x = tvm.placeholder((n,n,n), name="X", dtype="float32") - y = tvm.compute(x.shape, lambda i, j, k: tvm.trace([i, j, k, x[i][j][k]])) - s = tvm.create_schedule(y.op) + x = te.placeholder((n,n,n), name="X", dtype="float32") + y = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([i, j, k, x[i][j][k]])) + s = te.create_schedule(y.op) f = tvm.build(s, [x, y], target="llvm") xnd = tvm.nd.array(np.ones((n,n,n), dtype=x.dtype)) ynd = tvm.nd.array(np.zeros((n,n,n), dtype=y.dtype)) f(xnd, ynd) def test_trace_expr_assign(): - @tvm.register_func("tvm.trace_callback2") + @tvm.register_func("tvm.tir.trace_callback2") def trace_buffer(x): return def check_assign(dtype): n = 4 - x = tvm.placeholder((n,n,n), name="X", dtype=dtype) - y = tvm.compute(x.shape, lambda i, j, k: tvm.trace([x[i][j][k]], "tvm.trace_callback2")) - z = tvm.compute(x.shape, lambda i, j, k: tvm.trace([y[i][j][k]], "tvm.trace_callback2")) - s = tvm.create_schedule(z.op) + x = te.placeholder((n,n,n), name="X", dtype=dtype) + y = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([x[i][j][k]], "tvm.tir.trace_callback2")) + z = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([y[i][j][k]], "tvm.tir.trace_callback2")) + s = te.create_schedule(z.op) f = tvm.build(s, [x, y, z], "llvm") xnd = tvm.nd.array(np.ones((n,n,n), dtype=x.dtype)) @@ -133,17 +134,17 @@ def check_assign(dtype): check_assign(t) def test_trace_expr_sum_generated(): - @tvm.register_func("tvm.trace_callback3") + @tvm.register_func("tvm.tir.trace_callback3") def trace_buffer(x): return def check_expr_sum(dtype): n = 4 - a = tvm.placeholder((n,n,n), name="a", dtype=dtype) - b = tvm.placeholder((n,n,n), name="b", dtype=dtype) - c = tvm.compute(a.shape, lambda i, j, k: tvm.trace([a[i][j][k]],"tvm.trace_callback3") - + tvm.trace([b[i][j][k]],"tvm.trace_callback3")) - s = tvm.create_schedule(c.op) + a = te.placeholder((n,n,n), name="a", dtype=dtype) + b = te.placeholder((n,n,n), name="b", dtype=dtype) + c = te.compute(a.shape, lambda i, j, k: tvm.tir.trace([a[i][j][k]],"tvm.tir.trace_callback3") + + tvm.tir.trace([b[i][j][k]],"tvm.tir.trace_callback3")) + s = te.create_schedule(c.op) f = tvm.build(s, [a, b, c]) xnd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=a.dtype))) ynd = 
tvm.nd.array(np.array(np.ones((n,n,n), dtype=b.dtype))) @@ -155,22 +156,22 @@ def check_expr_sum(dtype): check_expr_sum(t) def test_trace_expr_sum_args(): - @tvm.register_func("tvm.trace_silent") + @tvm.register_func("tvm.tir.trace_silent") def silent(*args): return def check_expr_sum(dtype): n = 4 - a = tvm.placeholder((n,n,n), name="a", dtype=dtype) - b = tvm.placeholder((n,n,n), name="b", dtype=dtype) - e = tvm.placeholder((n,n,n), name="e", dtype=dtype) - d = tvm.placeholder((n,n,n), name="d", dtype=dtype) - - c = tvm.compute(a.shape, lambda i, j, k: tvm.trace([i, j, k, a[i][j][k]], "tvm.trace_silent") - + tvm.trace([i, j, k, b[i][j][k]], "tvm.trace_silent") - + tvm.trace([i, j, k, d[i][j][k]], "tvm.trace_silent") - + tvm.trace([i, j, k, e[i][j][k]], "tvm.trace_silent")) - s = tvm.create_schedule(c.op) + a = te.placeholder((n,n,n), name="a", dtype=dtype) + b = te.placeholder((n,n,n), name="b", dtype=dtype) + e = te.placeholder((n,n,n), name="e", dtype=dtype) + d = te.placeholder((n,n,n), name="d", dtype=dtype) + + c = te.compute(a.shape, lambda i, j, k: tvm.tir.trace([i, j, k, a[i][j][k]], "tvm.tir.trace_silent") + + tvm.tir.trace([i, j, k, b[i][j][k]], "tvm.tir.trace_silent") + + tvm.tir.trace([i, j, k, d[i][j][k]], "tvm.tir.trace_silent") + + tvm.tir.trace([i, j, k, e[i][j][k]], "tvm.tir.trace_silent")) + s = te.create_schedule(c.op) f = tvm.build(s, [a, b, d, e, c]) a_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=a.dtype))) b_nd = tvm.nd.array(np.array(np.ones((n,n,n), dtype=b.dtype))) @@ -187,17 +188,17 @@ def check_expr_sum(dtype): check_expr_sum(t) def test_trace_expr_sum_custom(): - @tvm.register_func("tvm.trace_callback4") + @tvm.register_func("tvm.tir.trace_callback4") def trace_buffer(x): return def check_expr_sum_custom(dtype): n = 4 - a = tvm.placeholder((n,n), name="a", dtype=dtype) - b = tvm.placeholder((n,n), name="b", dtype=dtype) - c = tvm.compute(a.shape, lambda i,j: tvm.trace([a[i][j]], "tvm.trace_callback4") - + tvm.trace([b[i][j]], "tvm.trace_callback4")) - s = tvm.create_schedule(c.op) + a = te.placeholder((n,n), name="a", dtype=dtype) + b = te.placeholder((n,n), name="b", dtype=dtype) + c = te.compute(a.shape, lambda i,j: tvm.tir.trace([a[i][j]], "tvm.tir.trace_callback4") + + tvm.tir.trace([b[i][j]], "tvm.tir.trace_callback4")) + s = te.create_schedule(c.op) f = tvm.build(s, [a, b, c]) npa = np.array([[1,0,0,0], [0,1,0,0],[0,0,1,0],[0,0,0,1]], dtype=a.dtype) npb = np.array([[1,0,0,0], [0,1,0,0],[0,0,1,0],[0,0,0,1]], dtype=a.dtype) @@ -211,20 +212,20 @@ def check_expr_sum_custom(dtype): check_expr_sum_custom(t) def test_trace_can_change_traced_value_int(): - @tvm.register_func("tvm.trace_change_int_first") + @tvm.register_func("tvm.tir.trace_change_int_first") def trace_buffer(x): return 13 - @tvm.register_func("tvm.trace_change_int_second") + @tvm.register_func("tvm.tir.trace_change_int_second") def trace_buffer(x): return 14 def check_assign(dtype): n = 4 - x = tvm.placeholder((n,), name="X", dtype=dtype) - y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.trace_change_int_first")) - z = tvm.compute(x.shape, lambda i: tvm.trace([y[i]], "tvm.trace_change_int_second")) - s = tvm.create_schedule(z.op) + x = te.placeholder((n,), name="X", dtype=dtype) + y = te.compute(x.shape, lambda i: tvm.tir.trace([x[i]], "tvm.tir.trace_change_int_first")) + z = te.compute(x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_int_second")) + s = te.create_schedule(z.op) f = tvm.build(s, [x, y, z], "llvm") xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) @@ 
-240,20 +241,20 @@ def check_assign(dtype): check_assign(t) def test_trace_can_change_traced_value_float(): - @tvm.register_func("tvm.trace_change_float_first") + @tvm.register_func("tvm.tir.trace_change_float_first") def trace_buffer(x): return 13.0 - @tvm.register_func("tvm.trace_change_float_second") + @tvm.register_func("tvm.tir.trace_change_float_second") def trace_buffer(x): return 14.0 def check_assign(dtype): n = 4 - x = tvm.placeholder((n,), name="X", dtype=dtype) - y = tvm.compute(x.shape, lambda i: tvm.trace([x[i]], "tvm.trace_change_float_first")) - z = tvm.compute(x.shape, lambda i: tvm.trace([y[i]], "tvm.trace_change_float_second")) - s = tvm.create_schedule(z.op) + x = te.placeholder((n,), name="X", dtype=dtype) + y = te.compute(x.shape, lambda i: tvm.tir.trace([x[i]], "tvm.tir.trace_change_float_first")) + z = te.compute(x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_float_second")) + s = te.create_schedule(z.op) f = tvm.build(s, [x, y, z], "llvm") xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index 75169da9a2ce..1d9b79eca875 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import tvm.testing import os import logging @@ -34,9 +35,9 @@ def test_bigendian_rpc(): if host is None: return def verify_rpc(remote, target, shape, dtype): - A = tvm.placeholder(shape, dtype=dtype) - B = tvm.compute(A.shape, lambda i: A[i]+tvm.const(1, A.dtype)) - s = tvm.create_schedule(B.op) + A = te.placeholder(shape, dtype=dtype) + B = te.compute(A.shape, lambda i: A[i]+tvm.tir.const(1, A.dtype)) + s = te.create_schedule(B.op) f = tvm.build(s, [A, B], target, name="myadd") ctx = remote.cpu(0) @@ -116,10 +117,10 @@ def test_rpc_remote_module(): server = rpc.Server("localhost") client = rpc.connect(server.host, server.port) # graph - n = tvm.convert(1024) - A = tvm.placeholder((n,), name='A') - B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') - s = tvm.create_schedule(B.op) + n = tvm.runtime.convert(1024) + A = te.placeholder((n,), name='A') + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + s = te.create_schedule(B.op) def check_remote(remote): if not tvm.runtime.enabled("llvm"): @@ -155,10 +156,10 @@ def check_remote_link_cl(remote): return temp = util.tempdir() ctx = remote.cl(0) - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd") # Option 1: save modules separately and rely on remote compiler path_o = temp.relpath("myadd.o") diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py index 849a9ef3f823..064b733de7bd 100644 --- a/tests/python/unittest/test_runtime_vm_profiler.py +++ b/tests/python/unittest/test_runtime_vm_profiler.py @@ -17,6 +17,7 @@ import numpy as np import tvm +from tvm import te from tvm.runtime import profiler_vm from tvm import relay from tvm.relay.testing import resnet diff --git a/tests/python/unittest/test_schedule_bound_inference.py b/tests/python/unittest/test_schedule_bound_inference.py index 
9c3d1df17f2b..484aa503e066 100644 --- a/tests/python/unittest/test_schedule_bound_inference.py +++ b/tests/python/unittest/test_schedule_bound_inference.py @@ -15,81 +15,82 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te def test_bound1(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule([A2.op]) + s = te.create_schedule([A2.op]) xo, xi = s[A2].split(s[A2].op.axis[0], 8) s[A1].compute_at(s[A2], xo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert(bounds[A1.op.axis[0]].extent.value == 8) def test_bound2(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + s = te.create_schedule(A2.op) xo, yo, xi, yi = s[A2].tile(A2.op.axis[0], A2.op.axis[1], 8, 8) # test normalize not affecting schedule _ = s.normalize() s[A1].compute_at(s[A2], yo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert(bounds[A1.op.axis[0]].extent.value == 8) assert(bounds[A1.op.axis[1]].extent.value == 8) def test_bound3(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + s = te.create_schedule(A2.op) s[A1].set_scope("shared") xo, xi = s[A2].split(A2.op.axis[0], 32) xi0, xi1 = s[A2].split(xi, nparts=16) - s[A2].bind(xi0, tvm.thread_axis("threadIdx.x")) + s[A2].bind(xi0, te.thread_axis("threadIdx.x")) yo, yi = s[A2].split(A2.op.axis[1], 16) # test normalize not affecting schedule _ = s.normalize() s[A2].reorder(xo, xi0, yo, xi1, yi) s[A1].compute_at(s[A2], yo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert(bounds[A1.op.axis[0]].extent.value==32) assert(bounds[A1.op.axis[1]].extent.value==16) def test_bound_split_divisible(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((8 * m, l), name='A') - B = tvm.compute((8 * m, l), lambda i, j: A[i, j], name='B') - s = tvm.create_schedule(B.op) + m = te.var('m') + l = te.var('l') + A = te.placeholder((8 * m, l), name='A') + B = te.compute((8 * m, l), lambda i, j: A[i, j], name='B') + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], 8) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert bounds[xo].extent == m assert bounds[xi].extent.value == 8 def 
test_bound_tile_divisible(): - m = tvm.var('m') - l = tvm.var('l') + m = te.var('m') + l = te.var('l') shape = (8 * m, 32 * l) - A = tvm.placeholder(shape, name='A') - B = tvm.compute(shape, lambda i, j: A[i, j], name='B') - s = tvm.create_schedule(B.op) + A = te.placeholder(shape, name='A') + B = te.compute(shape, lambda i, j: A[i, j], name='B') + s = te.create_schedule(B.op) xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], 8, 32) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert bounds[xo].extent == m assert bounds[xi].extent.value == 8 @@ -97,165 +98,165 @@ def test_bound_tile_divisible(): assert bounds[yi].extent.value == 32 def test_bound_fusesplit1(): - m = tvm.var('m') - l = tvm.var('l') - split1 = tvm.var('s') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - - s = tvm.create_schedule(A2.op) + m = te.var('m') + l = te.var('l') + split1 = te.var('s') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + + s = te.create_schedule(A2.op) fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1]) xo, xi = s[A2].split(fused_axes, split1) s[A1].compute_at(s[A2], xo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - idxdiv = tvm.indexdiv - assert(tvm.ir_pass.Simplify( + idxdiv = tvm.tir.indexdiv + assert(tvm.tir.ir_pass.Simplify( bounds[A1.op.axis[0]].min - idxdiv(xo * split1, l)).value == 0) expected_extent = (idxdiv((xo + 1) * split1 - 1, l) - idxdiv(xo * split1, l) + 1) for i in range(1, 6): for j in range(1, 6): for k in range(1, 6): - vars = tvm.convert({split1: tvm.const(i, "int32"), l: tvm.const(j, "int32"), xo.var: tvm.const(k, "int32")}) - comp_ext = tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[0]].extent, vars)).value - exp_ext = tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(expected_extent, vars)).value + vars = tvm.runtime.convert({split1: tvm.tir.const(i, "int32"), l: tvm.tir.const(j, "int32"), xo.var: tvm.tir.const(k, "int32")}) + comp_ext = tvm.tir.ir_pass.Simplify(tvm.tir.ir_pass.Substitute(bounds[A1.op.axis[0]].extent, vars)).value + exp_ext = tvm.tir.ir_pass.Simplify(tvm.tir.ir_pass.Substitute(expected_extent, vars)).value assert(comp_ext == exp_ext) - assert(tvm.ir_pass.Simplify(bounds[A1.op.axis[1]].extent - l).value == 0) + assert(tvm.tir.ir_pass.Simplify(bounds[A1.op.axis[1]].extent - l).value == 0) def test_bound_fusesplit2(): - m = tvm.var("m") - l = tvm.convert(6) - split = tvm.convert(3) - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - - s = tvm.create_schedule(A2.op) + m = te.var("m") + l = tvm.runtime.convert(6) + split = tvm.runtime.convert(3) + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + + s = te.create_schedule(A2.op) fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1]) xo, xi = s[A2].split(fused_axes, split) s[A1].compute_at(s[A2], xo) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) - vars = tvm.convert({xo.var: tvm.const(5, "int32")}) - 
assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[0]].min, vars)).value == 2) - assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[1]].min, vars)).value == 3) - assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[0]].extent, vars)).value == 1) - assert(tvm.ir_pass.Simplify(tvm.ir_pass.Substitute(bounds[A1.op.axis[1]].extent, vars)).value == 3) + vars = tvm.runtime.convert({xo.var: tvm.tir.const(5, "int32")}) + assert(tvm.tir.ir_pass.Simplify(tvm.tir.ir_pass.Substitute(bounds[A1.op.axis[0]].min, vars)).value == 2) + assert(tvm.tir.ir_pass.Simplify(tvm.tir.ir_pass.Substitute(bounds[A1.op.axis[1]].min, vars)).value == 3) + assert(tvm.tir.ir_pass.Simplify(tvm.tir.ir_pass.Substitute(bounds[A1.op.axis[0]].extent, vars)).value == 1) + assert(tvm.tir.ir_pass.Simplify(tvm.tir.ir_pass.Substitute(bounds[A1.op.axis[1]].extent, vars)).value == 3) def test_bound_warp(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1') - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1') + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2') - s = tvm.create_schedule(A2.op) + s = te.create_schedule(A2.op) s[A1].set_scope("warp") xo, xi = s[A2].split(A2.op.axis[0], 32) xi0, xi1 = s[A2].split(xi, factor=16) - tx = tvm.thread_axis("threadIdx.x") + tx = te.thread_axis("threadIdx.x") s[A2].bind(xi1, tx) - s[A2].bind(xi0, tvm.thread_axis("threadIdx.y")) + s[A2].bind(xi0, te.thread_axis("threadIdx.y")) y = s[A2].op.axis[1] s[A1].compute_at(s[A2], y) xo, xi = s[A1].split(s[A1].op.axis[0], factor=16) s[A1].bind(xi, tx) - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert(bounds[A1.op.axis[0]].extent.value==16) def test_bound_scan(): - m = tvm.var("m") - n = tvm.var("n") - X = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") - s_state = tvm.placeholder((m, n)) - s_init = tvm.compute((1, n), lambda _, i: X[0, i]) - s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) - s_scan = tvm.scan(s_init, s_update, s_state) + m = te.var("m") + n = te.var("n") + X = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") + s_state = te.placeholder((m, n)) + s_init = te.compute((1, n), lambda _, i: X[0, i]) + s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) + s_scan = tvm.te.scan(s_init, s_update, s_state) assert tuple(s_scan.shape) == (m, n) - s = tvm.create_schedule(s_scan.op) + s = te.create_schedule(s_scan.op) XX = s.cache_read(X, "local", s_update) xo, xi = s[s_update].split(s_update.op.axis[1], factor=4) s[XX].compute_at(s[s_update], xo) s = s.normalize() - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) assert bounds[XX.op.axis[1]].extent.value == 4 def test_bound_conv1d(): - n = tvm.var('n') - A = tvm.compute((n+2), lambda i: 1, name='A') + n = te.var('n') + A = te.compute((n+2), lambda i: 1, name='A') def computeB(ii): i = ii + 1 return A[i-1] + A[i] + A[i+1] - B = tvm.compute(n, computeB, name='B') - s = tvm.create_schedule(B.op) + B = te.compute(n, computeB, name='B') + s = te.create_schedule(B.op) s[A].compute_at(s[B], B.op.axis[0]) s = s.normalize() - bounds = 
tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert(bounds[A.op.axis[0]].extent.value == 3) def test_bound_blur(): - n = tvm.convert(12) - A = tvm.compute((n, n), lambda i, j: 1, name='A') + n = tvm.runtime.convert(12) + A = te.compute((n, n), lambda i, j: 1, name='A') def computeB(ii, jj): # set the correct center i = ii + 1 j = jj + 1 return A[i][j] + A[i-1][j] + A[i+1][j] + A[i][j+1] + A[i][j-1] - B = tvm.compute((n-2, n-2), computeB, name='B') - s = tvm.create_schedule(B.op) + B = te.compute((n-2, n-2), computeB, name='B') + s = te.create_schedule(B.op) s[A].compute_at(s[B], B.op.axis[1]) s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert(bounds[A.op.axis[0]].extent.value == 3) assert(bounds[A.op.axis[1]].extent.value == 3) def test_bound_rfactor(): - n = tvm.var('n') - A = tvm.placeholder((n,), name='A') - k = tvm.reduce_axis((0, n)) - B = tvm.compute((1,), lambda i: tvm.sum(A[k], axis=k, where=(i>1)), name='B') + n = te.var('n') + A = te.placeholder((n,), name='A') + k = te.reduce_axis((0, n)) + B = te.compute((1,), lambda i: te.sum(A[k], axis=k, where=(i>1)), name='B') # schedule - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) kf, ki = s[B].split(k, nparts=4) BF = s.rfactor(B, kf) s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert(bounds[BF.op.axis[0]].extent.value == 4) assert(bounds[BF.op.axis[1]].extent.value == 1) def test_bound_group_schedule(): - m = tvm.var("m") - n = tvm.var("n") - x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") - x1 = tvm.compute(x.shape, lambda *i: x(*i) + 1, name="x1") - x2 = tvm.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") - s = tvm.create_schedule(x2.op) + m = te.var("m") + n = te.var("n") + x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") + x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1") + x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") + s = te.create_schedule(x2.op) g = s.create_group(outputs=x1, inputs=x, include_inputs=True) g.compute_at(s[x2], x2.op.axis[0]) assert s[x1].group == g assert s[x].group == g s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert bounds[x.op.axis[0]].extent.value == 1 assert bounds[x.op.axis[1]].extent == n def test_bound_nest_group(): - m = tvm.var("m") - n = tvm.var("n") - x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") - x1 = tvm.compute(x.shape, lambda *i: x(*i) + 1, name="x1") - x2 = tvm.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") - s = tvm.create_schedule(x2.op) + m = te.var("m") + n = te.var("n") + x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") + x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1") + x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") + s = te.create_schedule(x2.op) g1 = s.create_group(outputs=x, inputs=x, include_inputs=True) g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True) assert s[x].group == g1 @@ -263,7 +264,7 @@ def test_bound_nest_group(): g2.compute_at(s[x2], x2.op.axis[0]) g1.compute_at(s[x1], s[x1].op.axis[1]) s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert bounds[x.op.axis[0]].extent.value == 1 assert bounds[x.op.axis[1]].extent.value == 1 assert bounds[x1.op.axis[0]].extent.value == 1 @@ -271,18 +272,18 @@ def test_bound_nest_group(): def test_bound_nest_thread(): - m = 
tvm.var('m') - A = tvm.placeholder((m), name='A') - A1 = tvm.compute((m,), lambda i: A[i], name='A1') - A2 = tvm.compute((m,), lambda i: A1[i] + 2, name='A2') - A3 = tvm.compute((m,), lambda i: A2[i] + 3, name='A3') + m = te.var('m') + A = te.placeholder((m), name='A') + A1 = te.compute((m,), lambda i: A[i], name='A1') + A2 = te.compute((m,), lambda i: A1[i] + 2, name='A2') + A3 = te.compute((m,), lambda i: A2[i] + 3, name='A3') - s = tvm.create_schedule(A3.op) + s = te.create_schedule(A3.op) s[A2].set_scope("shared") s[A1].set_scope("local") - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") bx, tx = s[A3].split(A3.op.axis[0], factor=32) s[A3].bind(bx, block_x) s[A3].bind(tx, thread_x) @@ -291,31 +292,31 @@ def test_bound_nest_thread(): s[A2].bind(xi, thread_x) s[A1].compute_at(s[A3], tx) s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert(bounds[A1.op.axis[0]].extent.value==1) assert(bounds[A2.op.axis[0]].extent.value==32) assert(bounds[A3.op.axis[0]].extent == m) def test_gemm_bound(): nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n, n), name='A') - B = tvm.placeholder((n, n), name='B') - k = tvm.reduce_axis((0, n), name='k') - C = tvm.compute( + n = tvm.runtime.convert(nn) + A = te.placeholder((n, n), name='A') + B = te.placeholder((n, n), name='B') + k = te.reduce_axis((0, n), name='k') + C = te.compute( (n, n), - lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), + lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name='CC') # schedule - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) xtile, ytile = 32, 32 scale = 8 num_thread = 8 block_factor = scale * num_thread - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") - block_y = tvm.thread_axis("blockIdx.y") - thread_y = tvm.thread_axis("threadIdx.y") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") + block_y = te.thread_axis("blockIdx.y") + thread_y = te.thread_axis("threadIdx.y") CC = s.cache_write(C, "local") AA = s.cache_read(A, "shared", [CC]) @@ -347,7 +348,7 @@ def test_gemm_bound(): s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) assert(bounds[BB.op.axis[0]].extent.value==64) assert(bounds[AA.op.axis[0]].extent.value==64) assert(bounds[CC.op.axis[0]].extent.value == 8) @@ -356,54 +357,54 @@ def test_gemm_bound(): def test_bound_tensor_compute_op(): def intrin_test(): - m1 = tvm.var("m1") - n1 = tvm.var("n1") - a = tvm.placeholder((m1, n1), name='a') - c = tvm.compute((1, n1), lambda i, j : a[0, j] + a[1, j] + a[2, j], name='c') + m1 = te.var("m1") + n1 = te.var("n1") + a = te.placeholder((m1, n1), name='a') + c = te.compute((1, n1), lambda i, j : a[0, j] + a[1, j] + a[2, j], name='c') - Ab = tvm.decl_buffer(a.shape, name="Abuf", offset_factor=1) - Cb = tvm.decl_buffer(c.shape, name="Cbuf", offset_factor=1) + Ab = tvm.tir.decl_buffer(a.shape, name="Abuf", offset_factor=1) + Cb = tvm.tir.decl_buffer(c.shape, name="Cbuf", offset_factor=1) def intrin_func(ins, outs): aa = ins[0] cc = outs[0] def _body(): - ib = tvm.ir_builder.create() - ib.emit(tvm.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r"))) + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r"))) return ib.get() return _body() - 
with tvm.build_config(offset_factor=1): - return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a : Ab, c : Cb}) + with tvm.target.build_config(offset_factor=1): + return te.decl_tensor_intrin(c.op, intrin_func, binds={a : Ab, c : Cb}) test_func = intrin_test() - A = tvm.placeholder((20,20), name='A') - B = tvm.compute(A.shape, lambda i,j : A[i,j], name='B') - C = tvm.compute((10, 20), lambda i : test_func(B[i:10, 0:20]), name='C') - s = tvm.create_schedule(C.op) - bounds = tvm.schedule.InferBound(s) + A = te.placeholder((20,20), name='A') + B = te.compute(A.shape, lambda i,j : A[i,j], name='B') + C = te.compute((10, 20), lambda i : test_func(B[i:10, 0:20]), name='C') + s = te.create_schedule(C.op) + bounds = tvm.te.schedule.InferBound(s) assert isinstance(bounds, tvm.container.Map) assert(bounds[B.op.axis[0]].extent.value == 10) def test_bound_simplification_failure(): # Check that the bounds are not expanded - A = tvm.compute((2,), lambda j: j, "A") + A = te.compute((2,), lambda j: j, "A") def _check(B, A=A): - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s = s.normalize() - bounds = tvm.schedule.InferBound(s) + bounds = tvm.te.schedule.InferBound(s) stmt = tvm.lower(s, [B, A], simple_mode=True) if not bounds[A.op.axis[0]].extent.value <= 2: print(stmt) assert bounds[A.op.axis[0]].extent.value <= 2 - tdiv = tvm.truncdiv + tdiv = tvm.tir.truncdiv # These are hard to simplify, moreover we don't simplify them - _check(tvm.compute((10,), lambda i: A[tvm.min(3*i, 4*i) + tvm.min(-3*i, -2*i)])) - _check(tvm.compute((10,), lambda i: A[tvm.min(3*i, 4*i) + tvm.max(-3*i, -4*i)])) - _check(tvm.compute((10,), lambda i: A[-2*tdiv(i,2) - tvm.min(i, 0-i)])) - _check(tvm.compute((10,), lambda i: A[i + (0 - i)])) + _check(te.compute((10,), lambda i: A[tvm.te.min(3*i, 4*i) + tvm.te.min(-3*i, -2*i)])) + _check(te.compute((10,), lambda i: A[tvm.te.min(3*i, 4*i) + tvm.te.max(-3*i, -4*i)])) + _check(te.compute((10,), lambda i: A[-2*tdiv(i,2) - tvm.te.min(i, 0-i)])) + _check(te.compute((10,), lambda i: A[i + (0 - i)])) # This would cause out of bounds, but we nevertheless include it - _check(tvm.compute((10,), lambda i: A[i])) + _check(te.compute((10,), lambda i: A[i])) if __name__ == "__main__": test_bound_nest_thread() diff --git a/tests/python/unittest/test_schedule_graph.py b/tests/python/unittest/test_schedule_graph.py index d77c1d470aba..d6d38e5f05c9 100644 --- a/tests/python/unittest/test_schedule_graph.py +++ b/tests/python/unittest/test_schedule_graph.py @@ -15,96 +15,97 @@ # specific language governing permissions and limitations # under the License. 
import tvm +from tvm import te def test_scan(): - m = tvm.var("m") - n = tvm.var("n") - x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x") - s_state = tvm.placeholder((m, n)) - s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init") - x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans") - s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1") - s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update") - s_scan = tvm.scan(s_init, s_update, s_state) + m = te.var("m") + n = te.var("n") + x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") + s_state = te.placeholder((m, n)) + s_init = te.compute((1, n), lambda _, i: x[0, i], name="s_init") + x_trans = te.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans") + s_up1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1") + s_update = te.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update") + s_scan = tvm.te.scan(s_init, s_update, s_state) def test_getbody(): - body = tvm.schedule.ScanGetBody(s_scan.op) + body = tvm.te.schedule.ScanGetBody(s_scan.op) assert set(body) == set([s_scan.op, s_update.op, s_up1.op]) def test_attach_path(): - s = tvm.create_schedule(s_scan.op) + s = te.create_schedule(s_scan.op) s[x_trans].compute_at(s[s_update], s_update.op.axis[0]) - apath = tvm.schedule.CreateAttachPath(s) + apath = tvm.te.schedule.CreateAttachPath(s) assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis])) assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis])) def test_fix_pt(): - body = tvm.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body) + body = tvm.te.schedule.ScanGetBody(s_scan.op) + fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op, body) assert(fxpt[s_scan.spatial_axis_[0]].value != 0) def test_scan_fix_point(): - m = tvm.var("m") - n = tvm.var("n") - l = tvm.var("l") - x = tvm.compute((l, m, n), lambda *i: tvm.const(1, "float32"), name="x") - s_state = tvm.placeholder((l, m, n)) - s_init = tvm.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init") + m = te.var("m") + n = te.var("n") + l = te.var("l") + x = te.compute((l, m, n), lambda *i: tvm.tir.const(1, "float32"), name="x") + s_state = te.placeholder((l, m, n)) + s_init = te.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init") def test_scan0(): - s_update = tvm.compute((l, m, n), + s_update = te.compute((l, m, n), lambda t, i, j: x[t, j, i] + s_state[t-1, i, j], name="update") - s_scan = tvm.scan(s_init, s_update, s_state) - body = tvm.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body) + s_scan = tvm.te.scan(s_init, s_update, s_state) + body = tvm.te.schedule.ScanGetBody(s_scan.op) + fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op, body) assert(fxpt[s_scan.op.spatial_axis_[0]].value == 1) assert(fxpt[s_scan.op.spatial_axis_[1]].value == 1) def test_scan1(): - s_update = tvm.compute((l, m, n), + s_update = te.compute((l, m, n), lambda t, i, j: x[t, j, i] + s_state[t-1, j, i], name="update") - s_scan = tvm.scan(s_init, s_update, s_state) - body = tvm.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body) + s_scan = tvm.te.scan(s_init, s_update, s_state) + body = tvm.te.schedule.ScanGetBody(s_scan.op) + fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op, body) assert(fxpt[s_scan.op.spatial_axis_[0]].value == 0) 
assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0) def test_scan3_not_exact_reach(): - s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, i, j], name="h1") - s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, 10] * 2, name="h1") - s_update = tvm.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update") - s_scan = tvm.scan(s_init, s_update, s_state) - body = tvm.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op) + s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t-1, i, j], name="h1") + s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t-1, i, 10] * 2, name="h1") + s_update = te.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update") + s_scan = tvm.te.scan(s_init, s_update, s_state) + body = tvm.te.schedule.ScanGetBody(s_scan.op) + fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) assert(fxpt[s_scan.op.spatial_axis_[0]].value == 1) assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0) def test_scan4_reach_other(): - s_h1 = tvm.compute((l, n, m), lambda t, j, i: s_state[t-1, j, j], name="h1") - s_h2 = tvm.compute((l, m, n), lambda t, i, j: s_state[t-1, i, j] * 2, name="h1") - s_update = tvm.compute((l, m, n), + s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t-1, j, j], name="h1") + s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t-1, i, j] * 2, name="h1") + s_update = te.compute((l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update") - s_scan = tvm.scan(s_init, s_update, s_state) - fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op) + s_scan = tvm.te.scan(s_init, s_update, s_state) + fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) assert(fxpt[s_scan.op.spatial_axis_[0]].value == 0) assert(fxpt[s_scan.op.spatial_axis_[1]].value == 0) def test_scan5_multi_output(): - m = tvm.var("m") - n = tvm.var("n") - x1 = tvm.placeholder((m, n)) - s1 = tvm.placeholder((m, n)) - x2 = tvm.placeholder((m, n)) - s2 = tvm.placeholder((m, n)) - s1_init = tvm.compute((1, n), lambda _, i: x1[0, i]) - s2_init = tvm.compute((1, n), lambda _, i: x2[0, i]) - s1_update = tvm.compute((m, n), lambda t, i: s1[t-1, i] + x1[t, i]) - s2_update = tvm.compute((m, n), lambda t, i: x2[t, i] + s2[t-1,i]) - r0, r1 = tvm.scan([s1_init, s2_init], + m = te.var("m") + n = te.var("n") + x1 = te.placeholder((m, n)) + s1 = te.placeholder((m, n)) + x2 = te.placeholder((m, n)) + s2 = te.placeholder((m, n)) + s1_init = te.compute((1, n), lambda _, i: x1[0, i]) + s2_init = te.compute((1, n), lambda _, i: x2[0, i]) + s1_update = te.compute((m, n), lambda t, i: s1[t-1, i] + x1[t, i]) + s2_update = te.compute((m, n), lambda t, i: x2[t, i] + s2[t-1,i]) + r0, r1 = tvm.te.scan([s1_init, s2_init], [s1_update, s2_update], [s1, s2]) - body = tvm.schedule.ScanGetBody(r0.op) - fxpt = tvm.schedule.ScanFixPointAnalysis(r0.op) + body = tvm.te.schedule.ScanGetBody(r0.op) + fxpt = tvm.te.schedule.ScanFixPointAnalysis(r0.op) assert(fxpt[r1.op.spatial_axis_[0]].value == 1) test_scan0() @@ -114,17 +115,17 @@ def test_scan5_multi_output(): test_scan5_multi_output() def test_create_read_graph(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') - A1 = tvm.compute((m, l), lambda i, j: A[i, j]) - A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3) + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') + A1 = te.compute((m, l), lambda i, j: A[i, j]) + A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3) - g = tvm.schedule.CreateReadGraph([A2.op]) + g = 

 def test_create_read_graph():
-    m = tvm.var('m')
-    l = tvm.var('l')
-    A = tvm.placeholder((m, l), name='A')
-    A1 = tvm.compute((m, l), lambda i, j: A[i, j])
-    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3)
+    m = te.var('m')
+    l = te.var('l')
+    A = te.placeholder((m, l), name='A')
+    A1 = te.compute((m, l), lambda i, j: A[i, j])
+    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3)

-    g = tvm.schedule.CreateReadGraph([A2.op])
+    g = tvm.te.schedule.CreateReadGraph([A2.op])

     assert g[A2.op][0] == A1
     assert g[A1.op][0] == A
-    post_order = tvm.schedule.PostDFSOrder([A2.op], g)
+    post_order = tvm.te.schedule.PostDFSOrder([A2.op], g)
     assert(post_order[0] == A.op)
     assert(post_order[1] == A1.op)
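The two helpers exercised above are introspection utilities: CreateReadGraph maps each operation to the tensors it reads, and PostDFSOrder returns a producers-before-consumers ordering. A minimal sketch under the same te API assumptions:

import tvm
from tvm import te

# Sketch: a three-stage chain and its read graph.
m = te.var("m")
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] + 1, name="B")
C = te.compute((m,), lambda i: B[i] * 2, name="C")
g = tvm.te.schedule.CreateReadGraph([C.op])
assert g[C.op][0] == B                 # C reads B
order = tvm.te.schedule.PostDFSOrder([C.op], g)
assert order[0] == A.op                # producers come first
assert order[-1] == C.op               # the root comes last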
diff --git a/tests/python/unittest/test_schedule_lstm.py b/tests/python/unittest/test_schedule_lstm.py
index 21cf8e81badd..23c748688137 100644
--- a/tests/python/unittest/test_schedule_lstm.py
+++ b/tests/python/unittest/test_schedule_lstm.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te

 def test_lstm_cell_inline():
     num_step = 128
@@ -22,52 +23,52 @@ def test_lstm_cell_inline():
     num_hidden = 1152
     batch_size = 4
     # Global transition matrix
-    X = tvm.placeholder((num_step - 1, batch_size, num_input), name="X")
-    Wi2h = tvm.placeholder((4, num_hidden, num_input), name="Wi2h")
-    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
+    X = te.placeholder((num_step - 1, batch_size, num_input), name="X")
+    Wi2h = te.placeholder((4, num_hidden, num_input), name="Wi2h")
+    Wh2h = te.placeholder((4, num_hidden, num_hidden), name="Wh2h")
     # h: output hidden state, c: cell state.
-    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))
-    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))
-    s_init_c = tvm.compute((1, batch_size, num_hidden),
+    s_state_h = te.placeholder((num_step, batch_size, num_hidden))
+    s_state_c = te.placeholder((num_step, batch_size, num_hidden))
+    s_init_c = te.compute((1, batch_size, num_hidden),
                            lambda *i: 0.0, name="init_c")
-    s_init_h = tvm.compute((1, batch_size, num_hidden),
+    s_init_h = te.compute((1, batch_size, num_hidden),
                            lambda *i: 0.0, name="init_h")
     # LSTM transition
-    k = tvm.reduce_axis((0, num_input), name="ki2h")
-    s_i2h = tvm.compute(
+    k = te.reduce_axis((0, num_input), name="ki2h")
+    s_i2h = te.compute(
         (num_step, 4, batch_size, num_hidden),
-        lambda t, x, i, j: tvm.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
+        lambda t, x, i, j: te.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k),
         name="s_i2h")
-    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
-    s_h2h = tvm.compute(
+    k = te.reduce_axis((0, num_hidden), name="ki2h")
+    s_h2h = te.compute(
         (num_step, 4, batch_size, num_hidden),
-        lambda t, x, i, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
+        lambda t, x, i, j: te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k),
         name="s_h2h")
     # Gate rules
-    gates = tvm.compute(s_i2h.shape, lambda *i:
+    gates = te.compute(s_i2h.shape, lambda *i:
                         s_i2h(*i) + s_h2h(*i), name="gates")
     gshape = (num_step, batch_size, num_hidden)
-    in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 0, i, j]), name="in_gate")
-    in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, 1, i, j]), name="in_transform")
-    forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 2, i, j]), name="forget_gate")
-    out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, 3, i, j]), name="out_gate")
-    next_c = tvm.compute(gshape,
+    in_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 0, i, j]), name="in_gate")
+    in_transform = te.compute(gshape, lambda t, i, j: te.tanh(gates[t, 1, i, j]), name="in_transform")
+    forget_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 2, i, j]), name="forget_gate")
+    out_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 3, i, j]), name="out_gate")
+    next_c = te.compute(gshape,
                         lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c")
-    next_h = tvm.compute(gshape,
-                         lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h")
-    update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c")
-    update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h")
+    next_h = te.compute(gshape,
+                        lambda t, i, j: out_gate[t, i, j] * te.tanh(next_c[t, i, j]), name="next_h")
+    update_c = te.compute(gshape, lambda *i: next_c(*i), name="update_c")
+    update_h = te.compute(gshape, lambda *i: next_h(*i), name="update_h")
     # schedule
-    scan_h, scan_c = tvm.scan(
+    scan_h, scan_c = tvm.te.scan(
         [s_init_h, s_init_c],
         [update_h, update_c],
         [s_state_h, s_state_c],
         inputs=[X],
         name="lstm_scan")
     # schedule
-    s = tvm.create_schedule(scan_h.op)
+    s = te.create_schedule(scan_h.op)
     # Inline gate computations
     s[gates].compute_inline()
     s[in_gate].compute_inline()
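The LSTM test above drives tvm.te.scan with two coupled states (hidden and cell). A stripped-down sketch of that shape, with the gate math replaced by simple adds purely for illustration:

import tvm
from tvm import te

# Two coupled scan states: each update may read either previous state.
T = te.var("T")
n = te.var("n")
X = te.placeholder((T, n), name="X")
h_state = te.placeholder((T, n))
c_state = te.placeholder((T, n))
h_init = te.compute((1, n), lambda _, i: X[0, i])
c_init = te.compute((1, n), lambda _, i: X[0, i])
h_up = te.compute((T, n), lambda t, i: h_state[t - 1, i] + c_state[t - 1, i])
c_up = te.compute((T, n), lambda t, i: c_state[t - 1, i] + X[t, i])
h, c = tvm.te.scan([h_init, c_init], [h_up, c_up],
                   [h_state, c_state], inputs=[X])
s = te.create_schedule(h.op)
print(tvm.lower(s, [X, h, c], simple_mode=True))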
diff --git a/tests/python/unittest/test_schedule_schedule_ops.py b/tests/python/unittest/test_schedule_schedule_ops.py
index 2fc84bb43b16..8d10ceea0b48 100644
--- a/tests/python/unittest/test_schedule_schedule_ops.py
+++ b/tests/python/unittest/test_schedule_schedule_ops.py
@@ -15,66 +15,67 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import numpy as np


 def test_schedule0():
-    m = tvm.var('m')
-    l = tvm.var('l')
-    A = tvm.placeholder((m, l), name='A')
-    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
-    s = tvm.create_schedule(A1.op)
+    m = te.var('m')
+    l = te.var('l')
+    A = te.placeholder((m, l), name='A')
+    A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1')
+    s = te.create_schedule(A1.op)

-    bounds = tvm.schedule.InferBound(s)
+    bounds = tvm.te.schedule.InferBound(s)
     assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_schedule1():
-    m = tvm.var('m')
-    l = tvm.var('l')
-    A = tvm.placeholder((m, l), name='A')
-    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
+    m = te.var('m')
+    l = te.var('l')
+    A = te.placeholder((m, l), name='A')
+    A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1')

-    s = tvm.create_schedule(A1.op)
+    s = te.create_schedule(A1.op)
     xo, xi = s[A1].split(A1.op.axis[0], 8)
     s[A1].pragma(xo, "auto_unroll_max_step", 10)
-    bounds = tvm.schedule.InferBound(s)
+    bounds = tvm.te.schedule.InferBound(s)
     assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_schedule2():
-    m = tvm.var('m')
-    l = tvm.var('l')
-    A = tvm.placeholder((m, l), name='A')
-    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
-    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')
+    m = te.var('m')
+    l = te.var('l')
+    A = te.placeholder((m, l), name='A')
+    A1 = te.compute((m, l), lambda i, j: A[i, j], name='A1')
+    A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

-    s = tvm.create_schedule(A2.op)
+    s = te.create_schedule(A2.op)
     xo, xi = s[A2].split(A2.op.axis[0], 8)
     s[A1].compute_at(s[A2], xo)
-    bounds = tvm.schedule.InferBound(s)
+    bounds = tvm.te.schedule.InferBound(s)
     assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_schedule_scan():
-    m = tvm.var("m")
-    n = tvm.var("n")
-    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
-    s_state = tvm.placeholder((m, n))
-    s_init = tvm.compute((1, n), lambda _, i: x[0, i])
-    s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
-    res = tvm.scan(s_init, s_update, s_state)
+    m = te.var("m")
+    n = te.var("n")
+    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
+    s_state = te.placeholder((m, n))
+    s_init = te.compute((1, n), lambda _, i: x[0, i])
+    s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
+    res = tvm.te.scan(s_init, s_update, s_state)

     assert tuple(res.shape) == (m, n)
-    s = tvm.create_schedule(res.op)
+    s = te.create_schedule(res.op)
     s = s.normalize()
     ir = tvm.lower(s, [s_state], simple_mode=True)
     assert not hasattr(ir.body.body.body.body[1].body.body[1].body, "condition")
-    bounds = tvm.schedule.InferBound(s)
+    bounds = tvm.te.schedule.InferBound(s)
     assert(bounds[res.op.scan_axis].min.value == 1)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

@@ -83,107 +84,107 @@ def argmax_comp(x, y):
         val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
         return idx, val
     def argmax_init(idx_typ, val_typ):
-        return tvm.const(-1, idx_typ), tvm.min_value(val_typ)
-
-    argmax = tvm.comm_reducer(argmax_comp, argmax_init, name='argmax')
-    m = tvm.var('m')
-    n = tvm.var('n')
-    val = tvm.placeholder((m, n), name='val', dtype='float32')
-    val1 = tvm.compute((m, n), lambda i, j: val[i, j]+1, name='val1')
-    val2 = tvm.compute((m, n), lambda i, j: tvm.exp(val1[i, j]), name='val2')
-    k = tvm.reduce_axis((0, n), 'k')
-    T_idx, T_val = tvm.compute((m, ), lambda i: argmax((k.var, val2[i, k]), axis=k), name='T')
-    s = tvm.create_schedule(T_idx.op)
+        return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ)
+
+    argmax = te.comm_reducer(argmax_comp, argmax_init, name='argmax')
+    m = te.var('m')
+    n = te.var('n')
+    val = te.placeholder((m, n), name='val', dtype='float32')
+    val1 = te.compute((m, n), lambda i, j: val[i, j]+1, name='val1')
+    val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name='val2')
+    k = te.reduce_axis((0, n), 'k')
+    T_idx, T_val = te.compute((m, ), lambda i: argmax((k.var, val2[i, k]), axis=k), name='T')
+    s = te.create_schedule(T_idx.op)
     s[val1].compute_inline()
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_auto_inline():
-    m = tvm.var('m')
-    n = tvm.var('n')
-    A = tvm.placeholder((m, n), name='A')
-    B = tvm.placeholder((m, n), name='B')
-    C = tvm.placeholder((m, n), name='C')
-    T1 = tvm.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='T1')
-    T2 = tvm.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name='T2')
-
-    s = tvm.create_schedule(T2.op)
-    tvm.schedule.AutoInlineElemWise(s)
+    m = te.var('m')
+    n = te.var('n')
+    A = te.placeholder((m, n), name='A')
+    B = te.placeholder((m, n), name='B')
+    C = te.placeholder((m, n), name='C')
+    T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='T1')
+    T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name='T2')
+
+    s = te.create_schedule(T2.op)
+    tvm.te.schedule.AutoInlineElemWise(s)
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)
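test_inline_multi_reduce above builds a two-output argmax reducer. The general recipe is te.comm_reducer(combine, identity); a smaller single-output sketch, assuming the same te API shown in this hunk:

import tvm
from tvm import te

# A custom commutative reducer: a plain max, with min_value as identity.
mymax = te.comm_reducer(lambda x, y: tvm.te.max(x, y),
                        lambda t: tvm.te.min_value(t), name="mymax")
m = te.var("m")
n = te.var("n")
X = te.placeholder((m, n), name="X")
k = te.reduce_axis((0, n), name="k")
R = te.compute((m,), lambda i: mymax(X[i, k], axis=k), name="R")
s = te.create_schedule(R.op)
print(tvm.lower(s, [X, R], simple_mode=True))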

 def test_schedule_const_bound():
     n = 128
-    A = tvm.placeholder((n,), name='A')
-    A1 = tvm.compute((n,), lambda i: A[i] + 1, name='A1')
-    s = tvm.create_schedule(A1.op)
+    A = te.placeholder((n,), name='A')
+    A1 = te.compute((n,), lambda i: A[i] + 1, name='A1')
+    s = te.create_schedule(A1.op)
     xo, xi = s[A1].split(A1.op.axis[0], 8)
-    bounds = tvm.schedule.InferBound(s)
+    bounds = tvm.te.schedule.InferBound(s)
     assert isinstance(bounds, tvm.container.Map)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_inline_mixed():
-    n = tvm.var('n')
-    A = tvm.placeholder((n, ), name='A')
-    A1 = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='A1')
-    A2 = tvm.compute(A.shape, lambda *i: A1(*i) + 2, name='A2')
-    C = tvm.compute((n,), lambda i: A2[i] + A1[i], name='C')
-
-    s = tvm.create_schedule(C.op)
+    n = te.var('n')
+    A = te.placeholder((n, ), name='A')
+    A1 = te.compute(A.shape, lambda *i: A(*i) + 1, name='A1')
+    A2 = te.compute(A.shape, lambda *i: A1(*i) + 2, name='A2')
+    C = te.compute((n,), lambda i: A2[i] + A1[i], name='C')
+
+    s = te.create_schedule(C.op)
     xo, xi = s[C].split(C.op.axis[0], factor=8)
     s[A1].compute_at(s[C], xo)
     s[A2].compute_inline()
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

     def check(x):
         if isinstance(x, tvm.tir.Call):
             assert x.func != A2
-    tvm.ir_pass.PostOrderVisit(s[C].op.body[0], check)
+    tvm.tir.ir_pass.PostOrderVisit(s[C].op.body[0], check)


 def test_scan_inline1():
-    m = tvm.var("m")
-    n = tvm.var("n")
-    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
-    s_state1 = tvm.placeholder((m, n))
-    s_state2 = tvm.placeholder((m, n))
-    s_init1 = tvm.compute((1, n), lambda _, i: x[0, i])
-    s_init2 = tvm.compute((1, n), lambda _, i: x[0, i])
-    s_x1 = tvm.compute((m, n), lambda t, i: s_state1[t-1, i] + x[t, i], name="x1")
-    s_x2 = tvm.compute((m, n), lambda t, i: s_state2[t-1, i] + 1 , name="x2")
-    s_update1 = tvm.compute((m, n), lambda t, i: s_x1[t, i], "u1")
-    s_update2 = tvm.compute((m, n), lambda t, i: s_x2[t, i], "u2")
-    res1, res2 = tvm.scan([s_init1, s_init2],
+    m = te.var("m")
+    n = te.var("n")
+    x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x")
+    s_state1 = te.placeholder((m, n))
+    s_state2 = te.placeholder((m, n))
+    s_init1 = te.compute((1, n), lambda _, i: x[0, i])
+    s_init2 = te.compute((1, n), lambda _, i: x[0, i])
+    s_x1 = te.compute((m, n), lambda t, i: s_state1[t-1, i] + x[t, i], name="x1")
+    s_x2 = te.compute((m, n), lambda t, i: s_state2[t-1, i] + 1 , name="x2")
+    s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1")
+    s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2")
+    res1, res2 = tvm.te.scan([s_init1, s_init2],
                              [s_update1, s_update2],
                              [s_state1, s_state2])
-    s = tvm.create_schedule(res1.op)
+    s = te.create_schedule(res1.op)
     s[s_x1].compute_inline()
     stmt = tvm.lower(s, [x, res1, res2])
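Both scan-inline tests hinge on compute_inline, which folds an elementwise stage into its consumers instead of materializing it. A minimal sketch of the primitive on its own:

import tvm
from tvm import te

# compute_inline: B disappears as a separate loop nest; its expression
# is substituted directly into C.
n = te.var("n")
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: te.sigmoid(A[i]), name="B")
C = te.compute((n,), lambda i: B[i] * 2.0, name="C")
s = te.create_schedule(C.op)
s[B].compute_inline()
print(tvm.lower(s, [A, C], simple_mode=True))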
te.var("m") + n = te.var("n") + x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") + s_state1 = te.placeholder((m, n)) + s_state2 = te.placeholder((m, n)) + s_init1 = te.compute((1, n), lambda _, i: x[0, i]) + s_init2 = te.compute((1, n), lambda _, i: x[0, i]) + s_xx = te.compute((m, n), lambda t, i: s_state1[t-1, i] + x[t, i], name="xx") + s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1") + s_x2 = te.compute((m, n), lambda t, i: s_xx[t, i] + s_state2[t-1, 2], name="x2") + s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1") + s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2") + res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2]) - s = tvm.create_schedule(res1.op) + s = te.create_schedule(res1.op) s[s_xx].compute_inline() s[s_x1].compute_inline() s[s_x2].compute_inline() @@ -191,128 +192,128 @@ def test_scan_inline2(): def test_schedule_cache(): - m = tvm.var('m') - n = tvm.var('n') - A = tvm.placeholder((m, n), name='A') - B = tvm.placeholder((m, n), name='B') - C = tvm.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') + m = te.var('m') + n = te.var('n') + A = te.placeholder((m, n), name='A') + B = te.placeholder((m, n), name='B') + C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) AA = s.cache_read(A, "shared", readers=[C]) CC = s.cache_write(C, "shared") s[AA].compute_at(s[CC], CC.op.axis[0]) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) def test_schedule_middle_cache(): - m = tvm.var('m') - n = tvm.var('n') - A = tvm.placeholder((m, n), name='A') - B = tvm.placeholder((m, n), name='B') + m = te.var('m') + n = te.var('n') + A = te.placeholder((m, n), name='A') + B = te.placeholder((m, n), name='B') - C = tvm.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') - D = tvm.compute((m, n), lambda i, j: C(i , j) , name='D') + C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') + D = te.compute((m, n), lambda i, j: C(i , j) , name='D') - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) AA = s.cache_read(A, "local", readers=[C]) BB = s.cache_read(B, "local", readers=[C]) CC = s.cache_read(C, "local", readers=[D]) DD = s.cache_write(D, "local") #s[AA].compute_at(s[CC], CC.op.axis[0]) - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) def test_schedule_cache_relayout1(): - m = tvm.var('m') - n = tvm.var('n') - A = tvm.placeholder((m, n), name='A') - B = tvm.placeholder((m, n), name='B') - C = tvm.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') + m = te.var('m') + n = te.var('n') + A = te.placeholder((m, n), name='A') + B = te.placeholder((m, n), name='B') + C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) s[C].reorder(C.op.axis[1], C.op.axis[0]) CC = s.cache_write(C, "global") - bounds = tvm.schedule.InferBound(s) - stmt = tvm.schedule.ScheduleOps(s, bounds) + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) def test_schedule_cache_relayout2(): - m = tvm.var('m') - n = tvm.var('n') - A = tvm.placeholder((m*4, n), name='A') - B = tvm.placeholder((m*4, n), name='B') - C = tvm.compute(A.shape, lambda i, j: A(i, j) * B(i, 

 def test_schedule_cache_relayout2():
-    m = tvm.var('m')
-    n = tvm.var('n')
-    A = tvm.placeholder((m*4, n), name='A')
-    B = tvm.placeholder((m*4, n), name='B')
-    C = tvm.compute(A.shape, lambda i, j: A(i, j) * B(i, j), name='C')
-    s = tvm.create_schedule(C.op)
+    m = te.var('m')
+    n = te.var('n')
+    A = te.placeholder((m*4, n), name='A')
+    B = te.placeholder((m*4, n), name='B')
+    C = te.compute(A.shape, lambda i, j: A(i, j) * B(i, j), name='C')
+    s = te.create_schedule(C.op)
     x, y = C.op.axis
     xo, xi = s[C].split(x, factor=4)
     s[C].reorder(xo, y, xi)
     CC = s.cache_write(C, "global")
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_schedule_cache_relayout3():
-    m = tvm.var('m')
-    n = tvm.var('n')
-    A = tvm.placeholder((m*4, n), name='A')
-    B = tvm.placeholder((m*4, n), name='B')
-    k = tvm.reduce_axis((0, n), "k")
-    C = tvm.compute((A.shape[0],),
-                    lambda i: tvm.sum(A(i, k) * B(i, k), axis=k), name='C')
-    s = tvm.create_schedule(C.op)
+    m = te.var('m')
+    n = te.var('n')
+    A = te.placeholder((m*4, n), name='A')
+    B = te.placeholder((m*4, n), name='B')
+    k = te.reduce_axis((0, n), "k")
+    C = te.compute((A.shape[0],),
+                    lambda i: te.sum(A(i, k) * B(i, k), axis=k), name='C')
+    s = te.create_schedule(C.op)
     x = C.op.axis[0]
     xo, xi = s[C].split(x, factor=4)
     CC = s.cache_write(C, "global")
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_schedule_cache_relayout4():
     def _compute(*indice):
         return A(*indice) + 1, B(*indice) / 2
-    m = tvm.var('m')
-    n = tvm.var('n')
-    A = tvm.placeholder((m*4, n), name='A')
-    B = tvm.placeholder((m*4, n), name='B')
-    C1, C2 = tvm.compute(A.shape, _compute, name='C')
-    s = tvm.create_schedule([C1.op, C2.op])
+    m = te.var('m')
+    n = te.var('n')
+    A = te.placeholder((m*4, n), name='A')
+    B = te.placeholder((m*4, n), name='B')
+    C1, C2 = te.compute(A.shape, _compute, name='C')
+    s = te.create_schedule([C1.op, C2.op])
     C1_cache, C2_cache = s.cache_write([C1, C2], "local")
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def intrin_gemv(m, n):
-    w = tvm.placeholder((m, n), name='w')
-    x = tvm.placeholder((n,), name='x')
-    k = tvm.reduce_axis((0, n), name='k')
-    z = tvm.compute((m,), lambda i:
-                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
-    Wb = tvm.decl_buffer(w.shape, w.dtype,
+    w = te.placeholder((m, n), name='w')
+    x = te.placeholder((n,), name='x')
+    k = te.reduce_axis((0, n), name='k')
+    z = te.compute((m,), lambda i:
+                   te.sum(w[i, k] * x[k], axis=k), name='z')
+    Wb = tvm.tir.decl_buffer(w.shape, w.dtype,
                              name="W",
                              offset_factor=16,
-                             strides=[tvm.var('ldw'), 1])
+                             strides=[te.var('ldw'), 1])
     def intrin_func(ins, outs):
         ww, xx = ins
         zz = outs[0]
         ww_ptr = ww.access_ptr("r")
         xx_ptr = xx.access_ptr("r")
         zz_ptr = zz.access_ptr("w")
-        body = tvm.call_packed(
+        body = tvm.tir.call_packed(
             "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        reset = tvm.call_packed(
+        reset = tvm.tir.call_packed(
             "fill_zero", zz_ptr, n)
-        update = tvm.call_packed(
+        update = tvm.tir.call_packed(
             "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
         return body, reset, update

-    with tvm.build_config(data_alignment=16,
+    with tvm.target.build_config(data_alignment=16,
                           offset_factor=16):
-        return tvm.decl_tensor_intrin(z.op, intrin_func,
+        return te.decl_tensor_intrin(z.op, intrin_func,
                                       binds={w: Wb})
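test_schedule_tensor_compute1, next, combines the gemv intrinsic with split, reorder, and tile. Those loop transformations on their own, in a minimal sketch under the same te API:

import tvm
from tvm import te

# split breaks one axis in two; reorder permutes the loop nest.
m = te.var("m")
n = te.var("n")
A = te.placeholder((m, n), name="A")
B = te.compute((m, n), lambda i, j: A[i, j] + 1.0, name="B")
s = te.create_schedule(B.op)
io, ii = s[B].split(B.op.axis[0], factor=8)
s[B].reorder(io, B.op.axis[1], ii)
# tile is the two-axis combination of split plus reorder, e.g.:
#   xo, yo, xi, yi = s[B].tile(x_axis, y_axis, 8, 4)
print(tvm.lower(s, [A, B], simple_mode=True))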

@@ -320,36 +321,36 @@ def test_schedule_tensor_compute1():
     # basic: split, reorder, tile
     M, N, L = 2048, 1024, 512
     factor, rfactor = 16, 16
-    A = tvm.placeholder((N//factor, L//rfactor, factor, rfactor), name='A')
-    B = tvm.placeholder((M, L//rfactor, rfactor), name='B')
-    k = tvm.reduce_axis((0, L//rfactor), name='k')
+    A = te.placeholder((N//factor, L//rfactor, factor, rfactor), name='A')
+    B = te.placeholder((M, L//rfactor, rfactor), name='B')
+    k = te.reduce_axis((0, L//rfactor), name='k')

     gemv = intrin_gemv(factor, rfactor)
-    C = tvm.compute((N, M//factor, factor),
+    C = te.compute((N, M//factor, factor),
                     lambda i, j: gemv(A[i, k, 0:factor, 0:factor], B[j, k, 0:rfactor], reduce_axis=k), name='C')

-    s = tvm.create_schedule(C.op)
+    s = te.create_schedule(C.op)
     ai, aj, ax = s[C].op.axis
     aio, aii = s[C].split(ai, 16)
     s[C].reorder(aio, aj, aii)
     aioo, ajo, aioi, aji = s[C].tile(aio, aj, 16, 4)
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def intrin_vadd(n, cache_read=False, cache_write=False):
     scope_ubuf = 'local'
     dtype = 'float32'
-    x = tvm.placeholder((n,), dtype=dtype, name='vx')
-    y = tvm.placeholder((n,), dtype=dtype, name='vy')
-    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
-    s = tvm.create_schedule(z.op)
+    x = te.placeholder((n,), dtype=dtype, name='vx')
+    y = te.placeholder((n,), dtype=dtype, name='vy')
+    z = te.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    s = te.create_schedule(z.op)

     def create_buffer(t):
-        return tvm.decl_buffer(t.shape, t.dtype,
+        return tvm.tir.decl_buffer(t.shape, t.dtype,
                                name='W'+t.name,
                                scope=scope_ubuf,
                                offset_factor=16)

@@ -362,12 +363,12 @@ def create_buffer(t):
         binds[z] = create_buffer(z)

     def intrin_func(ins, outs):
-        ib = tvm.ir_builder.create()
-        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
+        ib = tvm.tir.ir_builder.create()
+        ib.emit(tvm.tir.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
         return ib.get()

-    with tvm.build_config(offset_factor=16):
-        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
+    with tvm.target.build_config(offset_factor=16):
+        return te.decl_tensor_intrin(z.op, intrin_func, binds=binds)


 def test_schedule_tensor_compute2():
@@ -377,20 +378,20 @@
     dtype = 'float32'
     scope_ubuf = 'local'

-    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
-    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
+    A = te.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = te.placeholder((M//factor, factor), name="B", dtype=dtype)

     vadd = intrin_vadd(factor, True, True)
-    C = tvm.compute((M//factor, factor),
+    C = te.compute((M//factor, factor),
                     lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C')

-    s = tvm.create_schedule(C.op)
+    s = te.create_schedule(C.op)
     AL = s.cache_read(A, scope_ubuf, C)
     BL = s.cache_read(B, scope_ubuf, C)
     CL = s.cache_write(C, scope_ubuf)
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

 def test_schedule_tensor_compute3():
@@ -398,48 +399,48 @@
     M = 1024
     factor = 16
    dtype = 'float32'
-    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
-    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
-    Bi = tvm.compute((M//factor, factor), lambda i, j: B[i, j] + 5, name="Bi")
+    A = te.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = te.placeholder((M//factor, factor), name="B", dtype=dtype)
+    Bi = te.compute((M//factor, factor), lambda i, j: B[i, j] + 5, name="Bi")

     vadd = intrin_vadd(factor)
-    C = tvm.compute((M//factor, factor),
+    C = te.compute((M//factor, factor),
                     lambda i: vadd(A[i, 0:factor], Bi[i, 0:factor]), name='C')

-    s = tvm.create_schedule(C.op)
+    s = te.create_schedule(C.op)
     s[Bi].compute_at(s[C], C.op.axis[0])
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)


 def test_loop_dep_reduce():
-    X = tvm.placeholder(shape=(10,), name="x")
+    X = te.placeholder(shape=(10,), name="x")
     def f(n):
-        rv = tvm.reduce_axis((0, n))
-        return tvm.sum(X[rv], axis=rv)
-    Y = tvm.compute(X.shape, f, name="y")
-    s = tvm.create_schedule([Y.op])
+        rv = te.reduce_axis((0, n))
+        return te.sum(X[rv], axis=rv)
+    Y = te.compute(X.shape, f, name="y")
+    s = te.create_schedule([Y.op])
     f = tvm.build(s, [X, Y])


 def test_loop_dep_reduce_cache_write():
-    X = tvm.placeholder(shape=(10,), name="x")
+    X = te.placeholder(shape=(10,), name="x")
     def f(n):
-        rv = tvm.reduce_axis((0, n))
-        init = lambda dtype: tvm.tir.Select(n > 1, tvm.const(0, dtype), n.astype(dtype))
-        sum = tvm.comm_reducer(lambda x, y: tvm.max(x + y, n.astype('float32')), init, name='sum')
+        rv = te.reduce_axis((0, n))
+        init = lambda dtype: tvm.tir.Select(n > 1, tvm.tir.const(0, dtype), n.astype(dtype))
+        sum = te.comm_reducer(lambda x, y: tvm.te.max(x + y, n.astype('float32')), init, name='sum')
         return sum(X[rv], axis=rv)
-    Y = tvm.compute(X.shape, f, name="y")
-    s = tvm.create_schedule([Y.op])
+    Y = te.compute(X.shape, f, name="y")
+    s = te.create_schedule([Y.op])
     s.cache_write(Y, 'local')
     f = tvm.build(s, [X, Y])

 def test_reduction_and_dummy_fuse_split():
     n = 10
-    X = tvm.placeholder(shape=(n,), dtype='int32', name="X")
-    k = tvm.reduce_axis((0, n))
-    Y = tvm.compute((), lambda: tvm.sum(X[k], k), name="Y")
-    s = tvm.create_schedule([Y.op])
+    X = te.placeholder(shape=(n,), dtype='int32', name="X")
+    k = te.reduce_axis((0, n))
+    Y = te.compute((), lambda: te.sum(X[k], k), name="Y")
+    s = te.create_schedule([Y.op])
     ax = s[Y.op].fuse(*Y.op.axis)
     axo, axi = s[Y.op].split(ax, nparts=20)
     f = tvm.build(s, [Y, X])

@@ -449,10 +450,10 @@ def test_reduction_and_dummy_fuse_split():
     assert args[0].asnumpy() == n

     n = 10
-    X = tvm.placeholder(shape=(n,), dtype='int32', name="X")
-    k = tvm.reduce_axis((0, n))
-    Y = tvm.compute((n,), lambda i: tvm.sum(X[k], k), name="Y")
-    s = tvm.create_schedule([Y.op])
+    X = te.placeholder(shape=(n,), dtype='int32', name="X")
+    k = te.reduce_axis((0, n))
+    Y = te.compute((n,), lambda i: te.sum(X[k], k), name="Y")
+    s = te.create_schedule([Y.op])
     ax = s[Y.op].fuse(*(list(Y.op.axis) + list(Y.op.reduce_axis)))
     f = tvm.build(s, [Y, X])
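The test above fuses a dummy spatial axis with a reduction axis and then splits the fused result. The same sequence in isolation, kept to a buildable sketch:

import tvm
from tvm import te

# A scalar-output reduction: fuse its (empty) spatial axes into one
# dummy axis, then split that axis, and confirm it still builds.
n = 10
X = te.placeholder(shape=(n,), dtype="int32", name="X")
k = te.reduce_axis((0, n), name="k")
Y = te.compute((), lambda: te.sum(X[k], k), name="Y")
s = te.create_schedule([Y.op])
ax = s[Y.op].fuse(*Y.op.axis)
axo, axi = s[Y.op].split(ax, nparts=2)
f = tvm.build(s, [X, Y])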

@@ -463,14 +464,14 @@ def test_reduction_and_dummy_fuse_split():

 def test_schedule_compute_inline():
     shape = [10, 1024]
-    A = tvm.placeholder(shape, name="A")
-    B = tvm.placeholder(shape, name="B")
-    C = tvm.compute(shape, lambda *index:A(*index)+ B(*index), name = "C")
+    A = te.placeholder(shape, name="A")
+    B = te.placeholder(shape, name="B")
+    C = te.compute(shape, lambda *index:A(*index)+ B(*index), name = "C")
     def _compute(*index) :
         return C(*index) , C(*index) * B(*index)
-    F,E = tvm.compute(shape, _compute, name = "F")
-    s = tvm.create_schedule([F.op, E.op])
+    F,E = te.compute(shape, _compute, name = "F")
+    s = te.create_schedule([F.op, E.op])
     AL = s.cache_read(A, "local", [C])
     BL = s.cache_read(B, "local", [C,E])
     CL = s.cache_write(C, "local")
@@ -478,8 +479,8 @@ def _compute(*index) :
     s[C].compute_inline()
     s = s.normalize()
-    bounds = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    bounds = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, bounds)

 if __name__ == "__main__":
     test_loop_dep_reduce()
diff --git a/tests/python/unittest/test_schedule_tensor_core.py b/tests/python/unittest/test_schedule_tensor_core.py
index cd9e062dc07b..ae2301caffa3 100644
--- a/tests/python/unittest/test_schedule_tensor_core.py
+++ b/tests/python/unittest/test_schedule_tensor_core.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import numpy as np
 from topi.testing import conv2d_nhwc_python
 from tvm.contrib import nvcc

@@ -28,49 +29,49 @@ def intrin_wmma_load_matrix(shape, scope):
         row, col = n, l
     elif scope == "wmma.matrix_b":
         row, col = l, m
-    A = tvm.placeholder((row, col), name='A', dtype='float16')
-    BA = tvm.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=row * col)
-    C = tvm.compute((row, col), lambda i, j: A[i, j], name='C')
-    BC = tvm.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=row * col)
+    A = te.placeholder((row, col), name='A', dtype='float16')
+    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=row * col)
+    C = te.compute((row, col), lambda i, j: A[i, j], name='C')
+    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=row * col)

     def intrin_func(ins, outs):
-        ib = tvm.ir_builder.create()
+        ib = tvm.tir.ir_builder.create()

         BA = ins[0]
         BC = outs[0]
-        ib.emit(tvm.call_intrin('handle', 'tvm_load_matrix_sync',
+        ib.emit(tvm.tir.call_intrin('handle', 'tvm_load_matrix_sync',
                                 BC.data, n, m, l, BC.elem_offset // (row * col),
                                 BA.access_ptr('r'), col, 'row_major'))
         return ib.get()

-    return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
+    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


 def intrin_wmma_gemm(shape):
     n, m, l = shape
-    A = tvm.placeholder((n, l), name='A', dtype='float16')
-    B = tvm.placeholder((l, m), name='B', dtype='float16')
-    k = tvm.reduce_axis((0, l), name="k")
-    C = tvm.compute((n, m),
+    A = te.placeholder((n, l), name='A', dtype='float16')
+    B = te.placeholder((l, m), name='B', dtype='float16')
+    k = te.reduce_axis((0, l), name="k")
+    C = te.compute((n, m),
                     lambda ii, jj:
-                    tvm.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k),
+                    te.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k),
                     name='C')
-    BA = tvm.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=n * l)
-    BB = tvm.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=l * m)
-    BC = tvm.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=n * m)
+    BA = tvm.tir.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=n * l)
+    BB = tvm.tir.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=l * m)
+    BC = tvm.tir.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=n * m)

     def intrin_func(ins, outs):
         BA, BB = ins
        BC, = outs

         def init():
-            ib = tvm.ir_builder.create()
-            ib.emit(tvm.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, m, l, BC.elem_offset // (n * m), 0.0))
+            ib = tvm.tir.ir_builder.create()
+            ib.emit(tvm.tir.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, m, l, BC.elem_offset // (n * m), 0.0))
             return ib.get()

         def update():
-            ib = tvm.ir_builder.create()
-            ib.emit(tvm.call_intrin('handle', 'tvm_mma_sync',
+            ib = tvm.tir.ir_builder.create()
+            ib.emit(tvm.tir.call_intrin('handle', 'tvm_mma_sync',
                                     BC.data, BC.elem_offset // (n * m),
                                     BA.data, BA.elem_offset // (n * l),
                                     BB.data, BB.elem_offset // (l * m),
@@ -79,27 +80,27 @@ def update():
         return update(), init(), update()

-    return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
+    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})


 def intrin_wmma_store_matrix(shape):
     n, m, l = shape
-    A = tvm.placeholder((n, m), name='A', dtype='float32')
-    BA = tvm.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=n * m)
-    C = tvm.compute((n, m), lambda i, j: A[i, j], name='C')
-    BC = tvm.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=n * m)
+    A = te.placeholder((n, m), name='A', dtype='float32')
+    BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=n * m)
+    C = te.compute((n, m), lambda i, j: A[i, j], name='C')
+    BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=n * m)

     def intrin_func(ins, outs):
-        ib = tvm.ir_builder.create()
+        ib = tvm.tir.ir_builder.create()

         BA = ins[0]
         BC = outs[0]
-        ib.emit(tvm.call_intrin('handle', 'tvm_store_matrix_sync',
+        ib.emit(tvm.tir.call_intrin('handle', 'tvm_store_matrix_sync',
                                 BA.data, n, m, l, BA.elem_offset // (n * m),
                                 BC.access_ptr('w'), m, 'row_major'))
         return ib.get()

-    return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})
+    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


 def test_tensor_core_batch_matmal():
@@ -117,15 +118,15 @@
     assert (m % 8 == 0)
     assert (l % 16 == 0)
     nn, mm, ll = n // 32, m // 8, l // 16
-    A = tvm.placeholder((batch_size, nn, ll, 32, 16), name='A', dtype='float16')
-    B = tvm.placeholder((batch_size, ll, mm, 16, 8), name='B', dtype='float16')
-    k1 = tvm.reduce_axis((0, ll), name='k1')
-    k2 = tvm.reduce_axis((0, 16), name='k2')
-    C = tvm.compute((batch_size, nn, mm, 32, 8),
+    A = te.placeholder((batch_size, nn, ll, 32, 16), name='A', dtype='float16')
+    B = te.placeholder((batch_size, ll, mm, 16, 8), name='B', dtype='float16')
+    k1 = te.reduce_axis((0, ll), name='k1')
+    k2 = te.reduce_axis((0, 16), name='k2')
+    C = te.compute((batch_size, nn, mm, 32, 8),
                    lambda b, i, j, ii, jj:
-                   tvm.sum(A[b, i, k1, ii, k2].astype('float') * B[b, k1, j, k2, jj].astype('float'), axis=[k1, k2]),
+                   te.sum(A[b, i, k1, ii, k2].astype('float') * B[b, k1, j, k2, jj].astype('float'), axis=[k1, k2]),
                    name='Fragment_C')
-    s = tvm.create_schedule(C.op)
+    s = te.create_schedule(C.op)

     warp_size = 32
     kernel_size = 16
@@ -135,12 +136,12 @@
     warp_col_tiles = 2
     chunk = 4

-    block_x = tvm.thread_axis('blockIdx.x')
-    block_y = tvm.thread_axis('blockIdx.y')
-    block_z = tvm.thread_axis('blockIdx.z')
-    thread_x = tvm.thread_axis('threadIdx.x')
-    thread_y = tvm.thread_axis('threadIdx.y')
-    thread_z = tvm.thread_axis('threadIdx.z')
+    block_x = te.thread_axis('blockIdx.x')
+    block_y = te.thread_axis('blockIdx.y')
+    block_z = te.thread_axis('blockIdx.z')
+    thread_x = te.thread_axis('threadIdx.x')
+    thread_y = te.thread_axis('threadIdx.y')
+    thread_z = te.thread_axis('threadIdx.z')

     AS = s.cache_read(A, 'shared', [C])
     BS = s.cache_read(B, 'shared', [C])
@@ -271,30 +272,30 @@ def test_tensor_core_batch_conv():
     assert (in_channels % block_size == 0)
     assert (out_channels % block_size == 0)

-    kh = tvm.reduce_axis((0, kernel_h), name='kh')
-    kw = tvm.reduce_axis((0, kernel_w), name='kw')
-    ic = tvm.reduce_axis((0, in_channels // block_size), name='ic')
-    ii = tvm.reduce_axis((0, block_size), name='ii')
+    kh = te.reduce_axis((0, kernel_h), name='kh')
+    kw = te.reduce_axis((0, kernel_w), name='kw')
+    ic = te.reduce_axis((0, in_channels // block_size), name='ic')
+    ii = te.reduce_axis((0, block_size), name='ii')

     # Algorithm
-    A = tvm.placeholder(data_shape, name='A', dtype="float16")
-    W = tvm.placeholder(kernel_shape, name='W', dtype="float16")
-    Apad = tvm.compute(
+    A = te.placeholder(data_shape, name='A', dtype="float16")
+    W = te.placeholder(kernel_shape, name='W', dtype="float16")
+    Apad = te.compute(
         (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size, block_size),
-        lambda n, h, w, i, nn, ii: tvm.if_then_else(
-            tvm.all(h >= pad_h, h - pad_h < height,
+        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
+            tvm.tir.all(h >= pad_h, h - pad_h < height,
                     w >= pad_w, w - pad_w < width),
-            A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.const(0., "float16")),
+            A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.tir.const(0., "float16")),
         name='Apad')
-    Conv = tvm.compute(output_shape,
-                       lambda n, h, w, o, nn, oo: tvm.sum(
+    Conv = te.compute(output_shape,
+                      lambda n, h, w, o, nn, oo: te.sum(
                           Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") *
                           W[kh, kw, ic, o, ii, oo].astype("float32"),
                           axis=[ic, kh, kw, ii]),
                       name="Conv")

-    s = tvm.create_schedule(Conv.op)
+    s = te.create_schedule(Conv.op)
     s[Apad].compute_inline()

     AS = s.cache_read(Apad, 'shared', [Conv])
@@ -303,12 +304,12 @@
     WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
     ConvF = s.cache_write(Conv, 'wmma.accumulator')

-    block_x = tvm.thread_axis('blockIdx.x')
-    block_y = tvm.thread_axis('blockIdx.y')
-    block_z = tvm.thread_axis('blockIdx.z')
-    thread_x = tvm.thread_axis('threadIdx.x')
-    thread_y = tvm.thread_axis('threadIdx.y')
-    thread_z = tvm.thread_axis('threadIdx.z')
+    block_x = te.thread_axis('blockIdx.x')
+    block_y = te.thread_axis('blockIdx.y')
+    block_z = te.thread_axis('blockIdx.z')
+    thread_x = te.thread_axis('threadIdx.x')
+    thread_y = te.thread_axis('threadIdx.y')
+    thread_z = te.thread_axis('threadIdx.z')

     nc, hc, wc, oc, nnc, ooc = Conv.op.axis
     block_k = s[Conv].fuse(hc, wc)
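The tensor-core schedules above create te.thread_axis handles and later bind loop axes to them. The binding step itself, in a minimal sketch that only lowers (no CUDA device assumed):

import tvm
from tvm import te

# Bind the outer/inner halves of a split loop to GPU block and thread axes.
n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
print(tvm.lower(s, [A, B], simple_mode=True))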
diff --git a/tests/python/unittest/test_schedule_tensorize.py b/tests/python/unittest/test_schedule_tensorize.py
index ac60c2d34ebd..28a3ae875fc7 100644
--- a/tests/python/unittest/test_schedule_tensorize.py
+++ b/tests/python/unittest/test_schedule_tensorize.py
@@ -15,98 +15,99 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te

 def intrin_vadd(n):
-    x = tvm.placeholder((n,), name='vx')
-    y = tvm.placeholder((n,), name='vy')
-    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    x = te.placeholder((n,), name='vx')
+    y = te.placeholder((n,), name='vy')
+    z = te.compute(x.shape, lambda i: x[i] + y[i], name='z')
     def intrin_func(ins, outs):
         xx, yy = ins
         zz = outs[0]
-        return tvm.call_packed("vadd", xx, yy, zz)
-    with tvm.build_config(offset_factor=16):
-        return tvm.decl_tensor_intrin(z.op, intrin_func)
+        return tvm.tir.call_packed("vadd", xx, yy, zz)
+    with tvm.target.build_config(offset_factor=16):
+        return te.decl_tensor_intrin(z.op, intrin_func)

 def intrin_gemv(m, n):
-    w = tvm.placeholder((m, n), name='w')
-    x = tvm.placeholder((n,), name='x')
-    k = tvm.reduce_axis((0, n), name='k')
-    z = tvm.compute((m,), lambda i:
-                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
-    Wb = tvm.decl_buffer(w.shape, w.dtype,
+    w = te.placeholder((m, n), name='w')
+    x = te.placeholder((n,), name='x')
+    k = te.reduce_axis((0, n), name='k')
+    z = te.compute((m,), lambda i:
+                   te.sum(w[i, k] * x[k], axis=k), name='z')
+    Wb = tvm.tir.decl_buffer(w.shape, w.dtype,
                              name="W",
                              offset_factor=16,
-                             strides=[tvm.var('ldw'), 1])
+                             strides=[te.var('ldw'), 1])
     def intrin_func(ins, outs):
         ww, xx = ins
         zz = outs[0]
         ww_ptr = ww.access_ptr("r")
         xx_ptr = xx.access_ptr("r")
         zz_ptr = zz.access_ptr("w")
-        body = tvm.call_packed(
+        body = tvm.tir.call_packed(
             "gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        reset = tvm.call_packed(
+        reset = tvm.tir.call_packed(
             "fill_zero", zz_ptr, n)
-        update = tvm.call_packed(
+        update = tvm.tir.call_packed(
             "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
         return body, reset, update
-    with tvm.build_config(data_alignment=16,
+    with tvm.target.build_config(data_alignment=16,
                           offset_factor=16):
-        return tvm.decl_tensor_intrin(z.op, intrin_func,
+        return te.decl_tensor_intrin(z.op, intrin_func,
                                       binds={w: Wb})

 def intrin_gemv_no_reset(m, n):
-    w = tvm.placeholder((m, n), name='w')
-    x = tvm.placeholder((n,), name='x')
-    k = tvm.reduce_axis((0, n), name='k')
-    z = tvm.compute((m,), lambda i:
-                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
-    Wb = tvm.decl_buffer(w.shape, w.dtype,
+    w = te.placeholder((m, n), name='w')
+    x = te.placeholder((n,), name='x')
+    k = te.reduce_axis((0, n), name='k')
+    z = te.compute((m,), lambda i:
+                   te.sum(w[i, k] * x[k], axis=k), name='z')
+    Wb = tvm.tir.decl_buffer(w.shape, w.dtype,
                              name="W",
                              offset_factor=16,
-                             strides=[tvm.var('ldw'), 1])
+                             strides=[te.var('ldw'), 1])
     def intrin_func(ins, outs):
         ww, xx = ins
         zz = outs[0]
         ww_ptr = ww.access_ptr("r")
         xx_ptr = xx.access_ptr("r")
         zz_ptr = zz.access_ptr("w")
-        body = tvm.call_packed(
+        body = tvm.tir.call_packed(
             "gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
-        update = tvm.call_packed(
+        update = tvm.tir.call_packed(
             "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
         return body, None, update
-    with tvm.build_config(data_alignment=16,
+    with tvm.target.build_config(data_alignment=16,
                           offset_factor=16):
-        return tvm.decl_tensor_intrin(z.op, intrin_func,
+        return te.decl_tensor_intrin(z.op, intrin_func,
                                       binds={w: Wb})


 def test_tensorize_vadd():
     m = 128
-    x = tvm.placeholder((m,), name='x')
-    y = tvm.placeholder((m,), name='y')
-    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    x = te.placeholder((m,), name='x')
+    y = te.placeholder((m,), name='y')
+    z = te.compute(x.shape, lambda i: x[i] + y[i], name='z')

     def check(factor):
-        s = tvm.create_schedule(z.op)
+        s = te.create_schedule(z.op)
        xo, xi = s[z].split(z.op.axis[0], factor=factor)
         vadd = intrin_vadd(factor)
         s[z].tensorize(xi, vadd)
         s = s.normalize()
-        dom_map = tvm.schedule.InferBound(s)
+        dom_map = tvm.te.schedule.InferBound(s)
         finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
         out_dom, in_dom = finfer(s[z], dom_map)
-        assert tvm.ir_pass.Equal(out_dom[z.op.axis[0]].extent, factor)
-        assert tvm.ir_pass.Equal(out_dom[z.op.axis[0]].min, xo * factor)
-        assert tvm.ir_pass.Equal(in_dom.items()[0][1][0].extent, factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[z.op.axis[0]].extent, factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[z.op.axis[0]].min, xo * factor)
+        assert tvm.tir.ir_pass.Equal(in_dom.items()[0][1][0].extent, factor)
         fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
         body = fmatch(s[z], out_dom, in_dom, vadd)
-        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
-                                 tvm.ir_pass.CanonicalSimplify(vadd.op.body[0]))
-        stmt = tvm.schedule.ScheduleOps(s, dom_map)
+        assert tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.CanonicalSimplify(body[0]),
+                                     tvm.tir.ir_pass.CanonicalSimplify(vadd.op.body[0]))
+        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
         tvm.lower(s, [x, y, z])

     check(16)

@@ -116,35 +117,35 @@ def test_tensorize_matmul():
     n = 1024
     m = n
     l = n
-    A = tvm.placeholder((n, l), name='A')
-    B = tvm.placeholder((m, l), name='B')
-    k = tvm.reduce_axis((0, l), name='k')
-    C = tvm.compute((n, m), lambda i, j:
-                    tvm.sum(B[j, k] * A[i, k], axis=k), name='C')
+    A = te.placeholder((n, l), name='A')
+    B = te.placeholder((m, l), name='B')
+    k = te.reduce_axis((0, l), name='k')
+    C = te.compute((n, m), lambda i, j:
+                   te.sum(B[j, k] * A[i, k], axis=k), name='C')

     def check(factor):
-        s = tvm.create_schedule(C.op)
+        s = te.create_schedule(C.op)
         x, y = C.op.axis
         yo, yi = s[C].split(y, factor=factor)
         gemv = intrin_gemv(factor, l)
         s[C].tensorize(yi, gemv)
         s = s.normalize()
-        dom_map = tvm.schedule.InferBound(s)
+        dom_map = tvm.te.schedule.InferBound(s)
         finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
         out_dom, in_dom = finfer(s[C], dom_map)
-        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
-        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
-        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
         fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
         body = fmatch(s[C], out_dom, in_dom, gemv)
-        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
-                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
-        stmt = tvm.schedule.ScheduleOps(s, dom_map)
+        assert tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.CanonicalSimplify(body[0]),
+                                     tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
+        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
         tvm.lower(s, [A, B, C])
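The checks above verify the tensorize pattern: split off a fixed-size inner loop, then replace it with a declared intrinsic. A self-contained sketch of that pattern; the packed-function name "vadd" here is a placeholder resolved at run time, exactly as in the tests:

import tvm
from tvm import te

def intrin_add(n):
    # Declare what the intrinsic computes, then how to emit a call for it.
    x = te.placeholder((n,), name="x")
    y = te.placeholder((n,), name="y")
    z = te.compute((n,), lambda i: x[i] + y[i], name="z")

    def intrin_func(ins, outs):
        return tvm.tir.call_packed("vadd", ins[0], ins[1], outs[0])

    with tvm.target.build_config(offset_factor=16):
        return te.decl_tensor_intrin(z.op, intrin_func)

m = 128
x = te.placeholder((m,), name="x")
y = te.placeholder((m,), name="y")
z = te.compute((m,), lambda i: x[i] + y[i], name="z")
s = te.create_schedule(z.op)
xo, xi = s[z].split(z.op.axis[0], factor=16)
s[z].tensorize(xi, intrin_add(16))  # inner 16-wide loop becomes one call
print(tvm.lower(s, [x, y, z], simple_mode=True))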

     def check_rfactor(factor, rfactor):
-        s = tvm.create_schedule(C.op)
+        s = te.create_schedule(C.op)
         x, y = C.op.axis
         rk = C.op.reduce_axis[0]
         yo, yi = s[C].split(y, factor=factor)
@@ -153,21 +154,21 @@ def check_rfactor(factor, rfactor):
         gemv = intrin_gemv(factor, rfactor)
         s[C].tensorize(yi, gemv)
         s = s.normalize()
-        dom_map = tvm.schedule.InferBound(s)
+        dom_map = tvm.te.schedule.InferBound(s)
         finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
         out_dom, in_dom = finfer(s[C], dom_map)
-        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
-        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
-        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
         fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
         body = fmatch(s[C], out_dom, in_dom, gemv)
-        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
-                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
-        stmt = tvm.schedule.ScheduleOps(s, dom_map)
+        assert tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.CanonicalSimplify(body[0]),
+                                     tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
+        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
         tvm.lower(s, [A, B, C])

     def check_rfactor_no_reset(factor, rfactor):
-        s = tvm.create_schedule(C.op)
+        s = te.create_schedule(C.op)
         x, y = C.op.axis
         rk = C.op.reduce_axis[0]
         yo, yi = s[C].split(y, factor=factor)
@@ -176,21 +177,21 @@ def check_rfactor_no_reset(factor, rfactor):
         gemv = intrin_gemv_no_reset(factor, rfactor)
         s[C].tensorize(yi, gemv)
         s = s.normalize()
-        dom_map = tvm.schedule.InferBound(s)
+        dom_map = tvm.te.schedule.InferBound(s)
         finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
         out_dom, in_dom = finfer(s[C], dom_map)
-        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
-        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
-        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
         fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
         body = fmatch(s[C], out_dom, in_dom, gemv)
-        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
-                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
-        stmt = tvm.schedule.ScheduleOps(s, dom_map)
+        assert tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.CanonicalSimplify(body[0]),
+                                     tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
+        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
         tvm.lower(s, [A, B, C])

     def check_rfactor_no_reset_multi_reduction(factor, rfactor):
-        s = tvm.create_schedule(C.op)
+        s = te.create_schedule(C.op)
         x, y = C.op.axis
         rk = C.op.reduce_axis[0]
         yo, yi = s[C].split(y, factor=factor)
@@ -200,17 +201,17 @@ def check_rfactor_no_reset_multi_reduction(factor, rfactor):
         gemv = intrin_gemv_no_reset(factor, rfactor)
         s[C].tensorize(yi, gemv)
         s = s.normalize()
-        dom_map = tvm.schedule.InferBound(s)
+        dom_map = tvm.te.schedule.InferBound(s)
         finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
         out_dom, in_dom = finfer(s[C], dom_map)
-        assert tvm.ir_pass.Equal(out_dom[x].extent, 1)
-        assert tvm.ir_pass.Equal(out_dom[y].extent, factor)
-        assert tvm.ir_pass.Equal(out_dom[y].min, yo * factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
+        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
         fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
         body = fmatch(s[C], out_dom, in_dom, gemv)
-        assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(body[0]),
-                                 tvm.ir_pass.CanonicalSimplify(gemv.op.body[0]))
-        stmt = tvm.schedule.ScheduleOps(s, dom_map)
+        assert tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.CanonicalSimplify(body[0]),
+                                     tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
+        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
         tvm.lower(s, [A, B, C])

     check(16)
@@ -221,28 +222,28 @@ def check_rfactor_no_reset_multi_reduction(factor, rfactor):

 # This tests whether algorithm and intrinsics expressions are simplified
 # as much as possible first and then checked for equality. See Issue #696
 def test_tensorize_op():
-    idxd = tvm.indexdiv
-    idxm = tvm.indexmod
+    idxd = tvm.tir.indexdiv
+    idxm = tvm.tir.indexmod

     def op_intrin():
         bh = 9
         bw = 9
-        x = tvm.placeholder((5, 5), name='A')
-        y = tvm.compute((bh, bw),
+        x = te.placeholder((5, 5), name='A')
+        y = te.compute((bh, bw),
                         lambda i, j: x[idxd(j,3) + idxm(i,3), idxm(j,3)+ idxd(i,3)])

         def intrin_func(ins, outs):
             xx, = ins
             zz = outs[0]
-            return tvm.call_packed("op", xx, zz)
+            return tvm.tir.call_packed("op", xx, zz)

-        with tvm.build_config(offset_factor=2):
-            return tvm.decl_tensor_intrin(y.op, intrin_func)
+        with tvm.target.build_config(offset_factor=2):
+            return te.decl_tensor_intrin(y.op, intrin_func)

-    A = tvm.placeholder((5, 5), name='A')
-    B = tvm.compute((9,9), lambda i, j: A[idxd(j,3) + idxm(i,3), idxm(j,3) + idxd(i,3)])
+    A = te.placeholder((5, 5), name='A')
+    B = te.compute((9,9), lambda i, j: A[idxd(j,3) + idxm(i,3), idxm(j,3) + idxd(i,3)])
     bt = op_intrin()

-    s = tvm.create_schedule(B.op)
+    s = te.create_schedule(B.op)
     x,y = B.op.axis
     s[B].tensorize(x, bt)
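test_tensorize_op leans on tir.indexdiv/indexmod, the floor-based division and modulus used for index arithmetic. A two-line sketch of the identity they satisfy for non-negative operands, checked with the same CanonicalSimplify pass the tests use:

import tvm
from tvm import te

idxd = tvm.tir.indexdiv
idxm = tvm.tir.indexmod
i = te.var("i")
# For non-negative i, (i div 3) * 3 + (i mod 3) reconstructs i.
expr = idxd(i, 3) * 3 + idxm(i, 3)
print(tvm.tir.ir_pass.CanonicalSimplify(expr))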
@@ -255,16 +256,16 @@ def test_tensorize_tensor_compute_op():
     # an intrinsic called "multivadd" whose definition (pattern)
     # is a loop of another intrinsic called "vadd"
     def intrin_multivadd(n):
-        n_a = tvm.var("n_a")
-        Ab = tvm.decl_buffer((n, ), tvm.float32, strides=[n_a])
+        n_a = te.var("n_a")
+        Ab = tvm.tir.decl_buffer((n, ), "float32", strides=[n_a])

-        n_b = tvm.var("n_b")
-        Bb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_b])
+        n_b = te.var("n_b")
+        Bb = tvm.tir.decl_buffer((n, ), "float32", strides=[n_b])

-        n_c = tvm.var("n_c")
-        Cb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_c])
+        n_c = te.var("n_c")
+        Cb = tvm.tir.decl_buffer((n, ), "float32", strides=[n_c])

-        z = tvm.compute((n,), lambda i: tvm.call_extern("float32", 'vadd',
+        z = te.compute((n,), lambda i: tvm.tir.call_extern("float32", 'vadd',
                                                          Ab.access_ptr("w", offset=n_a*i),
                                                          Bb.access_ptr("r", offset=n_b*i),
                                                          Cb.access_ptr("r", offset=n_c*i)))
@@ -272,32 +273,32 @@ def intrin_multivadd(n):
         # replace the pattern with the multivadd call. I need to figure out
         # how to pass it the right parameters.
         def intrin_func(ins, outs):
-            return tvm.call_packed("multivadd")
+            return tvm.tir.call_packed("multivadd")

-        with tvm.build_config():
-            return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd")
+        with tvm.target.build_config():
+            return te.decl_tensor_intrin(z.op, intrin_func, name="multivadd")

     def intrin_vadd(n):
         dtype = 'float32'
-        x = tvm.placeholder((n,), dtype=dtype, name='vx')
-        y = tvm.placeholder((n,), dtype=dtype, name='vy')
-        z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
-        s = tvm.create_schedule(z.op)
+        x = te.placeholder((n,), dtype=dtype, name='vx')
+        y = te.placeholder((n,), dtype=dtype, name='vy')
+        z = te.compute(x.shape, lambda i: x[i] + y[i], name='z')
+        s = te.create_schedule(z.op)

         def create_buffer(t):
-            return tvm.decl_buffer(t.shape, t.dtype,
+            return tvm.tir.decl_buffer(t.shape, t.dtype,
                                    name='W'+t.name,
                                    offset_factor=16)

         def intrin_func(ins, outs):
-            ib = tvm.ir_builder.create()
-            ib.emit(tvm.call_extern("float32", 'vadd',
+            ib = tvm.tir.ir_builder.create()
+            ib.emit(tvm.tir.call_extern("float32", 'vadd',
                                     ins[0].access_ptr("r"), ins[1].access_ptr('r'),
                                     outs[0].access_ptr('wr')))
             return ib.get()

-        with tvm.build_config(offset_factor=16):
-            return tvm.decl_tensor_intrin(z.op, intrin_func, binds={x: create_buffer(x),
+        with tvm.target.build_config(offset_factor=16):
+            return te.decl_tensor_intrin(z.op, intrin_func, binds={x: create_buffer(x),
                                                                     y: create_buffer(y),
                                                                     z: create_buffer(z)})

@@ -306,19 +307,19 @@ def intrin_func(ins, outs):
     M = 1024
     factor = 16
     dtype = 'float32'
-    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
-    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
+    A = te.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = te.placeholder((M//factor, factor), name="B", dtype=dtype)

     vadd = intrin_vadd(factor)
-    C = tvm.compute((M//factor, factor),
+    C = te.compute((M//factor, factor),
                     lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C')

-    s = tvm.create_schedule(C.op)
+    s = te.create_schedule(C.op)
     multivadd = intrin_multivadd(64)
     s[C].tensorize(C.op.axis[0], multivadd)
     s = s.normalize()
-    dom_map = tvm.schedule.InferBound(s)
-    stmt = tvm.schedule.ScheduleOps(s, dom_map)
+    dom_map = tvm.te.schedule.InferBound(s)
+    stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
     # The loop that we tried to tensorize still exists in the code
     # That means tensorize didn't work as expected
     assert isinstance(stmt.body.body.body, tvm.tir.For)
diff --git a/tests/python/unittest/test_testing.py b/tests/python/unittest/test_testing.py
index b17d8893a955..ecf520d251f1 100644
--- a/tests/python/unittest/test_testing.py
+++ b/tests/python/unittest/test_testing.py
@@ -16,6 +16,7 @@
 # under the License.
 import numpy as np
 import tvm
+from tvm import te
 from tvm.testing import check_numerical_grads

 def test_check_numerical_grads():
diff --git a/tests/python/unittest/test_tvm_intrin.py b/tests/python/unittest/test_tvm_intrin.py
index 23e921d3f1ce..5bb1c6538750 100644
--- a/tests/python/unittest/test_tvm_intrin.py
+++ b/tests/python/unittest/test_tvm_intrin.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
+from tvm import te
 import topi
 from tvm.contrib import util, clang
 import numpy as np
@@ -23,10 +24,10 @@

 def test_nearbyint():
-    m = tvm.var("m",)
-    A = tvm.placeholder((m,), name='A')
-    A_rounded = tvm.compute((m,), lambda *i: tvm.nearbyint(A(*i)), name='A')
-    s = tvm.create_schedule(A_rounded.op)
+    m = te.var("m")
+    A = te.placeholder((m,), name='A')
+    A_rounded = te.compute((m,), lambda *i: tvm.tir.nearbyint(A(*i)), name='A')
+    s = te.create_schedule(A_rounded.op)
     f = tvm.build(s, [A, A_rounded], "llvm")
     ctx = tvm.cpu(0)
     n = 10
diff --git a/tests/web/prepare_test_libs.py b/tests/web/prepare_test_libs.py
index ada40e66f757..a0e2c13eab82 100644
--- a/tests/web/prepare_test_libs.py
+++ b/tests/web/prepare_test_libs.py
@@ -16,6 +16,7 @@
 # under the License.
 # Prepare test library for js.
 import tvm
+from tvm import te
 from tvm.contrib import emscripten
 import os

@@ -23,10 +24,10 @@ def prepare_test_libs(base_path):
     target = "llvm -target=asmjs-unknown-emscripten -system-lib"
     if not tvm.runtime.enabled(target):
         raise RuntimeError("Target %s is not enabled" % target)
-    n = tvm.var("n")
-    A = tvm.placeholder((n,), name='A')
-    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
-    s = tvm.create_schedule(B.op)
+    n = te.var("n")
+    A = te.placeholder((n,), name='A')
+    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    s = te.create_schedule(B.op)
     fadd1 = tvm.build(s, [A, B], target, name="add_one")
     obj_path = os.path.join(base_path, "test_add_one.bc")
     fadd1.save(obj_path)
diff --git a/tests/web/websock_rpc_test.py b/tests/web/websock_rpc_test.py
index 92b0ad350bc0..8be8ce04cb75 100644
--- a/tests/web/websock_rpc_test.py
+++ b/tests/web/websock_rpc_test.py
@@ -21,6 +21,7 @@
 """
 import tvm
+from tvm import te
 import os
 from tvm import rpc
 from tvm.contrib import util, emscripten
@@ -33,10 +34,10 @@ def test_rpc_array():
     if not tvm.runtime.enabled("rpc"):
         return
     # graph
-    n = tvm.convert(1024)
-    A = tvm.placeholder((n,), name='A')
-    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
-    s = tvm.create_schedule(B.op)
+    n = tvm.runtime.convert(1024)
+    A = te.placeholder((n,), name='A')
+    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    s = te.create_schedule(B.op)
     remote = rpc.connect(proxy_host, proxy_port, key="js")
     target = "llvm -target=asmjs-unknown-emscripten -system-lib"
     def check_remote():
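The web, RPC, and WebGL tests that follow all share one build-and-run loop. Its plain local-CPU form, as a minimal sketch under the same te API:

import numpy as np
import tvm
from tvm import te

# Declare, schedule, build for llvm, run on CPU, and check against numpy.
n = 16
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
s = te.create_schedule(B.op)
fadd = tvm.build(s, [A, B], "llvm", name="add_one")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
fadd(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)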
import tvm +from tvm import te import numpy as np def test_local_gemm(): @@ -24,17 +25,17 @@ def test_local_gemm(): return nn = 1024 - n = tvm.var('n') - n = tvm.convert(nn) + n = te.var('n') + n = tvm.runtime.convert(nn) m = n l = n - A = tvm.placeholder((n, l), name='A', dtype='int32') - B = tvm.placeholder((m, l), name='B', dtype='int32') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute((n, m), lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k), + A = te.placeholder((n, l), name='A', dtype='int32') + B = te.placeholder((m, l), name='B', dtype='int32') + k = te.reduce_axis((0, l), name='k') + C = te.compute((n, m), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name='CC') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) s[C].opengl() print(tvm.lower(s, [A, B, C], simple_mode=True)) diff --git a/tests/webgl/test_local_multi_stage.py b/tests/webgl/test_local_multi_stage.py index 578639962bb7..54a554b74ed9 100644 --- a/tests/webgl/test_local_multi_stage.py +++ b/tests/webgl/test_local_multi_stage.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np def test_local_multi_stage(): @@ -23,12 +24,12 @@ def test_local_multi_stage(): if not tvm.runtime.enabled("llvm"): return - n = tvm.var("n") - A = tvm.placeholder((n,), name='A', dtype="int32") - B = tvm.compute((n,), lambda i: A[i] + 1, name="B") - C = tvm.compute((n,), lambda i: B[i] * 2, name="C") + n = te.var("n") + A = te.placeholder((n,), name='A', dtype="int32") + B = te.compute((n,), lambda i: A[i] + 1, name="B") + C = te.compute((n,), lambda i: B[i] * 2, name="C") - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) s[B].opengl() s[C].opengl() diff --git a/tests/webgl/test_local_save_load.py b/tests/webgl/test_local_save_load.py index 0a63a77cf52e..cca68020c0c2 100644 --- a/tests/webgl/test_local_save_load.py +++ b/tests/webgl/test_local_save_load.py @@ -16,6 +16,7 @@ # under the License. 
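The same relocation applies to the build-config context manager used by the conv2d test below: tvm.build_config becomes tvm.target.build_config, with its keyword arguments unchanged. A sketch, where the elementwise compute is only a stand-in for the real conv2d workload:

    import tvm
    from tvm import te

    A = te.placeholder((1024,), name="A")
    B = te.compute(A.shape, lambda i: A[i] * 2.0, name="B")
    s = te.create_schedule(B.op)
    # was: with tvm.build_config(auto_unroll_max_step=1400, ...)
    with tvm.target.build_config(auto_unroll_max_step=1400,
                                 unroll_explicit=True):
        f = tvm.build(s, [A, B], "llvm")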
import numpy as np import tvm +from tvm import te from tvm import rpc from tvm.contrib import util, emscripten @@ -25,11 +26,11 @@ def test_local_save_load(): if not tvm.runtime.enabled("llvm"): return - n = tvm.var("n") - A = tvm.placeholder((n,), name='A', dtype='int32') - B = tvm.placeholder((n,), name='B', dtype='int32') - C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = tvm.create_schedule(C.op) + n = te.var("n") + A = te.placeholder((n,), name='A', dtype='int32') + B = te.placeholder((n,), name='B', dtype='int32') + C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + s = te.create_schedule(C.op) s[C].opengl() f = tvm.build(s, [A, B, C], "opengl", target_host="llvm", name="myadd") diff --git a/tests/webgl/test_local_topi_conv2d_nchw.py b/tests/webgl/test_local_topi_conv2d_nchw.py index c03d9dcc9007..0d9b7776096a 100644 --- a/tests/webgl/test_local_topi_conv2d_nchw.py +++ b/tests/webgl/test_local_topi_conv2d_nchw.py @@ -20,6 +20,7 @@ import os import numpy as np import tvm +from tvm import te import topi from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple @@ -27,8 +28,8 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') B = topi.nn.conv2d_nchw(A, W, stride, padding) C = topi.nn.relu(B) @@ -59,7 +60,7 @@ def check_device(device): w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) - with tvm.build_config(auto_unroll_max_step=1400, + with tvm.target.build_config(auto_unroll_max_step=1400, unroll_explicit=(device != "cuda")): func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding)) diff --git a/tests/webgl/test_local_topi_dense.py b/tests/webgl/test_local_topi_dense.py index d57bfd20f186..60dfe1ff690f 100644 --- a/tests/webgl/test_local_topi_dense.py +++ b/tests/webgl/test_local_topi_dense.py @@ -20,15 +20,16 @@ """ import numpy as np import tvm +from tvm import te import topi from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize def verify_dense(batch, in_dim, out_dim, use_bias=True): - A = tvm.placeholder((batch, in_dim), name='A') - B = tvm.placeholder((out_dim, in_dim), name='B') - C = tvm.placeholder((out_dim,), name='C') + A = te.placeholder((batch, in_dim), name='A') + B = te.placeholder((out_dim, in_dim), name='B') + C = te.placeholder((out_dim,), name='C') D = topi.nn.dense(A, B, C if use_bias else None) D = topi.nn.relu(D) dtype = A.dtype diff --git a/tests/webgl/test_local_topi_pooling.py b/tests/webgl/test_local_topi_pooling.py index c1b66604c6a7..3adae7bba51c 100644 --- a/tests/webgl/test_local_topi_pooling.py +++ b/tests/webgl/test_local_topi_pooling.py @@ -20,6 +20,7 @@ """ import numpy as np import tvm +from tvm import te import topi import math from topi.util import get_const_tuple @@ -29,7 +30,7 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode): kw = kh sw = sh 
ph, pw = padding - A = tvm.placeholder((n, ic, ih, iw), name='A') + A = te.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode) B = topi.nn.relu(B) @@ -91,7 +92,7 @@ def test_pool(): def verify_global_pool(n, c, h, w, pool_type): - A = tvm.placeholder((n, c, h, w), name='A') + A = te.placeholder((n, c, h, w), name='A') B = topi.nn.global_pool(A, pool_type=pool_type) B = topi.nn.relu(B) diff --git a/tests/webgl/test_local_topi_softmax.py b/tests/webgl/test_local_topi_softmax.py index 5d9ed9345e76..c0ddbf21419a 100644 --- a/tests/webgl/test_local_topi_softmax.py +++ b/tests/webgl/test_local_topi_softmax.py @@ -22,15 +22,16 @@ import os import numpy as np import tvm +from tvm import te import topi import logging from topi.util import get_const_tuple def verify_softmax(m, n): - A = tvm.placeholder((m, n), name='A') + A = te.placeholder((m, n), name='A') B = topi.nn.softmax(A) # confirm lower works - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) @@ -59,10 +60,10 @@ def test_softmax(): def verify_log_softmax(m, n): - A = tvm.placeholder((m, n), name='A') + A = te.placeholder((m, n), name='A') B = topi.nn.log_softmax(A) # confirm lower works - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.log_softmax_python(a_np) diff --git a/tests/webgl/test_remote_save_load.py b/tests/webgl/test_remote_save_load.py index 1e2ca0f8dd5c..34bbb3fa0f00 100644 --- a/tests/webgl/test_remote_save_load.py +++ b/tests/webgl/test_remote_save_load.py @@ -30,6 +30,7 @@ import numpy as np import tvm +from tvm import te from tvm import rpc from tvm.contrib import util, emscripten @@ -45,11 +46,11 @@ def try_remote_save_load(): return # Build the module. - n = tvm.var("n") - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = tvm.create_schedule(C.op) + n = te.var("n") + A = te.placeholder((n,), name='A') + B = te.placeholder((n,), name='B') + C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") + s = te.create_schedule(C.op) s[C].opengl() target_host = "llvm -target=asmjs-unknown-emscripten -system-lib" f = tvm.build(s, [A, B, C], "opengl", target_host=target_host, name="myadd") diff --git a/tests/webgl/test_static_webgl_library.py b/tests/webgl/test_static_webgl_library.py index 365f821845ae..929da4ca294c 100644 --- a/tests/webgl/test_static_webgl_library.py +++ b/tests/webgl/test_static_webgl_library.py @@ -20,6 +20,7 @@ import os, shutil, SimpleHTTPServer, SocketServer import tvm +from tvm import te from tvm.contrib import emscripten, util import numpy as np @@ -30,11 +31,11 @@ def try_static_webgl_library(): os.chdir(os.path.join(curr_path, "../../lib")) # Create OpenGL module. 
- n = tvm.var("n") - A = tvm.placeholder((n,), name='A', dtype="float") - B = tvm.compute((n,), lambda *i: A[i], name="B") + n = te.var("n") + A = te.placeholder((n,), name='A', dtype="float") + B = te.compute((n,), lambda *i: A[i], name="B") - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) s[B].opengl() target_host = "llvm -target=asmjs-unknown-emscripten -system-lib" diff --git a/topi/python/topi/argwhere.py b/topi/python/topi/argwhere.py index c2a9adea0c2a..7d8429b95904 100644 --- a/topi/python/topi/argwhere.py +++ b/topi/python/topi/argwhere.py @@ -24,12 +24,12 @@ def hybrid_argwhere_1d(output_shape, condition): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor 1-D tensor with boolean values. Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Indices of non-zero elements. """ a = output_tensor(output_shape, "int32") @@ -47,12 +47,12 @@ def hybrid_argwhere_2d(output_shape, condition): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor 2-D tensor with boolean values. Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Indices of non-zero elements. """ a = output_tensor(output_shape, "int32") @@ -73,12 +73,12 @@ def hybrid_argwhere_3d(output_shape, condition): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor 3-D tensor with boolean values. Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Indices of non-zero elements. """ a = output_tensor(output_shape, "int32") @@ -102,12 +102,12 @@ def hybrid_argwhere_4d(output_shape, condition): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor 4-D tensor with boolean values. Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Indices of non-zero elements. """ a = output_tensor(output_shape, "int32") @@ -134,12 +134,12 @@ def hybrid_argwhere_5d(output_shape, condition): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor 5-D tensor with boolean values. Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Indices of non-zero elements. """ a = output_tensor(output_shape, "int32") @@ -168,12 +168,12 @@ def argwhere(output_shape, condition): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor Tensor with boolean values. Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Indices of non-zero elements. """ if len(condition.shape) == 1: diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index d28ec09925c2..bdda496f8fb8 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -18,6 +18,7 @@ """Bitserial conv2d schedule on arm cpu""" from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from tvm import relay from .. 
import tag @@ -34,8 +35,8 @@ def _kernel_vec_spatial_pack_nhwc(kernel, kernel_bits, VC, use_bitpack=True): kernel_q = kernel KH, KW, KB, CI, CO = kernel_q.shape kvshape = (CO//VC, KH, KW, KB, VC, CI) - return tvm.compute(kvshape, lambda co, dh, dw, b, vc, ci: \ - kernel_q[dh][dw][b][ci][co*VC+vc], name='kernel_vec') + return te.compute(kvshape, lambda co, dh, dw, b, vc, ci: \ + kernel_q[dh][dw][b][ci][co*VC+vc], name='kernel_vec') @autotvm.register_topi_compute("bitserial_conv2d_nhwc.arm_cpu") def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, activation_bits, weight_bits, @@ -69,8 +70,8 @@ def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, activation_bits, w OW = (PAD_W - KW) // WSTR + 1 oshape = (1, OH, OW, CO) - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod # Pad input channels of weights and data when it is not a multiple of 8 if CI_packed % 8 != 0: @@ -108,7 +109,7 @@ def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, activation_bits, w data_q = bitpack(data, activation_bits, pack_axis=3, bit_axis=3, pack_type='uint8') kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC, len(kernel.shape) == 4) - idxm = tvm.indexmod + idxm = tvm.tir.indexmod if idxm(kernel_vec.shape[-1], 8) != 0 and CI_PAD != 0: kernel_vec = pad(kernel_vec, [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, CI_PAD]) @@ -125,78 +126,79 @@ def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, activation_bits, w else: data_pad = data_q - data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ - data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - ib = tvm.reduce_axis((0, IB), name='ib') - kb = tvm.reduce_axis((0, KB), name='kb') + data_vec = te.compute(dvshape, lambda n, h, w, vh, vw, b, ci: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][b][ci], name='data_vec') + ci = te.reduce_axis((0, CI), name='ci') + dh = te.reduce_axis((0, KH), name='dh') + dw = te.reduce_axis((0, KW), name='dw') + ib = te.reduce_axis((0, IB), name='ib') + kb = te.reduce_axis((0, KB), name='kb') def _bipolar_conv(n, h, w, co, vh, vw, vc): - return tvm.sum((tvm.popcount( + return te.sum((tvm.tir.popcount( kernel_vec[co, dh, dw, kb, vc, ci].astype('uint16') & data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('uint16')) - << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) + << (kb + ib).astype('uint16')), axis=[dh, dw, kb, ib, ci]) def _unipolar_conv(n, h, w, co, vh, vw, vc): - return tvm.sum( - ((tvm.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype('int16') & - data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('int16')) - - tvm.popcount(~kernel_vec[co, dh, dw, kb, vc, ci].astype('int16') & - data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci]).astype('int16')) + return te.sum( + ((tvm.tir.popcount(kernel_vec[co, dh, dw, kb, vc, ci].astype('int16') & + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci].astype('int16')) - + tvm.tir.popcount(~kernel_vec[co, dh, dw, kb, vc, ci].astype('int16') & + data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ib, ci]).astype('int16')) << (kb + ib).astype('int16')), axis=[dh, dw, kb, ib, ci]) if unipolar: - conv_vec = tvm.compute(ovshape, _unipolar_conv, name='conv_vec', tag='unipolar') + conv_vec = te.compute(ovshape, _unipolar_conv, name='conv_vec', tag='unipolar') else: - conv_vec = tvm.compute(ovshape, _bipolar_conv, name='conv_vec', tag='bipolar') + conv_vec = 
te.compute(ovshape, _bipolar_conv, name='conv_vec', tag='bipolar') - conv = tvm.compute(oshape, - lambda n, h, w, co: - conv_vec[n, - idxd(h, VH), idxd(w, VW), idxd(co, VC), - idxm(h, VH), idxm(w, VW), idxm(co, VC)].astype(out_dtype), - name='conv', tag='spatial_bitserial_conv_nhwc') + conv = te.compute(oshape, + lambda n, h, w, co: + conv_vec[n, + idxd(h, VH), idxd(w, VW), idxd(co, VC), + idxm(h, VH), idxm(w, VW), idxm(co, VC)].astype(out_dtype), + name='conv', tag='spatial_bitserial_conv_nhwc') return conv def _intrin_popcount(m, k_i, w_b, x_b, unipolar): pack_dtype = 'uint8' - w = tvm.placeholder((w_b, m, k_i), dtype=pack_dtype, name='w') - x = tvm.placeholder((x_b, k_i,), dtype=pack_dtype, name='x') - k = tvm.reduce_axis((0, k_i), name='k') - bw = tvm.reduce_axis((0, w_b), name='bw') - bx = tvm.reduce_axis((0, x_b), name='bx') + w = te.placeholder((w_b, m, k_i), dtype=pack_dtype, name='w') + x = te.placeholder((x_b, k_i,), dtype=pack_dtype, name='x') + k = te.reduce_axis((0, k_i), name='k') + bw = te.reduce_axis((0, w_b), name='bw') + bx = te.reduce_axis((0, x_b), name='bx') if unipolar: dtype = 'int16' - z = tvm.compute((m,), lambda i: - tvm.sum((tvm.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype)) - - tvm.popcount(~w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype))) - << (bw+bx).astype(dtype), axis=[bw, bx, k]), name='z') + z = te.compute( + (m,), lambda i: + te.sum((tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype)) - + tvm.tir.popcount(~w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype))) + << (bw+bx).astype(dtype), axis=[bw, bx, k]), name='z') else: dtype = 'uint16' - z = tvm.compute((m,), lambda i: - tvm.sum(tvm.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype)) - << (bw+bx).astype(dtype), axis=[bw, bx, k]), name='z') - Wb = tvm.decl_buffer(w.shape, w.dtype, - name="W", - offset_factor=k_i, - strides=[tvm.var('ldw'), tvm.var('ldw'), 1]) # stride can be inferred - Xb = tvm.decl_buffer(x.shape, x.dtype, - name="X", - offset_factor=k_i, - strides=[tvm.var('ldw'), 1]) - Zb = tvm.decl_buffer(z.shape, z.dtype, - name="Z", - offset_factor=1, - strides=[1]) + z = te.compute((m,), lambda i: + te.sum(tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(dtype)) + << (bw+bx).astype(dtype), axis=[bw, bx, k]), name='z') + Wb = tvm.tir.decl_buffer(w.shape, w.dtype, + name="W", + offset_factor=k_i, + strides=[te.var('ldw'), te.var('ldw'), 1]) # stride can be inferred + Xb = tvm.tir.decl_buffer(x.shape, x.dtype, + name="X", + offset_factor=k_i, + strides=[te.var('ldw'), 1]) + Zb = tvm.tir.decl_buffer(z.shape, z.dtype, + name="Z", + offset_factor=1, + strides=[1]) def _intrin_func(ins, outs): ww, xx = ins zz = outs[0] - args_1 = tvm.const(1, 'uint32') - args_2 = tvm.const(2, 'uint32') + args_1 = tvm.tir.const(1, 'uint32') + args_2 = tvm.tir.const(2, 'uint32') if unipolar: vpadd = "llvm.arm.neon.vpadd.v8i8" @@ -212,9 +214,9 @@ def _intrin_func(ins, outs): return_dtype = 'uint16x8' def _instr(index): - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() if index == 1: # reduce reset - irb.emit(zz.vstore(0, tvm.const(0, return_dtype))) + irb.emit(zz.vstore(0, tvm.tir.const(0, return_dtype))) return irb.get() # body and reduce update cnts8 = [None] * 8 @@ -227,46 +229,50 @@ def _instr(index): w_ = ww.vload([bw, i, 0], 'uint8x16').astype(full_dtype) x_ = xx.vload([bx, 0], 'uint8x16').astype(full_dtype) if unipolar: - cnts = tvm.popcount(w_ & x_) - tvm.popcount(~w_ & x_) + cnts = tvm.tir.popcount(w_ & x_) - tvm.tir.popcount(~w_ & x_) else: - 
cnts = tvm.popcount(w_ & x_) - upper_half = tvm.call_pure_intrin(half_dtype, 'vectorhigh', cnts) - lower_half = tvm.call_pure_intrin(half_dtype, 'vectorlow', cnts) + cnts = tvm.tir.popcount(w_ & x_) + upper_half = tvm.tir.call_pure_intrin(half_dtype, 'vectorhigh', cnts) + lower_half = tvm.tir.call_pure_intrin(half_dtype, 'vectorlow', cnts) cnts8[i] = upper_half + lower_half for i in range(m//2): - cnts4[i] = tvm.call_llvm_intrin(half_dtype, vpadd, - args_1, cnts8[i*2], cnts8[i*2+1]) + cnts4[i] = tvm.tir.call_llvm_intrin(half_dtype, vpadd, + args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): - cnts2[i] = tvm.call_llvm_intrin(half_dtype, vpadd, - args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin(full_dtype, 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << tvm.const(bw+bx, pack_dtype) - out = tvm.call_llvm_intrin(return_dtype, vpadalu, - args_2, zz.vload(0, return_dtype), shifted_cnts) + cnts2[i] = tvm.tir.call_llvm_intrin(half_dtype, vpadd, + args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.tir.call_pure_intrin( + full_dtype, 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << tvm.tir.const(bw+bx, pack_dtype) + out = tvm.tir.call_llvm_intrin( + return_dtype, vpadalu, + args_2, zz.vload(0, return_dtype), shifted_cnts) else: # ki == 8 for i in range(m): w_ = ww.vload([bw, i, 0], 'uint8x8').astype(half_dtype) x_ = xx.vload([bx, 0], 'uint8x8').astype(half_dtype) if unipolar: - cnts8[i] = tvm.popcount(w_ & x_) - tvm.popcount(~w_ & x_) + cnts8[i] = tvm.tir.popcount(w_ & x_) - tvm.tir.popcount(~w_ & x_) else: - cnts8[i] = tvm.popcount(w_ & x_) + cnts8[i] = tvm.tir.popcount(w_ & x_) for i in range(m//2): - cnts4[i] = tvm.call_llvm_intrin(half_dtype, vpadd, - args_1, cnts8[i*2], cnts8[i*2+1]) + cnts4[i] = tvm.tir.call_llvm_intrin(half_dtype, vpadd, + args_1, cnts8[i*2], cnts8[i*2+1]) for i in range(m//4): - cnts2[i] = tvm.call_llvm_intrin(half_dtype, vpadd, - args_1, cnts4[i*2], cnts4[i*2+1]) - cnts = tvm.call_pure_intrin(full_dtype, 'vectorcombine', cnts2[0], cnts2[1]) - shifted_cnts = cnts << tvm.const(bw+bx, pack_dtype) - out = tvm.call_llvm_intrin(return_dtype, vpadalu, - args_2, zz.vload(0, return_dtype), shifted_cnts) + cnts2[i] = tvm.tir.call_llvm_intrin(half_dtype, vpadd, + args_1, cnts4[i*2], cnts4[i*2+1]) + cnts = tvm.tir.call_pure_intrin( + full_dtype, 'vectorcombine', cnts2[0], cnts2[1]) + shifted_cnts = cnts << tvm.tir.const(bw+bx, pack_dtype) + out = tvm.tir.call_llvm_intrin( + return_dtype, vpadalu, + args_2, zz.vload(0, return_dtype), shifted_cnts) irb.emit(zz.vstore(0, out)) return irb.get() # body, reset, update return _instr(0), _instr(1), _instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x:Xb, z:Zb}) + with tvm.target.build_config(offset_factor=1, partition_const_loop=True): + return te.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x:Xb, z:Zb}) # ARM specific schedule that using custom microkernel def _schedule_spatial_conv2d_nhwc(cfg, s, data_pad, data_vec, kernel_vec, @@ -325,7 +331,7 @@ def _schedule_spatial_conv2d_nhwc(cfg, s, data_pad, data_vec, kernel_vec, @autotvm.register_topi_schedule("bitserial_conv2d_nhwc.arm_cpu") def schedule_bitserial_conv2d_nhwc(cfg, outs): """Arm cpu schedule for bitserial conv2d""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(op): @@ -335,7 +341,7 @@ def traverse(op): if op not in s.outputs: s[op].compute_inline() for tensor in 
op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) if 'spatial_bitserial_conv_nhwc' in op.tag: @@ -347,7 +353,7 @@ def traverse(op): data_q = data_vec.op.input_tensors[0] data = data_q.op.input_tensors[0] data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + if isinstance(data_q.op, te.tensor.ComputeOp) and "pad" in data_q.op.tag: data_pad = data_q data_q = data data = data.op.input_tensors[0] diff --git a/topi/python/topi/arm_cpu/bitserial_dense.py b/topi/python/topi/arm_cpu/bitserial_dense.py index 3f1889c8d7ff..beed79da49d0 100644 --- a/topi/python/topi/arm_cpu/bitserial_dense.py +++ b/topi/python/topi/arm_cpu/bitserial_dense.py @@ -18,6 +18,7 @@ """Schedule for bitserial dense operator.""" from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from topi.util import get_const_tuple from .. import tag @@ -32,15 +33,15 @@ def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_d Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) @@ -83,23 +84,23 @@ def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype, out_d wvshape = (out_dim//VY, in_dim//VK, WB, VY, VK) oshape = (batch, out_dim) - k = tvm.reduce_axis((0, in_dim), name='k') - db = tvm.reduce_axis((0, DB), name='db') - wb = tvm.reduce_axis((0, WB), name='wb') + k = te.reduce_axis((0, in_dim), name='k') + db = te.reduce_axis((0, DB), name='db') + wb = te.reduce_axis((0, WB), name='wb') # Tile data and weights - weight_vec = tvm.compute(wvshape, lambda yo, ko, wb, vy, vk: - weight_packed[yo*VY+vy][wb][ko*VK+vk], name='weight_vec') - matmul_unipolar = tvm.compute(oshape, lambda x, y: tvm.sum( - (tvm.popcount(weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & - data_packed[x, db, k].astype(out_dtype)) - - tvm.popcount(~weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & - data_packed[x, db, k].astype(out_dtype))) + weight_vec = te.compute(wvshape, lambda yo, ko, wb, vy, vk: + weight_packed[yo*VY+vy][wb][ko*VK+vk], name='weight_vec') + matmul_unipolar = te.compute(oshape, lambda x, y: te.sum( + (tvm.tir.popcount(weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & + data_packed[x, db, k].astype(out_dtype)) - + tvm.tir.popcount(~weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & + data_packed[x, db, k].astype(out_dtype))) << (wb+db).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') - matmul = tvm.compute(oshape, lambda x, y: tvm.sum( - tvm.popcount(weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & - data_packed[x, db, k].astype(out_dtype)) + matmul = te.compute(oshape, lambda x, y: te.sum( + tvm.tir.popcount(weight_vec[y//VY, k//VK, wb, y%VY, k%VK].astype(out_dtype) & + data_packed[x, db, k].astype(out_dtype)) << (wb+db).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') cfg.add_flop(batch * out_dim * in_dim * binary_op_multiplier(pack_dtype)) @@ -124,8 +125,8 @@ def schedule_bitserial_dense(cfg, outs): s: Schedule The computation schedule for bitserial_dense. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(cfg, s, data_vec, weight_vec, output, unipolar): @@ -162,7 +163,7 @@ def traverse(op): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) elif op.tag == 'bitserial_dense' or 'bitserial_dense_unipolar': diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index 2144d260c5b1..25b338e06b5f 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm import tvm.contrib.nnpack @@ -27,9 +28,9 @@ from ..nn.util import get_const_int, get_pad_tuple from ..nn.winograd_util import winograd_transform_matrices from .conv2d_spatial_pack import conv2d_spatial_pack_nchw, \ - conv2d_spatial_pack_nhwc, \ - schedule_conv2d_spatial_pack_nchw, \ - schedule_conv2d_spatial_pack_nhwc + conv2d_spatial_pack_nhwc, \ + schedule_conv2d_spatial_pack_nchw, \ + schedule_conv2d_spatial_pack_nhwc @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @@ -42,7 +43,7 @@ def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_ @autotvm.register_topi_schedule("conv2d_nchw_spatial_pack.arm_cpu") def schedule_conv2d_nchw_spatial_pack(cfg, outs): """Create schedule for conv2d_nchw""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): # schedule conv2d @@ -59,7 +60,7 @@ def _callback(op): kernel = kernel_vec.op.input_tensors[0] else: kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, @@ -79,7 +80,7 @@ def conv2d_nhwc_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_ @autotvm.register_topi_schedule("conv2d_nhwc_spatial_pack.arm_cpu") def schedule_conv2d_nhwc_spatial_pack(cfg, outs): """Create schedule for conv2d_nhwc""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'spatial_conv_output_NHWC' in op.tag: @@ -100,7 +101,7 @@ def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtyp @autotvm.register_topi_schedule("conv2d_nchw_winograd.arm_cpu") def schedule_conv2d_nchw_winograd(cfg, outs): """Create schedule for conv2d_nchw_winograd""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'winograd_conv2d_output' in op.tag: @@ -136,8 +137,8 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, til assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod r = KW m = tile_size @@ -158,48 +159,48 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, til VK = cfg['tile_k'].size[-1] # pack input tile - input_tile = tvm.compute((C, idxd(P, VP), alpha, alpha, VP), - lambda c, b, eps, nu, bb: - 
data_pad[idxd(b*VP + bb, nH*nW), c, - idxm(idxd(b*VP + bb, nW), nH) * m + eps, - idxm(b*VP + bb, nW) * m + nu], - name='d') + input_tile = te.compute((C, idxd(P, VP), alpha, alpha, VP), + lambda c, b, eps, nu, bb: + data_pad[idxd(b*VP + bb, nH*nW), c, + idxm(idxd(b*VP + bb, nW), nH) * m + eps, + idxm(b*VP + bb, nW) * m + nu], + name='d') # transform kernel if pre_computed: U = kernel else: - r_kh = tvm.reduce_axis((0, KH), 'r_kh') - r_kw = tvm.reduce_axis((0, KW), 'r_kw') - U = tvm.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk: - tvm.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) * - G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') + r_kh = te.reduce_axis((0, KH), 'r_kh') + r_kw = te.reduce_axis((0, KW), 'r_kw') + U = te.compute((alpha, alpha, idxd(K, VK), C, VK), lambda eps, nu, k, c, kk: + te.sum(kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype) * + G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image - r_eps = tvm.reduce_axis((0, alpha), 'r_eps') - r_nu = tvm.reduce_axis((0, alpha), 'r_nu') - V = tvm.compute((alpha, alpha, idxd(P, VP), C, VP), lambda eps, nu, b, c, bb: - tvm.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * - B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V') + r_eps = te.reduce_axis((0, alpha), 'r_eps') + r_nu = te.reduce_axis((0, alpha), 'r_nu') + V = te.compute((alpha, alpha, idxd(P, VP), C, VP), lambda eps, nu, b, c, bb: + te.sum(input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * + B[r_eps][eps] * B[r_nu][nu], axis=[r_eps, r_nu]), name='V') # batch gemm - c = tvm.reduce_axis((0, C), name='c') - M = tvm.compute((alpha, alpha, K, P), lambda eps, nu, k, b: - tvm.sum(U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * - V[eps][nu][idxd(b, VP)][c][idxm(b, VP)], axis=c), name='M') + c = te.reduce_axis((0, C), name='c') + M = te.compute((alpha, alpha, K, P), lambda eps, nu, k, b: + te.sum(U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * + V[eps][nu][idxd(b, VP)][c][idxm(b, VP)], axis=c), name='M') # inverse transform - r_eps = tvm.reduce_axis((0, alpha), 'r_eps') - r_nu = tvm.reduce_axis((0, alpha), 'r_nu') - Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: - tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], - axis=[r_eps, r_nu]), name='Y') + r_eps = te.reduce_axis((0, alpha), 'r_eps') + r_nu = te.reduce_axis((0, alpha), 'r_nu') + Y = te.compute((K, P, m, m), lambda k, b, vh, vw: + te.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], + axis=[r_eps, r_nu]), name='Y') # unpack output - output = tvm.compute((N, K, H, W), lambda n, k, h, w: - Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), - idxm(h, m), idxm(w, m)], - name='output', tag='winograd_conv2d_output') + output = te.compute((N, K, H, W), lambda n, k, h, w: + Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), + idxm(h, m), idxm(w, m)], + name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * K * H * W * KH * KW * C) @@ -220,7 +221,7 @@ def _schedule_winograd(cfg, s, output, last): s[d].compute_inline() # transform kernel - if isinstance(U.op, tvm.tensor.ComputeOp): + if isinstance(U.op, tvm.te.ComputeOp): kernel, G = U.op.input_tensors s[G].compute_inline() eps, nu, k, c, kk, = s[U].op.axis @@ -236,7 +237,7 @@ def _schedule_winograd(cfg, s, output, last): s[U].vectorize(kk) s[U].parallel(k) - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: 
s[kernel].compute_inline() # transform image @@ -310,7 +311,7 @@ def conv2d_nchw_winograd_nnpack(cfg, data, kernel, strides, padding, dilation, o @autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack.arm_cpu") def schedule_conv2d_nchw_winograd_nnpack(cfg, outs): """Create schedule for conv2d_nchw_winograd_nnpack""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'winograd_nnpack_conv2d_output' in op.tag: @@ -344,13 +345,13 @@ def _conv2d_arm_cpu_winograd_nnpack( cfg.define_knob('winograd_nnpack_algorithm', [convolution_algorithm]) assert N == 1 - with tvm.tag_scope("winograd_nnpack_conv2d_weight_transform"): + with tvm.te.tag_scope("winograd_nnpack_conv2d_weight_transform"): transformed_kernel = tvm.contrib.nnpack.convolution_inference_weight_transform( kernel, algorithm=cfg['winograd_nnpack_algorithm'].val) if autotvm.GLOBAL_SCOPE.in_tuning: - transformed_kernel = tvm.compute(transformed_kernel.shape, lambda *args: 0.0) + transformed_kernel = te.compute(transformed_kernel.shape, lambda *args: 0.0) - with tvm.tag_scope("winograd_nnpack_conv2d_output"): + with tvm.te.tag_scope("winograd_nnpack_conv2d_output"): output = tvm.contrib.nnpack.convolution_inference_without_weight_transform( data, transformed_kernel, bias=None, @@ -369,8 +370,8 @@ def _schedule_winograd_nnpack(cfg, s, output, last): (X, TK) = output.op.input_tensors[:2] # transform kernel - assert isinstance(TK.op, (tvm.tensor.ComputeOp, tvm.tensor.ExternOp, tvm.tensor.PlaceholderOp)) - if autotvm.GLOBAL_SCOPE.in_tuning and isinstance(TK.op, tvm.tensor.ComputeOp): + assert isinstance(TK.op, (te.tensor.ComputeOp, te.tensor.ExternOp, te.tensor.PlaceholderOp)) + if autotvm.GLOBAL_SCOPE.in_tuning and isinstance(TK.op, te.tensor.ComputeOp): # kernel transformation will be pre-computed during compilation, so we skip # this part to make tuning records correct s[TK].pragma(s[TK].op.axis[0], 'debug_skip_region') @@ -398,7 +399,7 @@ def conv2d_nchw_winograd_nnpack_without_weight_transform( W = (IW + pl + pr - 3) // WSTR + 1 assert N == 1 - with tvm.tag_scope("winograd_nnpack_conv2d_output"): + with tvm.te.tag_scope("winograd_nnpack_conv2d_output"): output = tvm.contrib.nnpack.convolution_inference_without_weight_transform( data=data, transformed_kernel=transformed_kernel, @@ -415,7 +416,7 @@ def conv2d_nchw_winograd_nnpack_without_weight_transform( @autotvm.register_topi_schedule("conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu") def schedule_conv2d_nchw_winograd_nnpack_without_weight_transform(cfg, outs): """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'winograd_nnpack_conv2d_output' in op.tag: diff --git a/topi/python/topi/arm_cpu/conv2d_alter_op.py b/topi/python/topi/arm_cpu/conv2d_alter_op.py index bfbf5d6d62b0..3a22611ed128 100644 --- a/topi/python/topi/arm_cpu/conv2d_alter_op.py +++ b/topi/python/topi/arm_cpu/conv2d_alter_op.py @@ -20,6 +20,7 @@ import logging import tvm +from tvm import te from tvm import relay from tvm import autotvm @@ -58,7 +59,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - idxd = tvm.indexdiv + idxd = tvm.tir.indexdiv if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu": assert data_layout == "NCHW" and kernel_layout == "OIHW" @@ -69,7 +70,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['kernel_layout'] = 'OIHW%do' % VC new_data = data - 
new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], "conv2d_nchw_spatial_pack.arm_cpu") @@ -86,7 +87,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['kernel_layout'] = 'OHWI%do' % VC new_data = data - new_kernel = tvm.placeholder((idxd(CO, VC), KH, KW, CI, VC), dtype=kernel.dtype) + new_kernel = te.placeholder((idxd(CO, VC), KH, KW, CI, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], "conv2d_nhwc_spatial_pack.arm_cpu") @@ -113,10 +114,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['tile_size'] = tile_size new_data = data - new_kernel = tvm.placeholder((KH + tile_size - 1, - KW + tile_size -1, - idxd(CO, VC), CI, VC), - kernel.dtype) + new_kernel = te.placeholder((KH + tile_size - 1, + KW + tile_size -1, + idxd(CO, VC), CI, VC), + kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], 'conv2d_nchw_winograd.arm_cpu') @@ -141,7 +142,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): out_dtype=weight_dtype) new_data = data - new_kernel = tvm.placeholder((CO, CI, 8, 8), "float32") + new_kernel = te.placeholder((CO, CI, 8, 8), "float32") new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, None, strides, padding, dilation, out_dtype], @@ -160,7 +161,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): # Store the same config for the altered operator (workload) new_data = data - new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], "depthwise_conv2d_nchw_spatial_pack.arm_cpu") diff --git a/topi/python/topi/arm_cpu/conv2d_int8.py b/topi/python/topi/arm_cpu/conv2d_int8.py index 5d177fe76ab6..06412b656b4b 100644 --- a/topi/python/topi/arm_cpu/conv2d_int8.py +++ b/topi/python/topi/arm_cpu/conv2d_int8.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D int8 schedule on ARM""" - -import tvm +from tvm import te from tvm import autotvm from .. import tag from ..util import get_const_tuple @@ -55,8 +54,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, # If no config was set, we can fallback to NCHW config. 
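# Sketch (illustration, not a line from the patch): the fallback below only
# needs shape/dtype stand-ins to look up a default schedule config, which is
# why bare placeholders are built on the fly; under the refactor they become
# te.placeholder, e.g.
#     dummy_data = te.placeholder((n, in_channel, ih, iw), dtype=data.dtype)
# with n, in_channel, ih and iw taken from the enclosing function.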
if cfg.is_fallback: - _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), - tvm.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype), + _get_default_config(cfg, te.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype), strides, padding, out_dtype) return nn.conv2d_NCHWc_int8_compute(data, kernel, @@ -71,7 +70,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, @autotvm.register_topi_schedule("conv2d_NCHWc_int8.arm_cpu") def schedule_conv2d_NCHWc_int8(cfg, outs): """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(op): @@ -81,7 +80,7 @@ def traverse(op): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) if 'conv2d_NCHWc_int8' in op.tag: @@ -89,9 +88,9 @@ def traverse(op): kernel_vec = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] \ - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ + if isinstance(data_vec.op, te.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ else data_vec - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] diff --git a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py index 032ac76ff6a2..3bb9dc73e2db 100644 --- a/topi/python/topi/arm_cpu/conv2d_spatial_pack.py +++ b/topi/python/topi/arm_cpu/conv2d_spatial_pack.py @@ -18,6 +18,7 @@ """Conv2D spatial pack implementation for ARM CPU""" from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from .. 
import nn from ..util import get_const_tuple @@ -98,46 +99,46 @@ def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW) - data_vec = tvm.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw: - data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h] - [(w*VW+vw)*WSTR+kw*dilation_w], - name='data_vec_undilated') + data_vec = te.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw: + data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h] + [(w*VW+vw)*WSTR+kw*dilation_w], + name='data_vec_undilated') else: dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1) - data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: - data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], - name='data_vec') + data_vec = te.compute(dvshape, lambda n, h, w, ci, vh, vw: + data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], + name='data_vec') if pre_packed: kernel_vec = kernel else: - kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc: - kernel[co*VC+vc][ci][kh][kw], - name='kernel_vec') + kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc: + kernel[co*VC+vc][ci][kh][kw], + name='kernel_vec') - ci = tvm.reduce_axis((0, CI), name='ci') - kh = tvm.reduce_axis((0, KH), name='kh') - kw = tvm.reduce_axis((0, KW), name='kw') + ci = te.reduce_axis((0, CI), name='ci') + kh = te.reduce_axis((0, KH), name='kh') + kw = te.reduce_axis((0, KW), name='kw') if dilation_h != 1 or dilation_w != 1: - conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ - tvm.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) * - kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), - axis=[ci, kh, kw]), name='conv') + conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ + te.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) * + kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), + axis=[ci, kh, kw]), name='conv') else: - conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ - tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) * - kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), - axis=[ci, kh, kw]), name='conv') - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - - output = tvm.compute(oshape, lambda n, co, h, w: - conv[n, - idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), - idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], - name='output_unpack', tag='spatial_conv2d_output') + conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ + te.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) * + kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), + axis=[ci, kh, kw]), name='conv') + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + + output = te.compute(oshape, lambda n, co, h, w: + conv[n, + idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), + idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], + name='output_unpack', tag='spatial_conv2d_output') return output def schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, @@ -216,7 +217,7 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_ dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = \ - get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) + get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 @@ -257,40 +258,41 @@ def conv2d_spatial_pack_nhwc(cfg, data, kernel, 
strides, padding, dilation, out_ if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OHO, OWO, KH, KW, IC, OHI, OWI) - data_vec = tvm.compute(dvshape, lambda n, oho, owo, kh, kw, ic, ohi, owi: - data_pad[n][(oho*OHI+ohi)*HSTR+kh*dilation_h] - [(owo*OWI+owi)*WSTR+kw*dilation_w][ic], - name='data_vec_undilated') + data_vec = te.compute(dvshape, lambda n, oho, owo, kh, kw, ic, ohi, owi: + data_pad[n][(oho*OHI+ohi)*HSTR+kh*dilation_h] + [(owo*OWI+owi)*WSTR+kw*dilation_w][ic], + name='data_vec_undilated') else: dvshape = (N, OHO, OWO, KH + (OHI-1)*HSTR, KW + (OWI-1)*WSTR, IC) - data_vec = tvm.compute(dvshape, lambda n, oho, owo, ohi, owi, ic: - data_pad[n][oho*OHI*HSTR+ohi][owo*OWI*WSTR+owi][ic], - name='data_vec') - kernel_vec = tvm.compute(kvshape, lambda oco, kh, kw, ic, oci: \ - kernel[kh][kw][ic][oco*OCI+oci], - name='kernel_vec') + data_vec = te.compute(dvshape, lambda n, oho, owo, ohi, owi, ic: + data_pad[n][oho*OHI*HSTR+ohi][owo*OWI*WSTR+owi][ic], + name='data_vec') + kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \ + kernel[kh][kw][ic][oco*OCI+oci], + name='kernel_vec') - ic = tvm.reduce_axis((0, IC), name='ic') - kh = tvm.reduce_axis((0, KH), name='kh') - kw = tvm.reduce_axis((0, KW), name='kw') + ic = te.reduce_axis((0, IC), name='ic') + kh = te.reduce_axis((0, KH), name='kh') + kw = te.reduce_axis((0, KW), name='kw') if dilation_h != 1 or dilation_w != 1: - conv = tvm.compute(ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \ - tvm.sum(data_vec[n, oho, owo, kh, kw, ohi, owi, ic].astype(out_dtype) * - kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') + conv = te.compute(ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \ + te.sum(data_vec[n, oho, owo, kh, kw, ohi, owi, ic].astype(out_dtype) * + kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype), + axis=[ic, kh, kw]), name='conv') else: - conv = tvm.compute(ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \ - tvm.sum(data_vec[n, oho, owo, ohi*HSTR+kh, owi*WSTR+kw, ic].astype(out_dtype) * - kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype), - axis=[ic, kh, kw]), name='conv') - - idiv = tvm.indexdiv - imod = tvm.indexmod - output = tvm.compute(oshape, lambda n, oho, owo, oc: - conv[n][idiv(oho, OHI)][idiv(owo, OWI)][idiv(oc, OCI)]\ - [imod(oho, OHI)][imod(owo, OWI)][imod(oc, OCI)], - name='output_unpack', tag='spatial_conv_output_NHWC') + conv = te.compute( + ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \ + te.sum(data_vec[n, oho, owo, ohi*HSTR+kh, owi*WSTR+kw, ic].astype(out_dtype) * + kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype), + axis=[ic, kh, kw]), name='conv') + + idiv = tvm.tir.indexdiv + imod = tvm.tir.indexmod + output = te.compute(oshape, lambda n, oho, owo, oc: + conv[n][idiv(oho, OHI)][idiv(owo, OWI)][idiv(oc, OCI)]\ + [imod(oho, OHI)][imod(owo, OWI)][imod(oc, OCI)], + name='output_unpack', tag='spatial_conv_output_NHWC') return output def schedule_conv2d_spatial_pack_nhwc(cfg, s, op, output): diff --git a/topi/python/topi/arm_cpu/conv2d_transpose.py b/topi/python/topi/arm_cpu/conv2d_transpose.py index 93ff02900f37..7eaa5eeb7c90 100644 --- a/topi/python/topi/arm_cpu/conv2d_transpose.py +++ b/topi/python/topi/arm_cpu/conv2d_transpose.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from ..nn import dilate, pad, get_pad_tuple @@ -31,10 +32,10 @@ def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype): Parameters ---------- - Input : tvm.Tensor + Input 
: tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints @@ -48,7 +49,7 @@ def conv2d_transpose_nchw(cfg, Input, Filter, strides, padding, out_dtype): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2) @@ -105,31 +106,31 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, CO, OH, OW) - data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: - data_pad[n][ci][h*VH + vh][w*VW + vw], - name='data_vec') + data_vec = te.compute(dvshape, lambda n, h, w, ci, vh, vw: + data_pad[n][ci][h*VH + vh][w*VW + vw], + name='data_vec') - kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc: - kernel[ci][co*VC+vc][kh][kw], - name='kernel_vec_conv2d_transpose') + kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc: + kernel[ci][co*VC+vc][kh][kw], + name='kernel_vec_conv2d_transpose') - ci = tvm.reduce_axis((0, CI), name='ci') - kh = tvm.reduce_axis((0, KH), name='kh') - kw = tvm.reduce_axis((0, KW), name='kw') + ci = te.reduce_axis((0, CI), name='ci') + kh = te.reduce_axis((0, KH), name='kh') + kw = te.reduce_axis((0, KW), name='kw') - conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ - tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) * - kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype), - axis=[ci, kh, kw]), name='conv') + conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ + te.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) * + kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype), + axis=[ci, kh, kw]), name='conv') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod - output = tvm.compute(oshape, lambda n, co, h, w: - conv[n, - idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), - idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], - name='output_unpack', tag='spatial_conv2d_transpose_output') + output = te.compute(oshape, lambda n, co, h, w: + conv[n, + idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), + idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], + name='output_unpack', tag='spatial_conv2d_transpose_output') return output @@ -137,7 +138,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n @autotvm.register_topi_schedule("conv2d_transpose_nchw.arm_cpu") def schedule_conv2d_transpose_nchw(cfg, outs): """Schedule conv2d transpose for arm cpu""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'spatial_conv2d_transpose_output' in op.tag: @@ -155,7 +156,7 @@ def _callback(op): kernel = kernel_vec.op.input_tensors[0] else: kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() schedule_conv2d_spatial_pack_nchw(cfg, s, data_vec, kernel_vec, diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py index 8d668f3e9188..5214972b255e 100644 --- a/topi/python/topi/arm_cpu/depthwise_conv2d.py +++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py @@ -18,6 +18,7 @@ """Depthwise 
convolution schedule for ARM CPU""" import tvm +from tvm import te from tvm import autotvm from .. import nn @@ -48,8 +49,8 @@ def schedule_depthwise_conv2d_nchw(cfg, outs): s: Schedule The computation schedule for depthwise_conv2d nchw. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(cfg, s, data, data_pad, kernel, output): A, B, C = data, kernel, output @@ -129,7 +130,7 @@ def _callback(op): kernel = op.input_tensors[1] data = op.input_tensors[0] data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] _schedule(cfg, s, data, data_pad, kernel, output) @@ -147,10 +148,10 @@ def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dila cfg: ConfigEntity The config for this template - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D with shape [num_filter, multiplier, filter_height, filter_width] or pre-packed 5-D with shape [num_filter_chunk, multiplier, filter_height, filter_width, num_filter_block] @@ -169,7 +170,7 @@ def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dila Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ @@ -179,8 +180,8 @@ def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dila @autotvm.register_topi_schedule("depthwise_conv2d_nchw_spatial_pack.arm_cpu") def schedule_depthwise_conv2d_nchw_spatial_pack(cfg, outs): """Create the schedule for depthwise_conv2d_nchw_spatial_pack""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'spatial_depthwise_conv2d_nchw_output': @@ -192,7 +193,7 @@ def _callback(op): kernel = kernel_vec.op.input_tensors[0] else: kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) @@ -284,50 +285,50 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW) - data_vec = tvm.compute(dvshape, lambda n, h, w, c, kh, kw, vh, vw: - data_pad[n][c][(h * VH + vh) * HSTR + kh * dilation_h] - [(w*VW+vw)*WSTR+kw*dilation_w], - name='data_vec_undilated') + data_vec = te.compute(dvshape, lambda n, h, w, c, kh, kw, vh, vw: + data_pad[n][c][(h * VH + vh) * HSTR + kh * dilation_h] + [(w*VW+vw)*WSTR+kw*dilation_w], + name='data_vec_undilated') else: dvshape = (N, OH // VH, OW // VW, C, VH*HSTR + KH-1, VW*WSTR + KW-1) - data_vec = tvm.compute(dvshape, lambda n, h, w, c, vh, vw: - data_pad[n][c][h * VH * HSTR + vh][w * VW * WSTR + vw], - name='data_vec') + data_vec = te.compute(dvshape, lambda n, h, w, c, vh, vw: + data_pad[n][c][h * VH * HSTR + vh][w * VW * WSTR + vw], + name='data_vec') if pre_packed: kernel_vec = kernel else: 
- kernel_vec = tvm.compute(kvshape, lambda co, m, kh, kw, vc: - kernel[co*VC+vc][m][kh][kw], - name='kernel_vec') + kernel_vec = te.compute(kvshape, lambda co, m, kh, kw, vc: + kernel[co*VC+vc][m][kh][kw], + name='kernel_vec') - kh = tvm.reduce_axis((0, KH), name='kh') - kw = tvm.reduce_axis((0, KW), name='kw') + kh = te.reduce_axis((0, KH), name='kh') + kw = te.reduce_axis((0, KW), name='kw') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod if dilation_h != 1 or dilation_w != 1: - conv = tvm.compute( + conv = te.compute( ovshape, lambda n, co, h, w, vh, vw, vc: \ - tvm.sum(data_vec[n, h, w, idxdiv(co * VC + vc, M), kh, kw, vh, vw] - .astype(out_dtype) * - kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype), - axis=[kh, kw]), name='depthwise_conv') + te.sum(data_vec[n, h, w, idxdiv(co * VC + vc, M), kh, kw, vh, vw] + .astype(out_dtype) * + kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype), + axis=[kh, kw]), name='depthwise_conv') else: - conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ - tvm.sum(data_vec[n, h, w, idxdiv((co * VC + vc), M), vh * HSTR + kh, - vw * WSTR + kw].astype(out_dtype) * - kernel_vec[idxdiv(co, M), - idxmod(co, M), - kh, kw, vc].astype(out_dtype), - axis=[kh, kw]), name='depthwise_conv') - - output = tvm.compute(oshape, lambda n, co, h, w: - conv[n, - idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), - idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], - name='output_unpack', tag='spatial_depthwise_conv2d_nchw_output') + conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ + te.sum(data_vec[n, h, w, idxdiv((co * VC + vc), M), vh * HSTR + kh, + vw * WSTR + kw].astype(out_dtype) * + kernel_vec[idxdiv(co, M), + idxmod(co, M), + kh, kw, vc].astype(out_dtype), + axis=[kh, kw]), name='depthwise_conv') + + output = te.compute(oshape, lambda n, co, h, w: + conv[n, + idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), + idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], + name='output_unpack', tag='spatial_depthwise_conv2d_nchw_output') return output def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, @@ -343,10 +344,10 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, data_pad = data_vec.op.input_tensors[0] if data_pad.op.name == "data_pad": - assert isinstance(data_pad.op, tvm.tensor.ComputeOp) + assert isinstance(data_pad.op, tvm.te.ComputeOp) has_padding = True else: - assert isinstance(data_pad.op, tvm.tensor.PlaceholderOp) + assert isinstance(data_pad.op, tvm.te.PlaceholderOp) has_padding = False cfg.define_knob('data_pad_inline', [0, 1, 2, 3, 4]) diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py index 644a7e3fb523..696b70895825 100644 --- a/topi/python/topi/arm_cpu/injective.py +++ b/topi/python/topi/arm_cpu/injective.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" import tvm +from tvm import te from ..util import is_empty_shape def schedule_injective_from_existing(sch, out): @@ -58,14 +59,14 @@ def schedule_injective(outs): sch: Schedule The computation schedule for the op. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) x = outs[0] if list(s[x].op.axis): # do not vectorize for broadcast (io, ii) = s[x].split(list(s[x].op.axis)[-1], 8) s[x].vectorize(ii) - tvm.schedule.AutoInlineInjective(s) + tvm.te.schedule.AutoInlineInjective(s) if not is_empty_shape(x.shape): schedule_injective_from_existing(s, x) @@ -85,10 +86,10 @@ def schedule_concatenate(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) x = outs[0] - tvm.schedule.AutoInlineInjective(s) + tvm.te.schedule.AutoInlineInjective(s) if len(s[x].op.axis) >= 4: fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2]) s[x].parallel(fused) diff --git a/topi/python/topi/arm_cpu/tensor_intrin.py b/topi/python/topi/arm_cpu/tensor_intrin.py index 2f300a18e117..135c87d59511 100644 --- a/topi/python/topi/arm_cpu/tensor_intrin.py +++ b/topi/python/topi/arm_cpu/tensor_intrin.py @@ -18,6 +18,7 @@ """Conv2D int8 schedule on ARM""" import tvm +from tvm import te def dot_int8_int8_int32(int32_lanes, dtype='uint'): """ @@ -57,27 +58,27 @@ def dot_int8_int8_int32(int32_lanes, dtype='uint'): """ num_int8_elements = 4 # 4 int8 elements in int32 - data = tvm.placeholder((num_int8_elements,), dtype='%s8' % dtype, name='data') - kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='%s8' % dtype, name='kernel') + data = te.placeholder((num_int8_elements,), dtype='%s8' % dtype, name='data') + kernel = te.placeholder((int32_lanes, num_int8_elements), dtype='%s8' % dtype, name='kernel') - k = tvm.reduce_axis((0, num_int8_elements), name='k') - C = tvm.compute((int32_lanes,), - lambda i: tvm.sum(data[k].astype('%s32' % dtype) * - kernel[i, k].astype('%s32' % dtype), - axis=k), name="C") + k = te.reduce_axis((0, num_int8_elements), name='k') + C = te.compute((int32_lanes,), + lambda i: te.sum(data[k].astype('%s32' % dtype) * + kernel[i, k].astype('%s32' % dtype), + axis=k), name="C") - a_buffer = tvm.decl_buffer(data.shape, dtype='%s8' % dtype, name="a_buffer", - offset_factor=1, - strides=[1]) - b_buffer = tvm.decl_buffer(kernel.shape, dtype='%s8' % dtype, name="b_buffer", - offset_factor=1, - strides=[tvm.var('s'), 1]) + a_buffer = tvm.tir.decl_buffer(data.shape, dtype='%s8' % dtype, name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype='%s8' % dtype, name="b_buffer", + offset_factor=1, + strides=[te.var('s'), 1]) def _intrin_func(ins, outs): def _instr(index): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, '%s32x%d' % (dtype, int32_lanes)))) + ib.emit(outs[0].vstore(0, tvm.tir.const(0, '%s32x%d' % (dtype, int32_lanes)))) return ib.get() dtype_a = '%s8x%d' % (dtype, num_int8_elements) @@ -85,26 +86,26 @@ def _instr(index): dtype_c = '%s32x%d' % (dtype, int32_lanes) a_int8 = ins[0].vload([0], dtype_a) - re_int32 = tvm.call_pure_intrin('%s32' % dtype, 'reinterpret', a_int8) + re_int32 = tvm.tir.call_pure_intrin('%s32' % dtype, 'reinterpret', a_int8) # broadcast a vec_ai32 = re_int32.astype(dtype_c) - vec_a = tvm.call_pure_intrin(dtype_b, 'reinterpret', vec_ai32) + vec_a = 
tvm.tir.call_pure_intrin(dtype_b, 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], dtype_b) vec_c = outs[0].vload([0], dtype_c) inst = 'udot' if dtype == 'uint' else 'sdot' inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % ( inst, int32_lanes, int32_lanes * num_int8_elements) - vdot = tvm.call_llvm_intrin(dtype_c, - inst, - tvm.const(2, 'uint32'), - vec_c, vec_a, vec_b) + vdot = tvm.tir.call_llvm_intrin(dtype_c, + inst, + tvm.tir.const(2, 'uint32'), + vec_c, vec_a, vec_b) ib.emit(outs[0].vstore(0, vdot)) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) + with tvm.target.build_config(offset_factor=1, partition_const_loop=True): + return te.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) diff --git a/topi/python/topi/bifrost/conv2d.py b/topi/python/topi/bifrost/conv2d.py index 816024ebdb25..92e874afa2a5 100644 --- a/topi/python/topi/bifrost/conv2d.py +++ b/topi/python/topi/bifrost/conv2d.py @@ -19,6 +19,7 @@ """conv2d schedule on ARM Mali (Bifrost) GPU""" import tvm +from tvm import te from tvm import relay from tvm import autotvm @@ -41,10 +42,10 @@ def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_ cfg: ConfigEntity The config for this template - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] or pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height, filter_width, num_filter_block] @@ -63,7 +64,7 @@ def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_ Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, @@ -87,7 +88,7 @@ def schedule_conv2d_nchw_spatial_pack(cfg, outs): s: Schedule The computation schedule for conv2d """ - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): # schedule conv2d @@ -104,7 +105,7 @@ def _callback(op): kernel = kernel_vec.op.input_tensors[0] else: kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec) @@ -125,12 +126,12 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): BW, TW, VW = cfg["tile_ow"].size # schedule padding - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data s[data_pad].compute_inline() # schedule data packing - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and data_vec.op.name == 'data_vec_undilated': + if isinstance(data_vec.op, te.tensor.ComputeOp) and data_vec.op.name == 'data_vec_undilated': _, h, w, ci, _, _, vh, vw = s[data_vec].op.axis else: _, h, w, ci, vh, vw = s[data_vec].op.axis @@ -140,7 +141,7 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): if vw.dom.extent.value < max_unroll: s[data_vec].unroll(vw) - if isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and kernel_vec.name == 'kernel_vec': + if isinstance(kernel_vec.op, 
tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec': if autotvm.GLOBAL_SCOPE.in_tuning: # kernel packing will be pre-computed during compilation, so we skip # this part to make tuning records correct @@ -151,8 +152,8 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): fused = s[kernel_vec].fuse(co, ci, kh, kw, vc) fused, vec = s[kernel_vec].split(fused, VC) bb, tt = s[kernel_vec].split(fused, max_threads) - s[kernel_vec].bind(bb, tvm.thread_axis("blockIdx.x")) - s[kernel_vec].bind(tt, tvm.thread_axis("threadIdx.x")) + s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x")) if VC in vec_size: s[kernel_vec].vectorize(vec) @@ -193,7 +194,7 @@ def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtyp @autotvm.register_topi_schedule("conv2d_nchw_winograd.bifrost") def schedule_conv2d_nchw_winograd(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'winograd_conv2d_output' in op.tag: @@ -210,7 +211,7 @@ def _decl_winograd_kernel_transform(kernel, tile_size, G): Parameters ---------- - kernel : tvm.Tensor + kernel : tvm.te.Tensor The kernel to transform tile_size : int @@ -218,7 +219,7 @@ def _decl_winograd_kernel_transform(kernel, tile_size, G): Returns ------- - U : tvm.Tensor + U : tvm.te.Tensor Transformed kernel """ @@ -238,22 +239,22 @@ def upround(x, align): # Padded Kernel [K_round, C, KH, KW] # Pad the number of kernels to multiple of ALIGN - padded_kernel = tvm.compute((K_round, C, KH, KW), - lambda k, c, h, w: - tvm.if_then_else(k < K, - kernel[k][c][h][w], - tvm.const(0, out_dtype)), - name='padded_kernel') + padded_kernel = te.compute((K_round, C, KH, KW), + lambda k, c, h, w: + tvm.tir.if_then_else(k < K, + kernel[k][c][h][w], + tvm.tir.const(0, out_dtype)), + name='padded_kernel') # U [alpha, alpha, K_round, C] # Perform the kernel transform - r_kh = tvm.reduce_axis((0, KH), 'r_kh') - r_kw = tvm.reduce_axis((0, KW), 'r_kw') - U = tvm.compute((alpha, alpha, K_round, C), - lambda eps, nu, k, c: - tvm.sum(padded_kernel[k][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], - axis=[r_kh, r_kw]), - name='U') + r_kh = te.reduce_axis((0, KH), 'r_kh') + r_kw = te.reduce_axis((0, KW), 'r_kw') + U = te.compute((alpha, alpha, K_round, C), + lambda eps, nu, k, c: + te.sum(padded_kernel[k][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), + name='U') return U @@ -307,10 +308,10 @@ def upround(x, align): cfg.define_knob("data_transform_wgy", [1, 2, 4, 8, 16, 32, 64]) # Pack input tile - input_tile = tvm.compute((N, C, H + 2, W + 2), - lambda n, c, h, w: - data_pad[n][c][h][w], - name='d') + input_tile = te.compute((N, C, H + 2, W + 2), + lambda n, c, h, w: + data_pad[n][c][h][w], + name='d') if pre_computed: U = kernel @@ -319,33 +320,33 @@ def upround(x, align): # V [alpha * alpha, C, P_round) # Perform the image transform - r_eps = tvm.reduce_axis((0, alpha), 'r_eps') - r_nu = tvm.reduce_axis((0, alpha), 'r_nu') - V = tvm.compute((alpha * alpha, C, P_round), - lambda epsnu, c, b: - tvm.sum(input_tile[b // (nH*nW)][c][b // nW % nH * m + r_eps][b % nW * m +r_nu]\ - * B[r_eps][epsnu // alpha] * B[r_nu][epsnu % alpha], - axis=[r_eps, r_nu]), - name='V') + r_eps = te.reduce_axis((0, alpha), 'r_eps') + r_nu = te.reduce_axis((0, alpha), 'r_nu') + V = te.compute((alpha * alpha, C, P_round), + lambda epsnu, c, b: + te.sum(input_tile[b // (nH*nW)][c][b // nW % nH * m + r_eps][b % nW * m +r_nu]\ + * B[r_eps][epsnu // 
alpha] * B[r_nu][epsnu % alpha], + axis=[r_eps, r_nu]), + name='V') # Winograd GEMM is a wrapper around batched GEMM to convert U to a 3D Tensor _, M = decl_winograd_gemm(cfg, U, V) # Y [K, P, m, m] # Winograd output transform - r_eps = tvm.reduce_axis((0, alpha), 'r_eps') - r_nu = tvm.reduce_axis((0, alpha), 'r_nu') - Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw: - tvm.sum(M[r_eps * alpha + r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], - axis=[r_eps, r_nu]), name='Y') + r_eps = te.reduce_axis((0, alpha), 'r_eps') + r_nu = te.reduce_axis((0, alpha), 'r_nu') + Y = te.compute((K, P, m, m), lambda k, b, vh, vw: + te.sum(M[r_eps * alpha + r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], + axis=[r_eps, r_nu]), name='Y') # Output [N, K, H, W] # Unpack back to NCHW format # The last term ensures alignment is not lost to bound inference - output = tvm.compute((N, K, H, W), lambda n, k, h, w: - Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m] - + tvm.const(0, out_dtype) * M[(alpha*alpha)-1][K_round-1][P_round-1], - name='output', tag='winograd_conv2d_output') + output = te.compute((N, K, H, W), lambda n, k, h, w: + Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m] + + tvm.tir.const(0, out_dtype) * M[(alpha*alpha)-1][K_round-1][P_round-1], + name='output', tag='winograd_conv2d_output') return output @@ -363,7 +364,7 @@ def _schedule_winograd(cfg, s, op): d, B = s[V].op.input_tensors data_pad = s[d].op.input_tensors[0] - if isinstance(U.op, tvm.tensor.ComputeOp): + if isinstance(U.op, tvm.te.ComputeOp): padded_kernel, G = s[U].op.input_tensors kernel = s[padded_kernel].op.input_tensors[0] s[G].compute_inline() @@ -390,7 +391,7 @@ def _schedule_winograd(cfg, s, op): yo, xo, yi, xi = tile_and_bind(s, U, k, c, 1, 4) # Dilation - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() # Pad data @@ -485,7 +486,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - idxd = tvm.indexdiv + idxd = tvm.tir.indexdiv if topi_tmpl == "conv2d_nchw_spatial_pack.bifrost": assert data_layout == "NCHW" and kernel_layout == "OIHW" @@ -496,7 +497,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['kernel_layout'] = 'OIHW%do' % VC new_data = data - new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], "conv2d_nchw_spatial_pack.bifrost") @@ -519,7 +520,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['tile_size'] = tile_size new_data = data - new_kernel = tvm.placeholder( + new_kernel = te.placeholder( (KH + tile_size - 1, KW + tile_size -1, CO, CI), kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], diff --git a/topi/python/topi/bifrost/dense.py b/topi/python/topi/bifrost/dense.py index 2a85db753226..710484235fbb 100644 --- a/topi/python/topi/bifrost/dense.py +++ b/topi/python/topi/bifrost/dense.py @@ -16,10 +16,7 @@ # under the License. # pylint: disable=invalid-name,unused-variable """dense schedule on ARM Mali Bifrost GPU""" - -from __future__ import absolute_import as _abs - -import tvm +from tvm import te from tvm import autotvm from ..
import nn @@ -47,8 +44,8 @@ def schedule_dense(cfg, outs): s: Schedule The computation schedule for dense. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'dense': @@ -79,10 +76,10 @@ def _callback(op): by, ty, yi = cfg['tile_y'].apply(s, output, y) bx, tx, xi = cfg['tile_x'].apply(s, output, x) - s[output].bind(by, tvm.thread_axis('blockIdx.y')) - s[output].bind(bx, tvm.thread_axis('blockIdx.x')) - s[output].bind(ty, tvm.thread_axis('threadIdx.y')) - s[output].bind(tx, tvm.thread_axis('threadIdx.x')) + s[output].bind(by, te.thread_axis('blockIdx.y')) + s[output].bind(bx, te.thread_axis('blockIdx.x')) + s[output].bind(ty, te.thread_axis('threadIdx.y')) + s[output].bind(tx, te.thread_axis('threadIdx.x')) if cfg['tile_y'].size[-1] < max_unroll: s[output].unroll(yi) @@ -108,6 +105,6 @@ def fuse_and_bind(s, tensor, axis=None, num_thread=None): axis = axis or s[tensor].op.axis fused = s[tensor].fuse(*axis) bx, tx = s[tensor].split(fused, num_thread) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(bx, te.thread_axis("blockIdx.x")) + s[tensor].bind(tx, te.thread_axis("threadIdx.x")) return bx, tx diff --git a/topi/python/topi/bifrost/depthwise_conv2d.py b/topi/python/topi/bifrost/depthwise_conv2d.py index 4f7b0db7f95f..7a96705c5a2a 100644 --- a/topi/python/topi/bifrost/depthwise_conv2d.py +++ b/topi/python/topi/bifrost/depthwise_conv2d.py @@ -20,6 +20,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import util from .. import tag @@ -38,8 +39,8 @@ def schedule_depthwise_conv2d_nchw(outs): s: Schedule The computation schedule for depthwise_conv2d nchw. 
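
Examples
--------
A minimal sketch; the shapes are illustrative, and the compute is
assumed to come from ``topi.nn.depthwise_conv2d_nchw`` so that the op
carries the 'depthwise_conv2d_nchw' tag this schedule traverses for:

>>> from tvm import te
>>> import topi
>>> data = te.placeholder((1, 32, 56, 56), name="data")
>>> kernel = te.placeholder((32, 1, 3, 3), name="kernel")
>>> conv = topi.nn.depthwise_conv2d_nchw(data, kernel, 1, 1, 1)
>>> s = topi.bifrost.schedule_depthwise_conv2d_nchw(conv)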
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(pad_data, kernel, conv): raw_data = s[pad_data].op.input_tensors[0] @@ -55,12 +56,12 @@ def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): zo, zi = s[tensor].split(z, z_factor) yo, yi = s[tensor].split(y, y_factor) xo, xi = s[tensor].split(x, x_factor) - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) - s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(zo, te.thread_axis("blockIdx.z")) + s[tensor].bind(zi, te.thread_axis("threadIdx.z")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) + s[tensor].bind(yi, te.thread_axis("threadIdx.y")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) + s[tensor].bind(xi, te.thread_axis("threadIdx.x")) return zo, zi, yo, yi, xo, xi # set tunable parameters @@ -115,7 +116,7 @@ def traverse(op): if op.tag == 'depthwise_conv2d_nchw': pad_data = op.input_tensors[0] kernel = op.input_tensors[1] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() conv = op.output(0) _schedule(pad_data, kernel, conv) diff --git a/topi/python/topi/bifrost/gemm.py b/topi/python/topi/bifrost/gemm.py index cc6cf09de4ce..3dc010882094 100644 --- a/topi/python/topi/bifrost/gemm.py +++ b/topi/python/topi/bifrost/gemm.py @@ -16,9 +16,6 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument """GEMM schedules for Mali Bifrost""" - -import tvm - from .transforms import tile_and_bind, tile_and_bind3d, interleave_transpose, \ transpose_interleave from .. 
import util @@ -31,15 +28,15 @@ def decl_gemm(cfg, A, B): cfg : Config Schedule configuration - A : tvm.Tensor + A : tvm.te.Tensor 2D Tensor, shape [n, k] - B : tvm.Tensor + B : tvm.te.Tensor 2D Tensor, shape [k, m] Returns ------- - C : tvm.Tensor + C : tvm.te.Tensor 2D Tensor, shape [n, m] """ @@ -60,35 +57,35 @@ def decl_gemm(cfg, A, B): if unroll_gemm == 1: # The no-unrolling case must use the same set of tensors to keep scheduling consistent # Create identity tensors to take the place of A_unrolled, B_unrolled and R - A_unrolled = tvm.compute((n, k_size), lambda i, j: A[i, j], name="A_unrolled") - B_unrolled = tvm.compute((k_size, m), lambda i, j: B[i, j], name="B_unrolled") + A_unrolled = te.compute((n, k_size), lambda i, j: A[i, j], name="A_unrolled") + B_unrolled = te.compute((k_size, m), lambda i, j: B[i, j], name="B_unrolled") # Declare standard GEMM - k = tvm.reduce_axis((0, A.shape[1]), name='k') - C = tvm.compute((n, m), lambda i, j: - tvm.sum(A_unrolled[i, k] * B_unrolled[k, j], axis=k), name='C') + k = te.reduce_axis((0, A.shape[1]), name='k') + C = te.compute((n, m), lambda i, j: + te.sum(A_unrolled[i, k] * B_unrolled[k, j], axis=k), name='C') - R = tvm.compute((n, m), lambda i, j: C[i, j], name="R") + R = te.compute((n, m), lambda i, j: C[i, j], name="R") else: unrolled_k_size = k_size // unroll_gemm # Unroll the two input matrices along the shared k axis - A_unrolled = tvm.compute((unroll_gemm, n, unrolled_k_size), lambda b, i, j: - A[i][unrolled_k_size * b + j], name='A_unrolled') + A_unrolled = te.compute((unroll_gemm, n, unrolled_k_size), lambda b, i, j: + A[i][unrolled_k_size * b + j], name='A_unrolled') - B_unrolled = tvm.compute((unroll_gemm, unrolled_k_size, m), lambda b, i, j: - B[unrolled_k_size * b + i][j], name='B_unrolled') + B_unrolled = te.compute((unroll_gemm, unrolled_k_size, m), lambda b, i, j: + B[unrolled_k_size * b + i][j], name='B_unrolled') # Declare a batched GEMM - k = tvm.reduce_axis((0, unrolled_k_size), name='k') - C = tvm.compute((unroll_gemm, n, m), lambda b, i, j: - tvm.sum(A_unrolled[b][i][k] * B_unrolled[b][k][j], axis=k), name='C') + k = te.reduce_axis((0, unrolled_k_size), name='k') + C = te.compute((unroll_gemm, n, m), lambda b, i, j: + te.sum(A_unrolled[b][i][k] * B_unrolled[b][k][j], axis=k), name='C') # Then declare a reduction to reduce the sub matrices - k = tvm.reduce_axis((0, unroll_gemm), name='k') - R = tvm.compute((n, m), lambda i, j: - tvm.sum(C[k][i][j], axis=k), name='R') + k = te.reduce_axis((0, unroll_gemm), name='k') + R = te.compute((n, m), lambda i, j: + te.sum(C[k][i][j], axis=k), name='R') return R @@ -99,15 +96,15 @@ def decl_batched_gemm(cfg, A, B): cfg : Config Schedule configuration - A : tvm.Tensor + A : tvm.te.Tensor 3D Tensor, shape [b, n, k] - B : tvm.Tensor + B : tvm.te.Tensor 3D Tensor, shape [b, k, m] Returns ------- - C : tvm.Tensor + C : tvm.te.Tensor 3D Tensor, shape [b, n, m] """ @@ -127,9 +124,9 @@ def decl_batched_gemm(cfg, A, B): b_size = util.get_const_int(A.shape[0]) # Declare a batched GEMM - k = tvm.reduce_axis((0, k_size), name='k') - C = tvm.compute((b_size, n, m), lambda b, i, j: - tvm.sum(A[b][i][k] * B[b][k][j], axis=k), name='C') + k = te.reduce_axis((0, k_size), name='k') + C = te.compute((b_size, n, m), lambda b, i, j: + te.sum(A[b][i][k] * B[b][k][j], axis=k), name='C') return C @@ -143,10 +140,10 @@ def decl_winograd_gemm(cfg, A, B): cfg : Config Schedule configuration - A : tvm.Tensor + A : tvm.te.Tensor 4D Tensor, shape [a, a, n, k] - B : tvm.Tensor + B : tvm.te.Tensor 3D Tensor, shape
[a * a, k, m] Returns @@ -157,8 +154,8 @@ def decl_winograd_gemm(cfg, A, B): n = util.get_const_int(A.shape[2]) k = util.get_const_int(A.shape[3]) - A_3D = tvm.compute((alpha * alpha, n, k), lambda b, i, j: - A[b // alpha][b % alpha][i][j], name='A_3D') + A_3D = te.compute((alpha * alpha, n, k), lambda b, i, j: + A[b // alpha][b % alpha][i][j], name='A_3D') C = decl_batched_gemm(cfg, A_3D, B) return A_3D, C @@ -171,16 +168,16 @@ def schedule_gemm(cfg, s, A, B, C, batched=False, schedule_transforms=True): cfg : Config Schedule configuration - s : tvm.schedule.Schedule + s : tvm.te.schedule.Schedule Operator schedule - A : tvm.Tensor + A : tvm.te.Tensor 2D/3D Tensor, shape [n, k]/[b, n, k] - B : tvm.Tensor + B : tvm.te.Tensor 2D/3D Tensor, shape [k, m]/[b, k, m] - C : tvm.Tensor + C : tvm.te.Tensor 2D/3D Tensor, shape [n, m]/[b, n, m] batched : bool @@ -287,19 +284,19 @@ def schedule_unrollable_gemm(cfg, s, A, B, C, R): cfg : Config Schedule configuration - s : tvm.schedule.Schedule + s : tvm.te.schedule.Schedule Operator schedule - A : tvm.Tensor + A : tvm.te.Tensor 2D/3D Tensor, shape [n, k]/[b, n, k] - B : tvm.Tensor + B : tvm.te.Tensor 2D/3D Tensor, shape [k, m]/[b, k, m] - C : tvm.Tensor + C : tvm.te.Tensor 2D/3D Tensor, shape [n, m]/[b, n, m] - R : tvm.Tensor + R : tvm.te.Tensor 2D Tensor, shape [n, m] """ @@ -340,21 +337,21 @@ def get_unrollable_gemm_ops(R): Parameters ---------- - R : tvm.Tensor + R : tvm.te.Tensor Reduced tensor, final stage of GEMM Returns ------- - A_unrolled : tvm.Tensor + A_unrolled : tvm.te.Tensor Matrix A unrolled along k - B_unrolled: tvm.Tensor + B_unrolled: tvm.te.Tensor Matrix B unrolled along k - C : tvm.Tensor + C : tvm.te.Tensor Result of batched GEMM - R : tvm.Tensor + R : tvm.te.Tensor Reduction of C, result of unrollable GEMM """ diff --git a/topi/python/topi/bifrost/transforms.py b/topi/python/topi/bifrost/transforms.py index d7fc292f0ade..3feb4e6c8759 100644 --- a/topi/python/topi/bifrost/transforms.py +++ b/topi/python/topi/bifrost/transforms.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te def fuse_and_bind(s, tensor, axis=None, num_thread=None): """Fuse all the axis and bind to GPU threads""" @@ -26,18 +27,18 @@ def fuse_and_bind(s, tensor, axis=None, num_thread=None): fused = s[tensor].fuse(*axis) max_threads = tvm.target.Target.current(allow_none=False).max_num_threads bx, tx = s[tensor].split(fused, num_thread or max_threads) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(bx, te.thread_axis("blockIdx.x")) + s[tensor].bind(tx, te.thread_axis("threadIdx.x")) return bx, tx def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): """Tile and bind to GPU threads""" x_factor = x_factor or y_factor yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) + s[tensor].bind(xi, te.thread_axis("threadIdx.x")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) + s[tensor].bind(yi, te.thread_axis("threadIdx.y")) return yo, xo, yi, xi def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None): @@ -47,12 +48,12 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None zo, zi = s[tensor].split(z, z_factor) yo, yi = 
s[tensor].split(y, y_factor) xo, xi = s[tensor].split(x, x_factor) - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) - s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(zo, te.thread_axis("blockIdx.z")) + s[tensor].bind(zi, te.thread_axis("threadIdx.z")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) + s[tensor].bind(yi, te.thread_axis("threadIdx.y")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) + s[tensor].bind(xi, te.thread_axis("threadIdx.x")) return zo, yo, xo, zi, yi, xi def pack_tensor(s, tensor, factor, readers): diff --git a/topi/python/topi/broadcast.py b/topi/python/topi/broadcast.py index ba39c9aed35b..39b2841da854 100644 --- a/topi/python/topi/broadcast.py +++ b/topi/python/topi/broadcast.py @@ -27,7 +27,7 @@ def broadcast_to(data, shape): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input data shape : list or tuple @@ -35,7 +35,7 @@ def broadcast_to(data, shape): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return _cpp.broadcast_to(data, shape) @@ -45,14 +45,14 @@ def add(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -64,14 +64,14 @@ def subtract(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -83,14 +83,14 @@ def multiply(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -102,14 +102,14 @@ def divide(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -121,14 +121,14 @@ def floor_divide(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -140,14 +140,14 @@ def mod(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. 
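
Examples
--------
A minimal broadcast sketch; the shapes are illustrative:

>>> from tvm import te
>>> import topi
>>> x = te.placeholder((2, 3), name="x")
>>> y = te.placeholder((2, 1), name="y")  # broadcast along the last axis
>>> z = topi.mod(x, y)                    # z has shape (2, 3)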
""" @@ -159,14 +159,14 @@ def floor_mod(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -178,14 +178,14 @@ def maximum(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -197,14 +197,14 @@ def minimum(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -216,14 +216,14 @@ def power(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -235,14 +235,14 @@ def left_shift(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -254,14 +254,14 @@ def right_shift(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -273,14 +273,14 @@ def greater(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -292,14 +292,14 @@ def less(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -311,14 +311,14 @@ def equal(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. 
""" @@ -330,14 +330,14 @@ def not_equal(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -349,14 +349,14 @@ def greater_equal(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -368,14 +368,14 @@ def less_equal(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -387,14 +387,14 @@ def logical_and(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -406,14 +406,14 @@ def logical_or(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -425,14 +425,14 @@ def bitwise_and(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -444,14 +444,14 @@ def bitwise_or(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -463,14 +463,14 @@ def bitwise_xor(lhs, rhs): Parameters ---------- - lhs : tvm.Tensor or Expr + lhs : tvm.te.Tensor or Expr The left operand - rhs : tvm.Tensor or Expr + rhs : tvm.te.Tensor or Expr The right operand Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if both operands are Expr. Otherwise returns Tensor. """ @@ -482,11 +482,11 @@ def logical_not(data): Parameters ---------- - data : tvm.Tensor or Expr + data : tvm.te.Tensor or Expr Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if the operand are Expr. Otherwise returns Tensor. """ @@ -498,11 +498,11 @@ def bitwise_not(data): Parameters ---------- - data : tvm.Tensor or Expr + data : tvm.te.Tensor or Expr Returns ------- - ret : tvm.Tensor or Expr + ret : tvm.te.Tensor or Expr Returns Expr if the operand are Expr. Otherwise returns Tensor. 
""" diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py index e293c7ad41e8..bf801820d25a 100644 --- a/topi/python/topi/cuda/batch_matmul.py +++ b/topi/python/topi/cuda/batch_matmul.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name,too-many-locals,unused-variable """cuda batch_matmul operators""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from tvm.contrib import cublas from ..util import traverse_inline, get_const_tuple, get_max_power2_factor @@ -35,8 +34,8 @@ def schedule_batch_matmul(outs): s: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(op): C = op.output(0) @@ -60,13 +59,13 @@ def _schedule(op): x_nthreads = min(x_bn, 8) ty, yi = s[C].split(y, nparts=y_nthreads) tx, xi = s[C].split(x, nparts=x_nthreads) - thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x") - thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y") + thread_x = te.thread_axis((0, x_nthreads), "threadIdx.x") + thread_y = te.thread_axis((0, y_nthreads), "threadIdx.y") s[C].reorder(b, by, bx, ty, tx, yi, xi) - s[C].bind(b, tvm.thread_axis("blockIdx.z")) - s[C].bind(by, tvm.thread_axis("blockIdx.y")) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) + s[C].bind(b, te.thread_axis("blockIdx.z")) + s[C].bind(by, te.thread_axis("blockIdx.y")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].pragma(yi, "auto_unroll_max_step", 16) @@ -111,15 +110,15 @@ def batch_matmul_cublas(x, y): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor 3-D with shape [batch, M, K] - y : tvm.Tensor + y : tvm.te.Tensor 3-D with shape [batch, N, K] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 3-D with shape [batch, M, N] """ return cublas.batch_matmul(x, y, False, True) diff --git a/topi/python/topi/cuda/conv1d.py b/topi/python/topi/cuda/conv1d.py index 56918e2bbba2..3ddecbe646d7 100644 --- a/topi/python/topi/cuda/conv1d.py +++ b/topi/python/topi/cuda/conv1d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument """Compute definition for conv1d with cuda backend""" import tvm +from tvm import te from tvm import autotvm from .. import nn @@ -52,8 +53,8 @@ def schedule_conv1d_ncw(cfg, outs): s : Schedule The computation schedule for conv1d. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv1d_ncw': @@ -79,7 +80,7 @@ def _callback(op): ##### space definition end ##### if isinstance(kernel.op, - tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -103,14 +104,14 @@ def _callback(op): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) s[output].reorder(bn, bf, bx, vn, vf, vx, tn, tf, tx, ni, fi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(bf, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(bf, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) # number of threads n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] @@ -131,8 +132,8 @@ def _callback(op): fused = s[load].fuse(f, x) tz, fused = s[load].split(fused, nparts=n_tz) tx, fused = s[load].split(fused, nparts=n_tx) - s[load].bind(tz, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) @@ -177,8 +178,8 @@ def schedule_conv1d_nwc(cfg, outs): s : Schedule The computation schedule for conv1d. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv1d_nwc': @@ -204,7 +205,7 @@ def _callback(op): ##### space definition end ##### if isinstance(kernel.op, - tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -228,14 +229,14 @@ def _callback(op): bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) s[output].reorder(bn, bx, bf, vn, vx, vf, tn, tx, tf, ni, xi, fi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(bx, tvm.thread_axis("blockIdx.y")) - s[output].bind(bf, tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - - s[output].bind(tf, tvm.thread_axis("threadIdx.x")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(bx, te.thread_axis("blockIdx.y")) + s[output].bind(bf, te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + + s[output].bind(tf, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tf) # number of threads n_tz = cfg["tile_n"].size[2] * cfg["tile_x"].size[2] @@ -256,8 +257,8 @@ def _callback(op): fused = s[load].fuse(x, f) tz, fused = s[load].split(fused, nparts=n_tz) tx, fused = s[load].split(fused, nparts=n_tx) - s[load].bind(tz, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) diff --git a/topi/python/topi/cuda/conv1d_transpose_ncw.py b/topi/python/topi/cuda/conv1d_transpose_ncw.py index 4802a0d144a3..cf1b66cc5202 100644 --- a/topi/python/topi/cuda/conv1d_transpose_ncw.py +++ b/topi/python/topi/cuda/conv1d_transpose_ncw.py @@ -18,6 +18,7 @@ """Conv1d transpose template for cuda backend""" import tvm +from tvm import te from tvm import autotvm from .. 
import nn from ..util import get_const_tuple, traverse_inline @@ -30,9 +31,9 @@ def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype): ---------- cfg: ConfigEntity The config for this template - Input : tvm.Tensor + Input : tvm.te.Tensor 3-D with shape [batch, in_channel, inp_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 3-D with shape [in_channel, num_filter, kernel_size] stride : tuple of one int The spatial stride along width @@ -45,7 +46,7 @@ Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 3-D with shape [batch, out_channel, out_width] """ if isinstance(stride, (tuple, list)): @@ -58,21 +59,21 @@ def conv1d_transpose_ncw(cfg, data, kernel, stride, padding, out_dtype): pad_left = kernel_size - 1 - pad_left pad_right = kernel_size - 1 - pad_right dilated_width = stride * (inp_width - 1) + 1 - data = tvm.compute( + data = te.compute( (batch, inp_channels, pad_left + dilated_width + pad_right), - lambda n, c, x: tvm.if_then_else( - tvm.all(x >= pad_left, - x < pad_left + dilated_width, - tvm.indexmod(x - pad_left, stride).equal(0)), - data[n, c, tvm.indexdiv(x - pad_left, stride)], - tvm.const(0., "float32")), + lambda n, c, x: tvm.tir.if_then_else( + tvm.tir.all(x >= pad_left, + x < pad_left + dilated_width, + tvm.tir.indexmod(x - pad_left, stride).equal(0)), + data[n, c, tvm.tir.indexdiv(x - pad_left, stride)], + tvm.tir.const(0., "float32")), name='data_pad') - dc = tvm.reduce_axis((0, inp_channels), name='dc') - dw = tvm.reduce_axis((0, kernel_size), name='dw') - data_out = tvm.compute( + dc = te.reduce_axis((0, inp_channels), name='dc') + dw = te.reduce_axis((0, kernel_size), name='dw') + data_out = te.compute( (batch, out_channels, out_width), - lambda b, c, w: tvm.sum( + lambda b, c, w: te.sum( data[b, dc, w + dw].astype(out_dtype) * kernel[dc, c, kernel_size - 1 - dw].astype(out_dtype), axis=[dc, dw]), tag="conv1d_transpose_ncw") @@ -97,8 +98,8 @@ def schedule_conv1d_transpose_ncw(cfg, outs): s: Schedule The computation schedule for conv1d transpose.
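
Examples
--------
A minimal sketch; the shapes are illustrative. With stride 2, zero
padding and kernel size 3, an input of width 16 yields
out_width = (16 - 1) * 2 + 3 = 33:

>>> import tvm
>>> from tvm import te
>>> import topi
>>> data = te.placeholder((1, 8, 16), name="data")
>>> kernel = te.placeholder((8, 4, 3), name="kernel")
>>> with tvm.target.create("cuda"):
...     out = topi.cuda.conv1d_transpose_ncw(data, kernel, 2, 0, "float32")
...     s = topi.cuda.schedule_conv1d_transpose_ncw(out)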
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv1d_transpose_ncw': @@ -123,7 +124,7 @@ def _callback(op): ##### space definition end ##### - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -147,14 +148,14 @@ def _callback(op): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) s[output].reorder(bn, bf, bx, vn, vf, vx, tn, tf, tx, ni, fi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(bf, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(bf, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) # number of threads n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2] @@ -175,8 +176,8 @@ def _callback(op): fused = s[load].fuse(f, x) tz, fused = s[load].split(fused, nparts=n_tz) tx, fused = s[load].split(fused, nparts=n_tx) - s[load].bind(tz, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index e1ada325ea63..c24789307340 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -16,7 +16,7 @@ # under the License. 
# pylint: disable=invalid-name, unused-argument """Compute definition for conv2d with cuda backend""" -import tvm +from tvm import te from tvm import autotvm from tvm.contrib import cudnn @@ -35,8 +35,8 @@ def conv2d_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype='float3 @autotvm.register_topi_schedule("conv2d_nchw.cuda") def schedule_conv2d_nchw(cfg, outs): """Create the schedule for conv2d_nchw""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv2d_nchw': @@ -55,8 +55,8 @@ def _callback(op): # # @autotvm.register_topi_schedule("conv2d_nhwc.cuda") # def schedule_conv2d_nhwc(cfg, outs): -# outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs -# s = tvm.create_schedule([x.op for x in outs]) +# outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs +# s = te.create_schedule([x.op for x in outs]) # # def _callback(op): # if op.tag == 'conv2d_nhwc': diff --git a/topi/python/topi/cuda/conv2d_alter_op.py b/topi/python/topi/cuda/conv2d_alter_op.py index f3e4f4c3b3c9..b59827136c70 100644 --- a/topi/python/topi/cuda/conv2d_alter_op.py +++ b/topi/python/topi/cuda/conv2d_alter_op.py @@ -19,6 +19,7 @@ import logging import tvm +from tvm import te from tvm import relay from tvm import autotvm @@ -70,10 +71,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ic_block_factor = oc_block_factor = 4 # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), - dtype=data.dtype) - new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW, \ - oc_block_factor, ic_block_factor), dtype=kernel.dtype) + new_data = te.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = te.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW, \ + oc_block_factor, ic_block_factor), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype], "conv2d_NCHWc_int8.cuda") @@ -100,8 +101,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): # Store the same config for the altered operator (workload) new_data = data - new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), - dtype=kernel.dtype) + new_weight = te.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO), + dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_weight, strides, padding, dilation, out_dtype], "conv2d_nchw_winograd_without_weight_transform.cuda") @@ -122,11 +123,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ic_block_factor = oc_block_factor = 4 # Store the same config for the altered operator (workload) - new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), - dtype=data.dtype) - new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups, - KH, KW, oc_block_factor, ic_block_factor), - dtype=kernel.dtype) + new_data = te.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor), + dtype=data.dtype) + new_kernel = te.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups, + KH, KW, oc_block_factor, ic_block_factor), + dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, 
dilation, groups, out_dtype], "group_conv2d_NCHWc_int8.cuda") diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py index 2fab8cf12253..db6bff2e9289 100644 --- a/topi/python/topi/cuda/conv2d_direct.py +++ b/topi/python/topi/cuda/conv2d_direct.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """The templates for cuda conv2d operators""" import tvm +from tvm import te from tvm import autotvm from ..util import get_const_tuple @@ -50,7 +51,7 @@ def schedule_direct_cuda(cfg, s, conv): pad_data, kernel = s[conv].op.input_tensors s[pad_data].compute_inline() - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -74,15 +75,15 @@ def schedule_direct_cuda(cfg, s, conv): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) bf = s[output].fuse(n, bf) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(tf, tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) @@ -104,9 +105,9 @@ def schedule_direct_cuda(cfg, s, conv): tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) diff --git a/topi/python/topi/cuda/conv2d_hwcn.py b/topi/python/topi/cuda/conv2d_hwcn.py index b0925ae93a16..e45083f53c5e 100644 --- a/topi/python/topi/cuda/conv2d_hwcn.py +++ b/topi/python/topi/cuda/conv2d_hwcn.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument """Schedule for conv2d_hwcn with auto fusion""" import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity @@ -44,8 +45,8 @@ def schedule_conv2d_hwcn(cfg, outs): s: Schedule The computation schedule for conv2d_hwcn. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - sch = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) def schedule(Apad, W, B): """Schedule conv2d_hwcn""" sch[Apad].compute_inline() @@ -93,13 +94,13 @@ def schedule(Apad, W, B): bx, txz, tx, ni = cfg['tile_ni'].apply(sch, Out, ni) sch[Out].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni) - sch[Out].bind(bz, tvm.thread_axis('blockIdx.z')) - sch[Out].bind(by, tvm.thread_axis('blockIdx.y')) - sch[Out].bind(bx, tvm.thread_axis('blockIdx.x')) - sch[Out].bind(tyz, tvm.thread_axis('vthread')) - sch[Out].bind(txz, tvm.thread_axis('vthread')) - sch[Out].bind(ty, tvm.thread_axis('threadIdx.y')) - sch[Out].bind(tx, tvm.thread_axis('threadIdx.x')) + sch[Out].bind(bz, te.thread_axis('blockIdx.z')) + sch[Out].bind(by, te.thread_axis('blockIdx.y')) + sch[Out].bind(bx, te.thread_axis('blockIdx.x')) + sch[Out].bind(tyz, te.thread_axis('vthread')) + sch[Out].bind(txz, te.thread_axis('vthread')) + sch[Out].bind(ty, te.thread_axis('threadIdx.y')) + sch[Out].bind(tx, te.thread_axis('threadIdx.x')) # Schedule BL local write sch[BL].compute_at(sch[Out], tx) @@ -121,8 +122,8 @@ def schedule(Apad, W, B): tx, ni = sch[AA].split(ni, nparts=cfg['tile_ni'].size[2]) _, ni = sch[AA].split(ni, factor=4) sch[AA].reorder(ty, tx, yi, xi, ci, ni) - sch[AA].bind(ty, tvm.thread_axis('threadIdx.y')) - sch[AA].bind(tx, tvm.thread_axis('threadIdx.x')) + sch[AA].bind(ty, te.thread_axis('threadIdx.y')) + sch[AA].bind(tx, te.thread_axis('threadIdx.x')) sch[AA].vectorize(ni) # Schedule for W's shared memory load yi, xi, ci, fi = sch[WW].op.axis @@ -130,8 +131,8 @@ def schedule(Apad, W, B): tx, fi = sch[WW].split(fi, nparts=cfg['tile_ni'].size[2]) _, fi = sch[WW].split(fi, factor=4) sch[WW].reorder(ty, tx, yi, xi, ci, fi) - sch[WW].bind(ty, tvm.thread_axis('threadIdx.y')) - sch[WW].bind(tx, tvm.thread_axis('threadIdx.x')) + sch[WW].bind(ty, te.thread_axis('threadIdx.y')) + sch[WW].bind(tx, te.thread_axis('threadIdx.x')) sch[WW].vectorize(fi) scheduled_ops = [] @@ -142,12 +143,12 @@ def traverse(operator): if operator not in sch.outputs: sch[operator].compute_inline() for tensor in operator.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) elif operator.tag == 'conv2d_hwcn': Apad = operator.input_tensors[0] W = operator.input_tensors[1] - if isinstance(W.op, tvm.tensor.ComputeOp) and 'dilate' in W.op.tag: + if isinstance(W.op, tvm.te.ComputeOp) and 'dilate' in W.op.tag: sch[W].compute_inline() B = operator.output(0) schedule(Apad, W, B) diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py index 53a7bd9fa849..ad97fa68d6aa 100644 --- a/topi/python/topi/cuda/conv2d_int8.py +++ b/topi/python/topi/cuda/conv2d_int8.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """Int8 conv2d in NCHWc layout""" import tvm +from tvm import te from tvm import autotvm from .injective import schedule_injective_from_existing @@ -35,11 +36,11 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ cfg: ConfigEntity The config for this template - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] or 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D 
with shape [num_filter, in_channel, filter_height, filter_width] or 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, num_filter_block, in_channel_block] @@ -61,7 +62,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ assert layout in ["NCHW", "NCHW4c"] @@ -74,17 +75,17 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ assert channels % ic_block_factor == 0, \ "Number of input channels should be multiple of {}".format( ic_block_factor) - packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, - ic_block_factor), - lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w], - name="packed_data") + packed_data = te.compute((batch, channels // ic_block_factor, height, width, + ic_block_factor), + lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w], + name="packed_data") out_channels, in_channels, kernel_h, kernel_w = get_const_tuple( kernel.shape) assert out_channels % 4 == 0, \ "Number of output channels should be multiple of {}".format( oc_block_factor) - packed_kernel = tvm.compute( + packed_kernel = te.compute( (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w, oc_block_factor, ic_block_factor), lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: @@ -124,23 +125,23 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ oshape = (batch, oc_chunk, out_height, out_width, oc_block) - icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk') - icb = tvm.reduce_axis((0, ic_block), name='ic_block') - kh = tvm.reduce_axis((0, kernel_h), name='kh') - kw = tvm.reduce_axis((0, kernel_w), name='kw') + icc = te.reduce_axis((0, ic_chunk), name='ic_chunk') + icb = te.reduce_axis((0, ic_block), name='ic_block') + kh = te.reduce_axis((0, kernel_h), name='kh') + kw = te.reduce_axis((0, kernel_w), name='kw') - conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(pad_data[n, icc, oh*stride_h+kh*dilation_h, \ - ow*stride_w+kw*dilation_w, icb] - .astype('int32') * - packed_kernel[oc_chunk, icc, - kh, kw, oc_block, icb] - .astype('int32'), - axis=[icc, kh, kw, icb])) + conv = te.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + te.sum(pad_data[n, icc, oh*stride_h+kh*dilation_h, \ + ow*stride_w+kw*dilation_w, icb] + .astype('int32') * + packed_kernel[oc_chunk, icc, + kh, kw, oc_block, icb] + .astype('int32'), + axis=[icc, kh, kw, icb])) - output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype), - tag="conv2d_NCHWc_int8") + output = te.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype), + tag="conv2d_NCHWc_int8") # num flop num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ @@ -156,8 +157,8 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ @autotvm.register_topi_schedule("conv2d_NCHWc_int8.cuda") def schedule_conv2d_NCHWc_int8(cfg, outs): """Schedule conv2d int8 NCHWc template""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv2d_NCHWc_int8': @@ -171,7 +172,7 @@ 
def _schedule_conv2d_NCHWc_int8(cfg, s, output): conv = output.op.input_tensors[0] packed_data, packed_kernel = conv.op.input_tensors - if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag: + if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag: pad_data = packed_data packed_data = pad_data.op.input_tensors[0] else: @@ -183,8 +184,8 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region") s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region") else: - if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\ - packed_kernel.name == 'packed_kernel': + if isinstance(packed_kernel.op, tvm.te.ComputeOp) and\ + packed_kernel.name == 'packed_kernel': # data and kernel are not pre-computed, schedule layout transform here schedule_injective_from_existing(s, packed_data) schedule_injective_from_existing(s, packed_kernel) @@ -219,20 +220,20 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(bf, tvm.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(bf, te.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf if cfg["fuse_yx"].val: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(tf, tvm.thread_axis("threadIdx.y")) + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(tf, te.thread_axis("threadIdx.y")) tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) + s[output].bind(tyx, te.thread_axis("threadIdx.x")) s[conv].compute_at(s[output], tyx) # number of threads @@ -240,9 +241,9 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): n_ty = cfg["tile_f"].size[2] n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] else: - s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[conv].compute_at(s[output], tx) # number of threads @@ -285,9 +286,9 @@ def _schedule_conv2d_NCHWc_int8(cfg, s, output): fused, tx = s[load].split(fused, factor=n_tx) fused, ty = s[load].split(fused, factor=n_ty) fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # double buffer cfg.define_knob('AA_double_buffer', [0, 1]) diff --git 
a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py index 8751800c4517..17bd37d1fc5c 100644 --- a/topi/python/topi/cuda/conv2d_transpose_nchw.py +++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py @@ -18,6 +18,7 @@ """Conv2d transpose template for cuda backend""" import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from .. import nn @@ -32,9 +33,9 @@ def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): ---------- cfg: ConfigEntity The config for this template - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints The spatial stride along height and width @@ -45,7 +46,7 @@ def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape) @@ -56,42 +57,42 @@ def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype): padding, (kernel_height, kernel_width)) out_width = (inp_width - 1) * stride_width + \ - kernel_width - pad_left - pad_right + kernel_width - pad_left - pad_right pad_left = kernel_width - 1 - pad_left pad_right = kernel_width - 1 - pad_right dilated_width = stride_width * (inp_width - 1) + 1 out_height = (inp_height - 1) * stride_height + \ - kernel_height - pad_top - pad_bottom + kernel_height - pad_top - pad_bottom pad_top = kernel_height - 1 - pad_top pad_bottom = kernel_height - 1 - pad_bottom dilated_height = stride_height * (inp_height - 1) + 1 # compute pad - data = tvm.compute( + data = te.compute( (batch, inp_channels, pad_top + dilated_height + pad_bottom, pad_left + dilated_width + pad_right), - lambda n, c, y, x: tvm.if_then_else( - tvm.all(x >= pad_left, - x < pad_left + dilated_width, - tvm.indexmod(x - pad_left, stride_width).equal(0), - y >= pad_top, - y < pad_top + dilated_height, - tvm.indexmod(y - pad_top, stride_height).equal(0)), + lambda n, c, y, x: tvm.tir.if_then_else( + tvm.tir.all(x >= pad_left, + x < pad_left + dilated_width, + tvm.tir.indexmod(x - pad_left, stride_width).equal(0), + y >= pad_top, + y < pad_top + dilated_height, + tvm.tir.indexmod(y - pad_top, stride_height).equal(0)), data[n, c, - tvm.indexdiv(y - pad_top, stride_height), - tvm.indexdiv(x - pad_left, stride_width)], - tvm.const(0., "float32")), + tvm.tir.indexdiv(y - pad_top, stride_height), + tvm.tir.indexdiv(x - pad_left, stride_width)], + tvm.tir.const(0., "float32")), name='data_pad') # compute transposed conv - dc = tvm.reduce_axis((0, inp_channels), name='dc') - dh = tvm.reduce_axis((0, kernel_height), name='dh') - dw = tvm.reduce_axis((0, kernel_width), name='dw') - data_out = tvm.compute( + dc = te.reduce_axis((0, inp_channels), name='dc') + dh = te.reduce_axis((0, kernel_height), name='dh') + dw = te.reduce_axis((0, kernel_width), name='dw') + data_out = te.compute( (batch, out_channels, out_height, out_width), - lambda b, c, h, w: tvm.sum( + lambda b, c, h, w: te.sum( data[b, dc, h + dh, w + dw].astype(out_dtype) * kernel[dc, c, @@ -119,8 +120,8 @@ def schedule_conv2d_transpose_nchw(cfg, outs): s: Schedule The computation schedule for conv2d transpose. 
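Note: the conv2d_transpose hunk above shows the expression-level half of this migration: if_then_else, all, indexdiv, indexmod and const move from the top-level tvm namespace into tvm.tir. A minimal sketch of the same predicated-padding idiom in isolation (illustrative names and sizes, not from this patch):

    import tvm
    from tvm import te

    n = 8
    A = te.placeholder((n,), name="A")
    # one element of zero padding on each side, selected by a tir predicate
    pad = te.compute(
        (n + 2,),
        lambda i: tvm.tir.if_then_else(
            tvm.tir.all(i >= 1, i < n + 1),
            A[i - 1],
            tvm.tir.const(0.0, "float32")),
        name="data_pad")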
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _fallback_schedule(N, F, Y, X): # pylint: disable=unused-argument @@ -181,7 +182,7 @@ def _callback(op): ##### space definition end ##### - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -206,21 +207,21 @@ def _callback(op): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(bf, tvm.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(bf, te.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf if cfg["fuse_yx"].val: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(tf, tvm.thread_axis("threadIdx.y")) + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(tf, te.thread_axis("threadIdx.y")) tyx = s[output].fuse(ty, tx) - s[output].bind(s[output].fuse(ty, tx), tvm.thread_axis("threadIdx.x")) + s[output].bind(s[output].fuse(ty, tx), te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tyx) # number of threads @@ -228,9 +229,9 @@ def _callback(op): n_ty = cfg["tile_f"].size[2] n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] else: - s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) # number of threads @@ -254,9 +255,9 @@ def _callback(op): tz, fused = s[load].split(fused, nparts=n_tz) ty, fused = s[load].split(fused, nparts=n_ty) tx, fused = s[load].split(fused, nparts=n_tx) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 6e09be97390c..881f63aef781 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ b/topi/python/topi/cuda/conv2d_winograd.py @@ -19,6 +19,7 @@ import logging import tvm +from tvm import te from tvm import autotvm from .. 
import nn @@ -75,50 +76,50 @@ def winograd_cuda(cfg, data, kernel, strides, padding, dilation, out_dtype, # transform kernel if not pre_computed: - r_kh = tvm.reduce_axis((0, KH), name='r_kh') - r_kw = tvm.reduce_axis((0, KW), name='r_kw') - kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co: - tvm.sum(kernel[co][ci][r_kh][r_kw] * - G[eps][r_kh] * G[nu][r_kw], - axis=[r_kh, r_kw]), name='kernel_pack') + r_kh = te.reduce_axis((0, KH), name='r_kh') + r_kw = te.reduce_axis((0, KW), name='r_kw') + kernel_pack = te.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co: + te.sum(kernel[co][ci][r_kh][r_kw] * + G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), name='kernel_pack') else: kernel_pack = kernel - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod # pack input tile - input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu: - data_pad[idxdiv(p, (nH * nW))][c][idxmod(idxdiv(p, nW), nH) * m + eps] - [idxmod(p, nW) * m + nu], name='d') + input_tile = te.compute((CI, P, alpha, alpha), lambda c, p, eps, nu: + data_pad[idxdiv(p, (nH * nW))][c][idxmod(idxdiv(p, nW), nH) * m + eps] + [idxmod(p, nW) * m + nu], name='d') # transform data - r_a = tvm.reduce_axis((0, alpha), 'r_a') - r_b = tvm.reduce_axis((0, alpha), 'r_a') - data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: - tvm.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], - axis=[r_a, r_b]), name='data_pack') + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_a') + data_pack = te.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: + te.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], + axis=[r_a, r_b]), name='data_pack') # do batch gemm - ci = tvm.reduce_axis((0, CI), name='ci') - bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p: - tvm.sum(kernel_pack[eps][nu][ci][co] * - data_pack[eps][nu][ci][p], - axis=[ci]), name='bgemm') + ci = te.reduce_axis((0, CI), name='ci') + bgemm = te.compute((alpha, alpha, CO, P), lambda eps, nu, co, p: + te.sum(kernel_pack[eps][nu][ci][co] * + data_pack[eps][nu][ci][p], + axis=[ci]), name='bgemm') # inverse transform - r_a = tvm.reduce_axis((0, alpha), 'r_a') - r_b = tvm.reduce_axis((0, alpha), 'r_a') - inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw: - tvm.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], - axis=[r_a, r_b]), name='inverse') + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_a') + inverse = te.compute((CO, P, m, m), lambda co, p, vh, vw: + te.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], + axis=[r_a, r_b]), name='inverse') # output - output = tvm.compute((N, CO, H, W), lambda n, co, h, w: - inverse[co, - n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - idxmod(h, m), - idxmod(w, m)], - name='output', tag='conv2d_nchw_winograd') + output = te.compute((N, CO, H, W), lambda n, co, h, w: + inverse[co, + n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), + idxmod(h, m), + idxmod(w, m)], + name='output', tag='conv2d_nchw_winograd') cfg.add_flop(2 * N * CO * H * W * CI * KH * KW) return output @@ -147,8 +148,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): fused = s[data_pack].fuse(c, p) bb, tt = s[data_pack].split(fused, 128) s[data_pack].reorder(bb, tt, pi, eps, nu) - s[data_pack].bind(bb, tvm.thread_axis("blockIdx.x")) - s[data_pack].bind(tt, tvm.thread_axis("threadIdx.x")) + s[data_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[data_pack].bind(tt, 
te.thread_axis("threadIdx.x")) s[data_l].compute_at(s[data_pack], pi) s[input_tile].compute_at(s[data_pack], pi) @@ -172,12 +173,12 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): fused = s[kernel_pack].fuse(ci, co) bb, tt = s[kernel_pack].split(fused, 128) s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b) - s[kernel_pack].bind(bb, tvm.thread_axis("blockIdx.x")) - s[kernel_pack].bind(tt, tvm.thread_axis("threadIdx.x")) + s[kernel_pack].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_pack].bind(tt, te.thread_axis("threadIdx.x")) else: kernel = kernel_pack - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() ##### space definition begin ##### @@ -213,15 +214,15 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b) by, vy, ty, yi = cfg["tile_y"].apply(s, C, y) bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x) - s[C].bind(bz, tvm.thread_axis("blockIdx.z")) - s[C].bind(by, tvm.thread_axis("blockIdx.y")) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(vz, tvm.thread_axis("vthread")) - s[C].bind(vy, tvm.thread_axis("vthread")) - s[C].bind(vx, tvm.thread_axis("vthread")) - s[C].bind(tz, tvm.thread_axis("threadIdx.z")) - s[C].bind(ty, tvm.thread_axis("threadIdx.y")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(bz, te.thread_axis("blockIdx.z")) + s[C].bind(by, te.thread_axis("blockIdx.y")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(vz, te.thread_axis("vthread")) + s[C].bind(vy, te.thread_axis("vthread")) + s[C].bind(vx, te.thread_axis("vthread")) + s[C].bind(tz, te.thread_axis("threadIdx.z")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi) # tile reduction axes @@ -241,9 +242,9 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) fused, tz = s[load].split(fused, cfg["tile_b"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[C].pragma(bgemm_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[C].pragma(bgemm_scope, 'unroll_explicit', cfg['unroll_explicit'].val) @@ -264,8 +265,8 @@ def schedule_winograd_cuda(cfg, s, output, pre_computed): fused = s[output].fuse(n, co, ho, wo) bb, tt = s[output].split(fused, 128) - s[output].bind(bb, tvm.thread_axis("blockIdx.x")) - s[output].bind(tt, tvm.thread_axis("threadIdx.x")) + s[output].bind(bb, te.thread_axis("blockIdx.x")) + s[output].bind(tt, te.thread_axis("threadIdx.x")) if OL is not None: s[OL].compute_at(s[output], tt) @@ -286,7 +287,7 @@ def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtyp @autotvm.register_topi_schedule("conv2d_nchw_winograd.cuda") def schedule_conv2d_nchw_winograd(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'conv2d_nchw_winograd' in op.tag: @@ -306,7 +307,7 @@ def conv2d_nchw_winograd_without_weight_transform(cfg, data, kernel, strides, 
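Note: nearly every hunk in these TOPI files applies one mechanical mapping: tvm.create_schedule -> te.create_schedule, tvm.thread_axis -> te.thread_axis, tvm.compute / tvm.sum / tvm.reduce_axis -> their te counterparts, and tvm.tensor.ComputeOp -> te.tensor.ComputeOp (equivalently tvm.te.ComputeOp). A self-contained sketch of the new spellings (illustrative names, not from this patch):

    import tvm
    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")

    s = te.create_schedule(B.op)                  # was tvm.create_schedule
    bx, tx = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(bx, te.thread_axis("blockIdx.x"))   # was tvm.thread_axis
    s[B].bind(tx, te.thread_axis("threadIdx.x"))
    print(tvm.lower(s, [A, B], simple_mode=True))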
@autotvm.register_topi_schedule("conv2d_nchw_winograd_without_weight_transform.cuda") def schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs): """TOPI schedule callback""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'conv2d_nchw_winograd' in op.tag: diff --git a/topi/python/topi/cuda/conv3d.py b/topi/python/topi/cuda/conv3d.py index 0a6a71ccc2f0..cc13aa511612 100644 --- a/topi/python/topi/cuda/conv3d.py +++ b/topi/python/topi/cuda/conv3d.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-argument """Compute definition for conv3d with cuda backend""" -import tvm +from tvm import te from tvm import autotvm from tvm.contrib import cudnn @@ -34,10 +34,10 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype='float cfg: ConfigEntity The config for this template - data : tvm.Tensor + data : tvm.te.Tensor 5-D with shape [batch, in_channel, in_depth, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] strides : int or a list/tuple of three ints @@ -54,7 +54,7 @@ def conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, out_dtype='float Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ return nn.conv3d_ncdhw(data, kernel, strides, padding, dilation, out_dtype) @@ -78,8 +78,8 @@ def schedule_conv3d_ncdhw(cfg, outs): s: Schedule The computation schedule for conv2d. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv3d_ncdhw': @@ -96,10 +96,10 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 5-D with shape [batch, in_depth, in_height, in_width, in_channel] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of three ints @@ -113,7 +113,7 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype='float Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 5-D with shape [batch, out_depth, out_height, out_width, out_channel] """ return nn.conv3d_ndhwc(data, kernel, strides, padding, dilation, out_dtype) @@ -137,8 +137,8 @@ def schedule_conv3d_ndhwc(cfg, outs): s: Schedule The computation schedule for conv2d. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'conv3d_ndhwc': @@ -159,10 +159,10 @@ def conv3d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', cfg: ConfigEntity The config for this template - data : tvm.Tensor + data : tvm.te.Tensor 5-D with shape [batch, in_channel, in_depth, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] strides : int or a list/tuple of three ints @@ -182,7 +182,7 @@ def conv3d_cudnn(cfg, data, kernel, strides, padding, dilation, layout='NCDHW', Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ if layout == 'NCDHW': diff --git a/topi/python/topi/cuda/conv3d_direct.py b/topi/python/topi/cuda/conv3d_direct.py index fa6c8781b5d3..50b73d618995 100644 --- a/topi/python/topi/cuda/conv3d_direct.py +++ b/topi/python/topi/cuda/conv3d_direct.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """The templates for cuda conv3d operators""" import tvm +from tvm import te from tvm import autotvm from ..util import get_const_tuple @@ -57,7 +58,7 @@ def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name): pad_data, kernel = s[conv].op.input_tensors s[pad_data].compute_inline() - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -84,16 +85,16 @@ def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name): bf = s[output].fuse(n, bf) s[output].reorder(bf, bd, by, bx, vf, vd, vy, vx, tf, td, ty, tx, fi, di, yi, xi) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(s[output].fuse(bd, by), tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vd, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(s[output].fuse(td, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(s[output].fuse(bd, by), te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vd, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(s[output].fuse(td, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) # tile reduction axes @@ -116,9 +117,9 @@ def schedule_direct_conv3d_cuda(cfg, s, conv, layout, workload_name): td, fused = s[load].split(fused, nparts=cfg["tile_d"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(s[load].fuse(td, ty), tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + 
s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(s[load].fuse(td, ty), te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) diff --git a/topi/python/topi/cuda/deformable_conv2d.py b/topi/python/topi/cuda/deformable_conv2d.py index bdec4e120fe4..8c31835e3d6d 100644 --- a/topi/python/topi/cuda/deformable_conv2d.py +++ b/topi/python/topi/cuda/deformable_conv2d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name,unused-argument """Schedule template of deformable conv2d with cuda backend""" import tvm +from tvm import te from tvm import autotvm from .. import nn from ..util import traverse_inline @@ -46,8 +47,8 @@ def schedule_deformable_conv2d_nchw(cfg, outs): s: Schedule The computation schedule for conv2d. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'deformable_conv2d_nchw': @@ -78,7 +79,7 @@ def _schedule_direct_cuda(cfg, s, conv): data_deform, kernel = s[conv].op.input_tensors s[data_deform].compute_inline() - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -102,15 +103,15 @@ def _schedule_direct_cuda(cfg, s, conv): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) bf = s[output].fuse(n, bf) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(tf, tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) @@ -135,9 +136,9 @@ def _schedule_direct_cuda(cfg, s, conv): tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) diff --git a/topi/python/topi/cuda/dense.py b/topi/python/topi/cuda/dense.py index 93797a4b49ba..f5b6563fbf09 100644 --- a/topi/python/topi/cuda/dense.py +++ b/topi/python/topi/cuda/dense.py @@ -16,9 +16,8 @@ # under the License. 
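Note: the dense.py hunks below port several reduction computes. For reference, the reduce_axis/sum pattern they rewrite, as a standalone sketch (hypothetical sizes, not from this patch):

    import tvm
    from tvm import te

    M, N, K = 64, 64, 64
    data = te.placeholder((M, K), name="data")
    weight = te.placeholder((N, K), name="weight")
    k = te.reduce_axis((0, K), name="k")          # was tvm.reduce_axis
    dense = te.compute(
        (M, N),
        lambda i, j: te.sum(data[i, k] * weight[j, k], axis=k),
        name="dense")                             # was tvm.compute / tvm.sum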
# pylint: disable=invalid-name, unused-argument """Schedule for dense operator""" -from __future__ import absolute_import as _abs import logging -import tvm +from tvm import te import tvm.autotvm as autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cublas @@ -45,9 +44,9 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): matmul = cublas.matmul(data, weight, False, True) cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: - matmul = tvm.compute((batch, out_dim), - lambda i, j: matmul[i, j] + bias[j], - tag=tag.BROADCAST) + matmul = te.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j], + tag=tag.BROADCAST) return matmul @@ -66,8 +65,8 @@ def dense_small_batch(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_small_batch.cuda") def schedule_dense_small_batch(cfg, outs): """Schedule float32/64 dense with small batch size""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'dense': @@ -91,11 +90,11 @@ def _schedule_dense_small_batch(cfg, s, C): else: Out = s.outputs[0].output(0) s[C].compute_at(s[Out], s[Out].op.axis[1]) - s[Out].bind(s[Out].op.axis[0], tvm.thread_axis("blockIdx.y")) - s[Out].bind(s[Out].op.axis[1], tvm.thread_axis("blockIdx.x")) + s[Out].bind(s[Out].op.axis[0], te.thread_axis("blockIdx.y")) + s[Out].bind(s[Out].op.axis[1], te.thread_axis("blockIdx.x")) tx = s[C].op.reduce_axis[0] - thread_x = tvm.thread_axis("threadIdx.x") + thread_x = te.thread_axis("threadIdx.x") s[C].bind(tx, thread_x) s[CF].compute_at(s[C], tx) s[C].set_store_predicate(thread_x.var.equal(0)) @@ -111,8 +110,8 @@ def dense_large_batch(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_large_batch.cuda") def schedule_dense_large_batch(cfg, outs): """Schedule float32/64 dense with large batch size""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'dense': @@ -185,12 +184,12 @@ def _schedule_dense_large_batch(cfg, s, C): s[CC].compute_at(s[C], tx) # Binding - s[C].bind(by, tvm.thread_axis("blockIdx.y")) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tyz, tvm.thread_axis("vthread")) - s[C].bind(txz, tvm.thread_axis("vthread")) - s[C].bind(ty, tvm.thread_axis("threadIdx.y")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(by, te.thread_axis("blockIdx.y")) + s[C].bind(bx, te.thread_axis("blockIdx.x")) + s[C].bind(tyz, te.thread_axis("vthread")) + s[C].bind(txz, te.thread_axis("vthread")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) # Split reduction yo, xo = CC.op.axis @@ -207,8 +206,8 @@ def _schedule_dense_large_batch(cfg, s, C): ty, _ = s[AA].split(s[AA].op.axis[0], nparts=num_thread_x) _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread_x * 4) tx, xi = s[AA].split(xi, nparts=num_thread_x) - s[AA].bind(ty, tvm.thread_axis("threadIdx.y")) - s[AA].bind(tx, tvm.thread_axis("threadIdx.x")) + s[AA].bind(ty, te.thread_axis("threadIdx.y")) + s[AA].bind(tx, te.thread_axis("threadIdx.x")) s[AA].double_buffer() # Schedule for B' shared memory load @@ -216,8 +215,8 @@ def 
_schedule_dense_large_batch(cfg, s, C): ty, _ = s[BB].split(s[BB].op.axis[0], nparts=num_thread_y) _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread_y * 4) tx, xi = s[BB].split(xi, nparts=num_thread_y) - s[BB].bind(ty, tvm.thread_axis("threadIdx.y")) - s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) + s[BB].bind(ty, te.thread_axis("threadIdx.y")) + s[BB].bind(tx, te.thread_axis("threadIdx.x")) s[BB].double_buffer() @@ -229,19 +228,19 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): batch, in_dim = get_const_tuple(data.shape) out_dim, _ = get_const_tuple(weight.shape) - k = tvm.reduce_axis((0, in_dim), name='k') + k = te.reduce_axis((0, in_dim), name='k') - matmul = tvm.compute((batch, out_dim), - lambda i, j: tvm.sum(data[i, k].astype(out_dtype) * - weight[j, k].astype(out_dtype), axis=[k]), - tag="dense_int8") + matmul = te.compute((batch, out_dim), + lambda i, j: te.sum(data[i, k].astype(out_dtype) * + weight[j, k].astype(out_dtype), axis=[k]), + tag="dense_int8") cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: - matmul = tvm.compute((batch, out_dim), - lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) + matmul = te.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) cfg.add_flop(batch * out_dim) return matmul @@ -250,8 +249,8 @@ def dense_int8(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_int8.cuda") def schedule_dense_int8(cfg, outs): """Dense schedule for int8 on CUDA""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if "dense_int8" in op.tag: @@ -302,12 +301,12 @@ def _schedule_dense_int8(cfg, s, output): bx, vx, tx, xi = cfg['tile_x'].apply(s, output, x) s[output].reorder(by, bx, vy, vx, ty, tx, yi, xi) - s[output].bind(by, tvm.thread_axis('blockIdx.y')) - s[output].bind(bx, tvm.thread_axis('blockIdx.x')) - s[output].bind(vy, tvm.thread_axis('vthread')) - s[output].bind(vx, tvm.thread_axis('vthread')) - s[output].bind(ty, tvm.thread_axis('threadIdx.y')) - s[output].bind(tx, tvm.thread_axis('threadIdx.x')) + s[output].bind(by, te.thread_axis('blockIdx.y')) + s[output].bind(bx, te.thread_axis('blockIdx.x')) + s[output].bind(vy, te.thread_axis('vthread')) + s[output].bind(vx, te.thread_axis('vthread')) + s[output].bind(ty, te.thread_axis('threadIdx.y')) + s[output].bind(tx, te.thread_axis('threadIdx.x')) n_ty = cfg['tile_y'].size[2] n_tx = cfg['tile_x'].size[2] @@ -325,8 +324,8 @@ def _schedule_dense_int8(cfg, s, output): fused, tx = s[load].split(fused, factor=n_tx) fused, ty = s[load].split(fused, factor=n_ty) - s[load].bind(tx, tvm.thread_axis('threadIdx.x')) - s[load].bind(ty, tvm.thread_axis('threadIdx.y')) + s[load].bind(tx, te.thread_axis('threadIdx.x')) + s[load].bind(ty, te.thread_axis('threadIdx.y')) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', False) diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py index 062f95f00eff..db9da844e3af 100644 --- a/topi/python/topi/cuda/depthwise_conv2d.py +++ b/topi/python/topi/cuda/depthwise_conv2d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument """Schedule for depthwise_conv2d with auto fusion""" import tvm +from tvm import te from tvm 
import autotvm from ..util import traverse_inline from .. import tag @@ -43,8 +44,8 @@ def schedule_depthwise_conv2d_nchw(cfg, outs): s: Schedule The computation schedule for depthwise_conv2d nchw. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'depthwise_conv2d_nchw': @@ -75,7 +76,7 @@ def _callback(op): ##### space definition end ##### s[pad_data].compute_inline() - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -100,15 +101,15 @@ def _callback(op): kernel_scope, n = s[output].split(n, nparts=1) bf = s[output].fuse(n, bf) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(tf, tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) @@ -123,9 +124,9 @@ def _callback(op): fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) fused, tz = s[load].split(fused, cfg["tile_f"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val) @@ -147,8 +148,8 @@ def schedule_depthwise_conv2d_nhwc(outs): s: Schedule The computation schedule for depthwise_conv2d nhwc. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(temp, Filter, DepthwiseConv2d): s[temp].compute_inline() @@ -160,13 +161,13 @@ def _schedule(temp, Filter, DepthwiseConv2d): Output = outs[0].op.output(0) s[DepthwiseConv2d].set_scope("local") - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") b, h, w, c = s[Output].op.axis # num_thread here could be 728, it is larger than cuda.max_num_threads - num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value + num_thread = tvm.tir.ir_pass.Simplify(temp.shape[3]).value target = tvm.target.Target.current() if target and (target.target_name not in ["cuda", "nvptx"]): num_thread = target.max_num_threads @@ -199,13 +200,13 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule depthwise_conv2d if OP.tag == 'depthwise_conv2d_nhwc': PaddedInput = OP.input_tensors[0] Filter = OP.input_tensors[1] - if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag: + if isinstance(Filter.op, tvm.te.ComputeOp) and 'dilate' in Filter.op.tag: s[Filter].compute_inline() DepthwiseConv2d = OP.output(0) _schedule(PaddedInput, Filter, DepthwiseConv2d) @@ -231,14 +232,14 @@ def schedule_depthwise_conv2d_backward_input_nhwc(outs): The computation schedule for depthwise_conv2d backward wrt input with layout nhwc. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(Padded_out_grad, In_grad): s[Padded_out_grad].compute_inline() - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") _, h, w, c = In_grad.op.axis fused_hwc = s[In_grad].fuse(h, w, c) @@ -276,13 +277,13 @@ def schedule_depthwise_conv2d_backward_weight_nhwc(outs): The computation schedule for depthwise_conv2d backward wrt weight with layout nhwc. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(Weight_grad): - block_x = tvm.thread_axis("blockIdx.x") - thread_y = tvm.thread_axis("threadIdx.y") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_y = te.thread_axis("threadIdx.y") + thread_x = te.thread_axis("threadIdx.x") db, dh, dw = Weight_grad.op.reduce_axis diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py index 5abf2985273c..c5cf72b60843 100644 --- a/topi/python/topi/cuda/group_conv2d_nchw.py +++ b/topi/python/topi/cuda/group_conv2d_nchw.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """The template for cuda group_conv2d_nchw""" import tvm +from tvm import te from tvm import autotvm from .injective import schedule_injective_from_existing @@ -51,8 +52,8 @@ def schedule_group_conv2d_nchw(cfg, outs): s: Schedule The computation schedule for group conv2d. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == "group_conv2d_nchw": @@ -115,21 +116,21 @@ def _schedule_group_conv2d_nchw_direct(cfg, s, conv): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vg, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vg, te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf if cfg["fuse_yx"].val: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(tf, tvm.thread_axis("threadIdx.y")) + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(tf, te.thread_axis("threadIdx.y")) tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) + s[output].bind(tyx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tyx) # number of threads @@ -137,9 +138,9 @@ def _schedule_group_conv2d_nchw_direct(cfg, s, conv): n_ty = cfg["tile_f"].size[2] n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] else: - s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[OL].compute_at(s[output], tx) # number of threads @@ -165,9 +166,9 @@ 
def _schedule_group_conv2d_nchw_direct(cfg, s, conv): fused, tx = s[load].split(fused, factor=n_tx) fused, ty = s[load].split(fused, factor=n_ty) fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) @@ -185,11 +186,11 @@ def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] or 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] or 6-D with shape [num_filter_chunk, in_channel_chunk // groups, filter_height, filter_width, num_filter_block, in_channel_block] @@ -211,7 +212,7 @@ def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_height, out_width, out_channel_block] """ ic_block_factor = 4 @@ -230,11 +231,11 @@ def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups assert out_channels % oc_block_factor == 0, \ "Number of output channels per group must divide {}".format(oc_block_factor) - packed_data = tvm.compute((batch, channels // ic_block_factor, height, width, - ic_block_factor), - lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w], - name="packed_data") - packed_kernel = tvm.compute( + packed_data = te.compute((batch, channels // ic_block_factor, height, width, + ic_block_factor), + lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w], + name="packed_data") + packed_kernel = te.compute( (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w, oc_block_factor, ic_block_factor), lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: @@ -286,10 +287,10 @@ def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups oshape = (batch, oc_chunk, out_height, out_width, oc_block) - icc = tvm.reduce_axis((0, ic_chunk // groups), name='ic_chunk') - icb = tvm.reduce_axis((0, ic_block_factor), name='ic_block') - kh = tvm.reduce_axis((0, kernel_h), name='kh') - kw = tvm.reduce_axis((0, kernel_w), name='kw') + icc = te.reduce_axis((0, ic_chunk // groups), name='ic_chunk') + icb = te.reduce_axis((0, ic_block_factor), name='ic_block') + kh = te.reduce_axis((0, kernel_h), name='kh') + kw = te.reduce_axis((0, kernel_w), name='kw') # NOTE(kumasento): explanation of this snippet - # oc_chunk//groups and ic_chunk//groups give you the number of blocks, @@ -302,20 +303,20 @@ def group_conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, groups # # Compared with a normal convolution, group convolution only sums # input channels from the group that an output channel resides in. 
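Note: the group conv compute rewritten just below accumulates int8 operands into an int32 result via astype, the same widening pattern used by conv2d_NCHWc_int8 earlier in this patch. A toy version of that reduction (made-up shapes, not from this patch):

    import tvm
    from tvm import te

    n, k = 16, 32
    a = te.placeholder((n, k), dtype="int8", name="a")
    b = te.placeholder((n, k), dtype="int8", name="b")
    r = te.reduce_axis((0, k), name="r")
    acc = te.compute(
        (n,),
        lambda i: te.sum(a[i, r].astype("int32") * b[i, r].astype("int32"),
                         axis=r),
        name="acc")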
- conv = tvm.compute( + conv = te.compute( oshape, lambda n, occ, oh, ow, ocb: - tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, - oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] - .astype('int32') * - packed_kernel[occ, icc, kh, kw, ocb, icb].astype('int32'), - axis=[icc, kh, kw, icb])) + te.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc, + oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb] + .astype('int32') * + packed_kernel[occ, icc, kh, kw, ocb, icb].astype('int32'), + axis=[icc, kh, kw, icb])) # Type conversion - output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype), - tag='group_conv2d_NCHWc_int8') + output = te.compute(oshape, lambda *index: conv(*index).astype(out_dtype), + tag='group_conv2d_NCHWc_int8') num_flop = batch * oc_chunk * oc_block * out_height * out_width * \ - ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups + ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups cfg.add_flop(num_flop) return output @@ -339,8 +340,8 @@ def schedule_group_conv2d_NCHWc_int8(cfg, outs): s: Schedule The computation schedule for group conv2d. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == "group_conv2d_NCHWc_int8": @@ -361,7 +362,7 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): conv = output.op.input_tensors[0] packed_data, packed_kernel = conv.op.input_tensors - if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag: + if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag: pad_data = packed_data packed_data = pad_data.op.input_tensors[0] else: @@ -374,7 +375,7 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[packed_kernel].pragma( s[packed_kernel].op.axis[0], "debug_skip_region") else: - if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and \ + if isinstance(packed_kernel.op, tvm.te.ComputeOp) and \ packed_kernel.name == 'packed_kernel': # data and kernel are not pre-computed, schedule layout transform here schedule_injective_from_existing(s, packed_data) @@ -407,7 +408,7 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): kernel_scope, n = s[output].split(n, nparts=1) g, f = s[output].split(f, nparts=groups) - s[output].bind(n, tvm.thread_axis('blockIdx.z')) + s[output].bind(n, te.thread_axis('blockIdx.z')) bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n) bg, vg = cfg["tile_g"].apply(s, output, g) bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) @@ -416,20 +417,20 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi) - s[output].bind(bn, tvm.thread_axis("blockIdx.z")) - s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y")) - s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x")) - s[output].bind(vn, tvm.thread_axis("vthread")) - s[output].bind(vg, tvm.thread_axis("vthread")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) + s[output].bind(bn, te.thread_axis("blockIdx.z")) + s[output].bind(s[output].fuse(bg, bf), te.thread_axis("blockIdx.y")) + s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x")) + s[output].bind(vn, te.thread_axis("vthread")) + s[output].bind(vg, 
te.thread_axis("vthread")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf if cfg["fuse_yx"].val: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(tf, tvm.thread_axis("threadIdx.y")) + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(tf, te.thread_axis("threadIdx.y")) tyx = s[output].fuse(ty, tx) - s[output].bind(tyx, tvm.thread_axis("threadIdx.x")) + s[output].bind(tyx, te.thread_axis("threadIdx.x")) s[conv].compute_at(s[output], tyx) # number of threads @@ -437,10 +438,10 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): n_ty = cfg["tile_f"].size[2] n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2] else: - s[output].bind(tn, tvm.thread_axis("threadIdx.z")) - s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(tn, te.thread_axis("threadIdx.z")) + s[output].bind(s[output].fuse(tn, tf), te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[conv].compute_at(s[output], tx) # number of threads @@ -476,9 +477,9 @@ def _schedule_group_conv2d_NCHWc_int8(cfg, s, output): fused, tx = s[load].split(fused, factor=n_tx) fused, ty = s[load].split(fused, factor=n_ty) fused, tz = s[load].split(fused, factor=n_tz) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # double buffer cfg.define_knob('AA_double_buffer', [0, 1]) diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py index 1690407a1602..303fe5f7cc77 100644 --- a/topi/python/topi/cuda/injective.py +++ b/topi/python/topi/cuda/injective.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm +from tvm import te from .. import util def schedule_injective_from_existing(sch, out): @@ -56,12 +57,12 @@ def schedule_injective_from_existing(sch, out): xo, xi = sch[out].split(fused, factor=num_thread * max_block) bx, tx = sch[out].split(xi, factor=num_thread) sch[out].reorder(bx, tx, xo) - sch[out].bind(bx, tvm.thread_axis("blockIdx.x")) - sch[out].bind(tx, tvm.thread_axis("threadIdx.x")) + sch[out].bind(bx, te.thread_axis("blockIdx.x")) + sch[out].bind(tx, te.thread_axis("threadIdx.x")) else: bx, tx = sch[out].split(fused, factor=num_thread) - sch[out].bind(tx, tvm.thread_axis("threadIdx.x")) - sch[out].bind(bx, tvm.thread_axis("blockIdx.x")) + sch[out].bind(tx, te.thread_axis("threadIdx.x")) + sch[out].bind(bx, te.thread_axis("blockIdx.x")) return sch @@ -79,10 +80,10 @@ def schedule_injective(outs): sch: Schedule The computation schedule for the op. 
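Note: the schedule_injective hunk that follows also moves tvm.schedule.AutoInlineInjective to tvm.te.schedule.AutoInlineInjective. A small sketch of what that pass does for a chain of elementwise ops (illustrative names, not from this patch):

    import tvm
    from tvm import te

    A = te.placeholder((1024,), name="A")
    B = te.compute((1024,), lambda i: A[i] * 2.0, name="B")
    C = te.compute((1024,), lambda i: B[i] + 1.0, name="C")

    s = te.create_schedule(C.op)
    te.schedule.AutoInlineInjective(s)  # inlines the injective producer B into C
    bx, tx = s[C].split(C.op.axis[0], factor=64)
    s[C].bind(bx, te.thread_axis("blockIdx.x"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))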
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + tvm.te.schedule.AutoInlineInjective(s) for out in outs: if not util.is_empty_shape(out.shape): schedule_injective_from_existing(s, out) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 27a52724fb2d..e008dcdb1ce2 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -19,20 +19,20 @@ """Non-maximum suppression operator""" import math import tvm +from tvm import te -from tvm import api -from tvm.intrin import if_then_else +from tvm.tir import if_then_else from .sort import argsort from .. import tag def cuda_atomic_add_rule(op): if op.dtype == "float32": - return tvm.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) + return tvm.tir.call_pure_extern("float32", "atomicAdd", op.args[0], op.args[1]) if op.dtype == "float64": - return tvm.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) + return tvm.tir.call_pure_extern("float64", "atomicAdd", op.args[0], op.args[1]) if op.dtype == "int32": - return tvm.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) + return tvm.tir.call_pure_extern("int32", "atomicAdd", op.args[0], op.args[1]) raise RuntimeError("only support int32, float32 and float64") @@ -41,7 +41,7 @@ def cuda_atomic_add_rule(op): def atomic_add(x, y): - return tvm.call_pure_intrin(y.dtype, "atomic_add", x, y) + return tvm.tir.call_pure_intrin(y.dtype, "atomic_add", x, y) def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, score_index): @@ -78,7 +78,7 @@ def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, scor num_anchors = data.shape[1] elem_length = data.shape[2] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) @@ -86,22 +86,22 @@ def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, scor flag = ib.buffer_ptr(flag) atomic_add_return = ib.allocate( valid_count.dtype, (1,), name='atomic_add_return', scope='local') - one_count = tvm.const(1, dtype=valid_count.dtype) - score_threshold = tvm.make.node( + one_count = tvm.tir.const(1, dtype=valid_count.dtype) + score_threshold = tvm.ir.make_node( "FloatImm", dtype="float32", value=score_threshold) - id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) - score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) + id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index) + score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index) max_threads = int(tvm.target.Target.current( allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv + idxd = tvm.tir.indexdiv # initialize valid_count with ib.if_scope(tid < batch_size): @@ -111,11 +111,12 @@ def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, scor flag[tid] = 0 with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) - with ib.if_scope(tvm.all(data[tid * elem_length + score_index] > 
score_threshold, - tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): + with ib.if_scope( + tvm.tir.all(data[tid * elem_length + score_index] > score_threshold, + tvm.tir.any(id_index < 0, data[tid * elem_length + id_index] >= 0))): flag[tid] = 1 - atomic_add_return[0] = atomic_add(tvm.call_pure_intrin("handle", "tvm_address_of", - valid_count[i]), one_count) + atomic_add_return[0] = atomic_add(tvm.tir.call_pure_intrin("handle", "tvm_address_of", + valid_count[i]), one_count) return ib.get() @@ -140,7 +141,7 @@ def flag_scan(flag, prefix_sum): batch_size = flag.shape[0] num_anchors = flag.shape[1] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() flag = ib.buffer_ptr(flag) prefix_sum = ib.buffer_ptr(prefix_sum) @@ -149,13 +150,13 @@ def flag_scan(flag, prefix_sum): allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod # initialize prefix_sum with ib.if_scope(tid < batch_size * num_anchors): @@ -200,9 +201,9 @@ def out_rewrite(data, flag, prefix_sum, valid_count, out): num_anchors = out.shape[1] elem_length = out.shape[2] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() - one = tvm.const(1, dtype=out.dtype) + one = tvm.tir.const(1, dtype=out.dtype) data = ib.buffer_ptr(data) flag = ib.buffer_ptr(flag) valid_count = ib.buffer_ptr(valid_count) @@ -213,20 +214,20 @@ def out_rewrite(data, flag, prefix_sum, valid_count, out): allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = batch_size * num_anchors // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) base_idx = i * num_anchors * elem_length - with ib.if_scope(tvm.all(flag[tid] > 0, prefix_sum[tid] >= 0, - prefix_sum[tid] < num_anchors)): + with ib.if_scope(tvm.tir.all(flag[tid] > 0, prefix_sum[tid] >= 0, + prefix_sum[tid] < num_anchors)): with ib.for_range(0, elem_length) as k: out[base_idx + prefix_sum[tid] * elem_length + k] = data[tid * elem_length + k] @@ -243,7 +244,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length]. score_threshold : optional, float @@ -257,28 +258,28 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): Returns ------- - valid_count : tvm.Tensor + valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. - out_tensor : tvm.Tensor + out_tensor : tvm.te.Tensor Rearranged data tensor. 
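The IR-builder kernels in this file follow the same mechanical moves: `tvm.ir_builder` becomes `tvm.tir.ir_builder` and thread axes come from `te`. A standalone sketch of that pattern, with made-up thread extents:

.. code-block:: python

    import tvm
    from tvm import te

    ib = tvm.tir.ir_builder.create()        # was tvm.ir_builder.create()
    tx = te.thread_axis("threadIdx.x")      # was tvm.thread_axis(...)
    bx = te.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", 256)
    ib.scope_attr(bx, "thread_extent", 64)
    tid = bx * 256 + tx
    tmp = ib.allocate("int32", (1,), name="tmp", scope="local")
    tmp[0] = tid
    stmt = ib.get()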
""" batch_size = data.shape[0] num_anchors = data.shape[1] - data_buf = api.decl_buffer( + data_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "data_buf", data_alignment=8) - valid_count_buf = api.decl_buffer( + valid_count_buf = tvm.tir.decl_buffer( (batch_size,), "int32", "valid_count_buf", data_alignment=8) - temp_flag_buf = api.decl_buffer( + temp_flag_buf = tvm.tir.decl_buffer( (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8) - temp_partial_buf = api.decl_buffer( + temp_partial_buf = tvm.tir.decl_buffer( (batch_size, num_anchors), "int32", "temp_partial", data_alignment=8) - out_buf = api.decl_buffer( + out_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "out_buf", data_alignment=8) valid_count, temp_flag = \ - tvm.extern([(batch_size,), (batch_size, num_anchors)], [data], - lambda ins, outs: get_valid_counts_ir( + te.extern([(batch_size,), (batch_size, num_anchors)], [data], + lambda ins, outs: get_valid_counts_ir( ins[0], outs[0], outs[1], score_threshold, id_index, score_index), dtype=["int32", "int32"], in_buffers=[data_buf], @@ -287,8 +288,8 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): tag="get_valid_counts_gpu") temp_partial = \ - tvm.extern([(batch_size, num_anchors)], [temp_flag], - lambda ins, outs: flag_scan( + te.extern([(batch_size, num_anchors)], [temp_flag], + lambda ins, outs: flag_scan( ins[0], outs[0]), dtype=["int32"], in_buffers=[temp_flag_buf], @@ -296,8 +297,8 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): name="flag_scan") out = \ - tvm.extern([data.shape], [data, temp_flag, temp_partial, valid_count], - lambda ins, outs: out_rewrite( + te.extern([data.shape], [data, temp_flag, temp_partial, valid_count], + lambda ins, outs: out_rewrite( ins[0], ins[1], ins[2], ins[3], outs[0]), dtype=[data.dtype], in_buffers=[data_buf, temp_flag_buf, @@ -357,22 +358,22 @@ def nms_ir(data, sorted_index, valid_count, out, box_indices, def calculate_overlap(out_tensor, box_a_idx, box_b_idx): """Calculate overlap of two boxes. 
""" - w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx])) - h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) + w = tvm.te.max(0.0, tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) + - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx])) + h = tvm.te.max(0.0, tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) + - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) i = w * h u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i - return tvm.expr.Select(u <= 0.0, 0.0, i / u) + return tvm.tir.Select(u <= 0.0, 0.0, i / u) batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) @@ -386,27 +387,27 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) j = bx * max_threads + tx - iou_threshold = tvm.make.node( + iou_threshold = tvm.ir.make_node( "FloatImm", dtype="float32", value=iou_threshold) - top_k = tvm.make.node("IntImm", dtype="int32", value=top_k) - coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start) - id_index = tvm.make.node("IntImm", dtype="int32", value=id_index) - score_index = tvm.make.node("IntImm", dtype="int32", value=score_index) - force_suppress = tvm.make.node( + top_k = tvm.ir.make_node("IntImm", dtype="int32", value=top_k) + coord_start = tvm.ir.make_node("IntImm", dtype="int32", value=coord_start) + id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index) + score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index) + force_suppress = tvm.ir.make_node( "IntImm", dtype="int32", value=1 if force_suppress else 0) with ib.for_range(0, batch_size, for_type="unroll") as i: base_idx = i * num_anchors * box_data_length - with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)): + with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output nkeep = if_then_else( - tvm.all(top_k > 0, top_k < valid_count[i]), + tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) with ib.if_scope(j < nkeep): with ib.for_range(0, box_data_length) as k: @@ -415,7 +416,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): * box_data_length + k)] box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j] - with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])): + with ib.if_scope(tvm.tir.all(top_k > 0, top_k < valid_count[i])): with ib.if_scope(j < valid_count[i] - nkeep): with ib.for_range(0, box_data_length) as k: out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0 @@ -423,19 +424,21 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): # Apply nms with ib.for_range(0, valid_count[i]) as k: offset_k = k * box_data_length - with 
ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0, - tvm.any(id_index < 0, out[base_idx + - offset_k + id_index] >= 0))): + with ib.if_scope( + tvm.tir.all(out[base_idx + offset_k + score_index] > 0, + tvm.tir.any(id_index < 0, out[base_idx + + offset_k + id_index] >= 0))): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length - with ib.if_scope(tvm.all(j > k, - out[base_idx + offset_j + - score_index] > 0, - tvm.any(id_index < 0, - out[base_idx + offset_j + id_index] >= 0), - tvm.any(force_suppress > 0, id_index < 0, - out[base_idx + offset_k + id_index] == - out[base_idx + offset_j + id_index]))): + with ib.if_scope( + tvm.tir.all(j > k, + out[base_idx + offset_j + + score_index] > 0, + tvm.tir.any(id_index < 0, + out[base_idx + offset_j + id_index] >= 0), + tvm.tir.any(force_suppress > 0, id_index < 0, + out[base_idx + offset_k + id_index] == + out[base_idx + offset_j + id_index]))): iou = calculate_overlap(out, base_idx + offset_j + coord_start, base_idx + offset_k + coord_start) with ib.if_scope(iou >= iou_threshold): @@ -495,7 +498,7 @@ def invalid_to_bottom_pre(data, flag, idx): num_anchors = data.shape[1] elem_length = data.shape[2] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) flag = ib.buffer_ptr(flag) @@ -505,8 +508,8 @@ def invalid_to_bottom_pre(data, flag, idx): tvm.target.Target.current(allow_none=False).max_num_threads)) nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) j = bx * max_threads + tx @@ -554,7 +557,7 @@ def invalid_to_bottom_ir(data, flag, idx, out): num_anchors = data.shape[1] elem_length = data.shape[2] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) flag = ib.buffer_ptr(flag) @@ -565,8 +568,8 @@ def invalid_to_bottom_ir(data, flag, idx, out): tvm.target.Target.current(allow_none=False).max_num_threads)) nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) j = bx * max_threads + tx @@ -591,12 +594,12 @@ def non_max_suppression(data, valid_count, max_output_size=-1, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, elem_length]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. - valid_count : tvm.Tensor + valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. max_output_size : optional, int @@ -629,7 +632,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1, Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, elem_length]. 
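The extern stages in this file now pair `te.extern` with buffers declared through `tvm.tir.decl_buffer` (previously `api.decl_buffer`). A minimal sketch of that pairing, using a hypothetical copy kernel rather than the NMS IR:

.. code-block:: python

    import tvm
    from tvm import te

    def _copy_ir(src, dst):
        ib = tvm.tir.ir_builder.create()
        src = ib.buffer_ptr(src)
        dst = ib.buffer_ptr(dst)
        with ib.for_range(0, 16) as i:
            dst[i] = src[i]
        return ib.get()

    data = te.placeholder((16,), name="data")
    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype,
                                   "data_buf", data_alignment=8)
    out = te.extern([data.shape], [data],
                    lambda ins, outs: _copy_ir(ins[0], outs[0]),
                    dtype=[data.dtype], in_buffers=[data_buf],
                    name="copy_extern")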
Example @@ -638,8 +641,8 @@ def non_max_suppression(data, valid_count, max_output_size=-1, # An example to use nms dshape = (1, 5, 6) - data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + data = te.placeholder(dshape, name="data") + valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count") iou_threshold = 0.7 force_suppress = True top_k = -1 @@ -659,63 +662,63 @@ def non_max_suppression(data, valid_count, max_output_size=-1, num_anchors = data.shape[1] valid_count_dtype = "int32" - valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, - "valid_count_buf", data_alignment=4) + valid_count_buf = tvm.tir.decl_buffer(valid_count.shape, valid_count_dtype, + "valid_count_buf", data_alignment=4) score_axis = score_index score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute( + score_tensor = te.compute( score_shape, lambda i, j: data[i, j, score_axis], tag=tag.ELEMWISE) sort_tensor = argsort( score_tensor, valid_count=valid_count, axis=1, is_ascend=False) - sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype, - "sort_tensor_buf", data_alignment=8) + sort_tensor_buf = tvm.tir.decl_buffer(sort_tensor.shape, sort_tensor.dtype, + "sort_tensor_buf", data_alignment=8) - data_buf = api.decl_buffer( + data_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "data_buf", data_alignment=8) - out_buf = api.decl_buffer( + out_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "out_buf", data_alignment=8) out, box_indices = \ - tvm.extern([data.shape, score_shape], - [data, sort_tensor, valid_count], - lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], - max_output_size, iou_threshold, force_suppress, - top_k, coord_start, id_index, score_index), - dtype=[data.dtype, "int32"], - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], - name="nms", - tag="nms") + te.extern([data.shape, score_shape], + [data, sort_tensor, valid_count], + lambda ins, outs: nms_ir( + ins[0], ins[1], ins[2], outs[0], outs[1], + max_output_size, iou_threshold, force_suppress, + top_k, coord_start, id_index, score_index), + dtype=[data.dtype, "int32"], + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], + name="nms", + tag="nms") if return_indices: return box_indices if invalid_to_bottom: - output_buf = api.decl_buffer( + output_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "output_buf", data_alignment=8) - temp_flag_buf = api.decl_buffer( + temp_flag_buf = tvm.tir.decl_buffer( score_shape, valid_count_dtype, "temp_flag", data_alignment=8) - temp_idx_buf = api.decl_buffer( + temp_idx_buf = tvm.tir.decl_buffer( score_shape, valid_count_dtype, "temp_idx", data_alignment=8) - temp_flag, temp_idx = tvm.extern([score_shape, score_shape], [out], - lambda ins, outs: invalid_to_bottom_pre( - ins[0], outs[0], outs[1]), - dtype=["int32", "int32"], - in_buffers=[out_buf], - out_buffers=[ - temp_flag_buf, temp_idx_buf], - name="invalid_to_bottom_phase_one") - - output = tvm.extern([data.shape], [out, temp_flag, temp_idx], - lambda ins, outs: invalid_to_bottom_ir( - ins[0], ins[1], ins[2], outs[0]), - dtype=[data.dtype], - in_buffers=[out_buf, temp_flag_buf, temp_idx_buf], - out_buffers=[output_buf], - name="invalid_to_bottom", - tag="invalid_to_bottom") + temp_flag, temp_idx = te.extern([score_shape, score_shape], [out], + lambda ins, outs: invalid_to_bottom_pre( + ins[0], outs[0], outs[1]), + dtype=["int32", "int32"], + in_buffers=[out_buf], + out_buffers=[ + 
temp_flag_buf, temp_idx_buf], + name="invalid_to_bottom_phase_one") + + output = te.extern([data.shape], [out, temp_flag, temp_idx], + lambda ins, outs: invalid_to_bottom_ir( + ins[0], ins[1], ins[2], outs[0]), + dtype=[data.dtype], + in_buffers=[out_buf, temp_flag_buf, temp_idx_buf], + out_buffers=[output_buf], + name="invalid_to_bottom", + tag="invalid_to_bottom") return output return out diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py index 2bebd3912378..26c18eeaa306 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for pooling operators""" import tvm +from tvm import te from .. import tag from ..util import traverse_inline @@ -35,15 +36,15 @@ def schedule_adaptive_pool(outs): s: Schedule The computation schedule for adaptive_pool. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(Pool): num_thread = 8 - block_x = tvm.thread_axis("blockIdx.x") - block_y = tvm.thread_axis("blockIdx.y") - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") - thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") + block_x = te.thread_axis("blockIdx.x") + block_y = te.thread_axis("blockIdx.y") + thread_x = te.thread_axis((0, num_thread), "threadIdx.x") + thread_y = te.thread_axis((0, num_thread), "threadIdx.y") if Pool.op in s.outputs: Out = Pool OL = s.cache_write(Pool, "local") @@ -71,7 +72,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule global_pool elif OP.tag.startswith('adaptive_pool'): @@ -103,10 +104,10 @@ def schedule_pool(outs, layout): s: Schedule The computation schedule for pool. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(PaddedInput, Pool): - if isinstance(PaddedInput.op, tvm.tensor.ComputeOp): + if isinstance(PaddedInput.op, tvm.te.ComputeOp): s[PaddedInput].compute_inline() num_thread = tvm.target.Target.current(allow_none=False).max_num_threads if Pool.op in s.outputs: @@ -117,8 +118,8 @@ def _schedule(PaddedInput, Pool): s[Pool].set_scope("local") fused = s[Out].fuse(*s[Out].op.axis) bx, tx = s[Out].split(fused, factor=num_thread) - s[Out].bind(bx, tvm.thread_axis("blockIdx.x")) - s[Out].bind(tx, tvm.thread_axis("threadIdx.x")) + s[Out].bind(bx, te.thread_axis("blockIdx.x")) + s[Out].bind(tx, te.thread_axis("threadIdx.x")) if Pool.op in s.outputs: s[OL].compute_at(s[Out], tx) else: @@ -133,7 +134,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule pool elif OP.tag.startswith('pool'): @@ -163,8 +164,8 @@ def schedule_pool_grad(outs): s: Schedule The computation schedule for pool_grad. 
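The `isinstance` checks in these schedules move from `tvm.tensor` to `te.tensor`; the classes themselves are unchanged. A small sketch confirming the new spellings:

.. code-block:: python

    from tvm import te

    A = te.placeholder((4, 4), name="A")
    B = te.compute((4, 4), lambda i, j: A[i, j] * 2.0, name="B")
    assert isinstance(B.op, te.tensor.ComputeOp)      # was tvm.tensor.ComputeOp
    assert isinstance(A.op, te.tensor.PlaceholderOp)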
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule_pool_grad(op): if op in s.outputs: @@ -174,15 +175,15 @@ def _schedule_pool_grad(op): fused = s[out].fuse(*s[out].op.axis) num_thread = tvm.target.Target.current(allow_none=False).max_num_threads bx, tx = s[out].split(fused, factor=num_thread) - s[out].bind(bx, tvm.thread_axis("blockIdx.x")) - s[out].bind(tx, tvm.thread_axis("threadIdx.x")) + s[out].bind(bx, te.thread_axis("blockIdx.x")) + s[out].bind(tx, te.thread_axis("threadIdx.x")) if tag.COMM_REDUCE_IDX in op.input_tensors[0].op.tag: max_pool_index = op.input_tensors[0] s[max_pool_index].compute_at(s[out], tx) pool_input = max_pool_index.op.input_tensors[0] - if isinstance(pool_input.op, tvm.tensor.ComputeOp): + if isinstance(pool_input.op, tvm.te.ComputeOp): # handle padding s[pool_input].compute_inline() if op not in s.outputs: diff --git a/topi/python/topi/cuda/rcnn/proposal.py b/topi/python/topi/cuda/rcnn/proposal.py index 489c354e6cf3..3546448cd306 100644 --- a/topi/python/topi/cuda/rcnn/proposal.py +++ b/topi/python/topi/cuda/rcnn/proposal.py @@ -18,6 +18,7 @@ """Proposal operator""" import math import tvm +from tvm import te from ...vision.rcnn import generate_anchor, reg_bbox, reg_iou from ...util import get_const_tuple, get_const_int @@ -28,16 +29,16 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r Parameters ---------- - cls_prob_buf : tvm.schedule.Buffer + cls_prob_buf : tvm.te.schedule.Buffer 4-D with shape [batch, 2 * num_anchors, height, width] - bbox_pred_buf : tvm.schedule.Buffer + bbox_pred_buf : tvm.te.schedule.Buffer 4-D with shape [batch, 4 * num_anchors, height, width] - im_info_buf : tvm.schedule.Buffer + im_info_buf : tvm.te.schedule.Buffer 2-D with shape [batch, 3] - out_buf : tvm.schedule.Buffer + out_buf : tvm.te.schedule.Buffer 3-D with shape [batch, num_bbox, 5] The last dimension is in format of [w_start, h_start, w_end, h_end, score] @@ -67,10 +68,10 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = (batch * height * width) // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") tid = bx * max_threads + tx - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) @@ -79,8 +80,8 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r p_im_info = ib.buffer_ptr(im_info_buf) p_out = ib.buffer_ptr(out_buf) - idxm = tvm.indexmod - idxd = tvm.indexdiv + idxm = tvm.tir.indexmod + idxd = tvm.tir.indexdiv with ib.if_scope(tid < batch * height * width): w = idxm(tid, width) @@ -104,10 +105,10 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r regression_func = reg_iou if iou_loss else reg_bbox pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta) - pred_x1 = tvm.max(tvm.min(pred_x1, im_width - 1.0), 0.0) - pred_y1 = tvm.max(tvm.min(pred_y1, im_height - 1.0), 0.0) - pred_x2 = tvm.max(tvm.min(pred_x2, im_width - 1.0), 0.0) - pred_y2 = tvm.max(tvm.min(pred_y2, im_height - 1.0), 0.0) + pred_x1 = 
tvm.te.max(tvm.te.min(pred_x1, im_width - 1.0), 0.0) + pred_y1 = tvm.te.max(tvm.te.min(pred_y1, im_height - 1.0), 0.0) + pred_x2 = tvm.te.max(tvm.te.min(pred_x2, im_width - 1.0), 0.0) + pred_y2 = tvm.te.max(tvm.te.min(pred_y2, im_height - 1.0), 0.0) real_height = (im_height / feature_stride).astype('int32') real_width = (im_width / feature_stride).astype('int32') @@ -117,15 +118,15 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r min_size = p_im_info[b * 3 + 2] * rpn_min_size pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w] - pred_score = tvm.expr.Select(tvm.any(h >= real_height, w >= real_width), - -1.0, pred_score) + pred_score = tvm.tir.Select(tvm.tir.any(h >= real_height, w >= real_width), + -1.0, pred_score) p_out[out_index * 5 + 0] = pred_x1 p_out[out_index * 5 + 1] = pred_y1 p_out[out_index * 5 + 2] = pred_x2 p_out[out_index * 5 + 3] = pred_y2 p_out[out_index * 5 + 4] = pred_score - with ib.if_scope(tvm.any(bbox_w < min_size, bbox_h < min_size)): + with ib.if_scope(tvm.tir.any(bbox_w < min_size, bbox_h < min_size)): p_out[out_index * 5 + 0] -= min_size / 2.0 p_out[out_index * 5 + 1] -= min_size / 2.0 p_out[out_index * 5 + 2] += min_size / 2.0 @@ -140,10 +141,10 @@ def argsort_ir(data_buf, out_index_buf): Parameters ---------- - data_buf : tvm.schedule.Buffer + data_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox] - out_index_buf : tvm.schedule.Buffer + out_index_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox]. Indices of data in sorted order. Returns @@ -153,20 +154,20 @@ def argsort_ir(data_buf, out_index_buf): """ batch, num_bbox = get_const_tuple(data_buf.shape) max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() p_data = ib.buffer_ptr(data_buf) index_out = ib.buffer_ptr(out_index_buf) nthread_tx = max_threads nthread_bx = (num_bbox + 1) // 2 // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("vthread") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("vthread") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "virtual_thread", nthread_bx) tid = bx * nthread_tx + tx temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") - idxm = tvm.indexmod + idxm = tvm.tir.indexmod with ib.for_range(0, batch, for_type="unroll") as b: start = b * num_bbox @@ -177,16 +178,16 @@ def argsort_ir(data_buf, out_index_buf): with ib.for_range(0, num_bbox) as k: offset = start + 2 * tid + idxm(k, 2) with ib.if_scope( - tvm.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])): + tvm.tir.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1])): temp_data[0] = p_data[offset] p_data[offset] = p_data[offset + 1] p_data[offset + 1] = temp_data[0] temp_index[0] = index_out[offset] index_out[offset] = index_out[offset + 1] index_out[offset + 1] = temp_index[0] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) + ib.emit(tvm.tir.Call(None, 'tvm_storage_sync', + tvm.runtime.convert(['shared']), + tvm.tir.Call.Intrinsic, None, 0)) return ib.get() @@ -195,11 +196,11 @@ def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): Parameters ---------- - sorted_bbox_buf : tvm.schedule.Buffer + sorted_bbox_buf : tvm.te.schedule.Buffer 3-D with shape [batch, num_bbox, 5]. 
The last dimension is in format of [w_start, h_start, w_end, h_end, score]. - out_buf : tvm.schedule.Buffer + out_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed. nms_threshold : float @@ -213,10 +214,10 @@ def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): def calculate_overlap(out_tensor, box_a_idx, box_b_idx): """Calculate overlap of two boxes. """ - w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0) - h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0) + w = tvm.te.max(0.0, tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) + - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0) + h = tvm.te.max(0.0, tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) + - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0) i = w * h u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0) * \ (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0) + \ @@ -226,9 +227,9 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): batch, num_bbox = get_const_tuple(out_buf.shape) max_threads = int(math.sqrt(tvm.target.Target.current(allow_none=False).max_num_threads)) - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") - ib = tvm.ir_builder.create() + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib = tvm.tir.ir_builder.create() p_data = ib.buffer_ptr(sorted_bbox_buf) p_out = ib.buffer_ptr(out_buf) nthread_tx = max_threads @@ -241,13 +242,13 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): with ib.if_scope(i < num_bbox): p_out[base_idx + i] = False with ib.for_range(0, num_bbox - 1) as l: - with ib.if_scope(tvm.all(i < num_bbox, i > l, p_out[base_idx + l] == False)): + with ib.if_scope(tvm.tir.all(i < num_bbox, i > l, p_out[base_idx + l] == False)): iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5) with ib.if_scope(iou > nms_threshold): p_out[base_idx + i] = True - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) + ib.emit(tvm.tir.Call(None, 'tvm_storage_sync', + tvm.runtime.convert(['shared']), + tvm.tir.Call.Intrinsic, None, 0)) return ib.get() @@ -256,14 +257,14 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): Parameters ---------- - sorted_bbox_buf : tvm.schedule.Buffer + sorted_bbox_buf : tvm.te.schedule.Buffer 3-D with shape [batch, num_bbox, 5]. The last dimension is in format of [w_start, h_start, w_end, h_end, score]. - remove_mask_buf : tvm.schedule.Buffer + remove_mask_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed. - out_buf : tvm.schedule.Buffer + out_buf : tvm.te.schedule.Buffer 2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of [batch_index, w_start, h_start, w_end, h_end]. 
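The IoU arithmetic above illustrates the expression-level renames: `tvm.expr.Select` becomes `tvm.tir.Select`, and scalar `tvm.max`/`tvm.min` become `tvm.te.max`/`tvm.te.min`. A minimal sketch with hypothetical area variables:

.. code-block:: python

    import tvm
    from tvm import te

    u = te.var("u", dtype="float32")   # hypothetical union area
    i = te.var("i", dtype="float32")   # hypothetical intersection area
    iou = tvm.tir.Select(u <= 0.0, tvm.tir.const(0.0, "float32"),
                         i / u)                      # was tvm.expr.Select
    w = tvm.te.max(0.0, tvm.te.min(i, u))            # was tvm.max / tvm.min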
@@ -275,8 +276,8 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape) rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch nthread_tx = batch - tx = tvm.thread_axis("threadIdx.x") - ib = tvm.ir_builder.create() + tx = te.thread_axis("threadIdx.x") + ib = tvm.tir.ir_builder.create() ib.scope_attr(tx, "thread_extent", nthread_tx) i = ib.allocate('int32', (1,), 'i', scope='local') i[0] = 0 @@ -292,14 +293,14 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): with ib.if_scope(p_remove[b * num_bbox + j] == False): nkeep[0] += 1 with ib.if_scope(nkeep[0] > 0): - with ib.for_range(0, tvm.ceil( - tvm.const(rpn_post_nms_top_n, 'float32') / nkeep[0]).astype('int32')): + with ib.for_range(0, te.ceil( + tvm.tir.const(rpn_post_nms_top_n, 'float32') / nkeep[0]).astype('int32')): with ib.for_range(0, num_bbox) as j: offset_j = (b * num_bbox + j) * 5 offset_i = (b * rpn_post_nms_top_n + i[0]) * 5 - with ib.if_scope(tvm.all(i[0] < rpn_post_nms_top_n, - p_remove[(b*num_bbox+j)] == False)): - p_out[offset_i] = tvm.expr.Cast('float32', b) + with ib.if_scope(tvm.tir.all(i[0] < rpn_post_nms_top_n, + p_remove[(b*num_bbox+j)] == False)): + p_out[offset_i] = tvm.tir.Cast('float32', b) with ib.for_range(0, 4, for_type='unroll') as k: p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k] i[0] = i[0] + 1 @@ -314,13 +315,13 @@ def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, thres Parameters ---------- - cls_prob : tvm.Tensor + cls_prob : tvm.te.Tensor 4-D with shape [batch, 2 * num_anchors, height, width] - bbox_pred : tvm.Tensor + bbox_pred : tvm.te.Tensor 4-D with shape [batch, 4 * num_anchors, height, width] - im_info : tvm.Tensor + im_info : tvm.te.Tensor 2-D with shape [batch, 3] scales : list/tuple of float @@ -350,7 +351,7 @@ def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, thres Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of [batch_index, w_start, h_start, w_end, h_end]. 
""" @@ -360,20 +361,20 @@ def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, thres num_bbox = height * width * num_anchors rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox - bbox = tvm.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info], lambda ins, outs: - predict_bbox_ir(ins[0], ins[1], ins[2], outs[0], scales, ratios, - feature_stride, rpn_min_size, iou_loss), - dtype=bbox_pred.dtype) - score = tvm.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score') - sorted_index = tvm.extern([score.shape], [score], - lambda ins, outs: argsort_ir(ins[0], outs[0]), - dtype='int32') - sorted_bbox = tvm.compute((batch, rpn_pre_nms_top_n, 5), - lambda b, i, j: bbox[b, sorted_index[b, i], j], tag='sorted_bbox') - nms_remove_mask = tvm.extern((batch, rpn_pre_nms_top_n), [sorted_bbox], - lambda ins, outs: nms_ir(ins[0], outs[0], threshold), - dtype='bool') - nms_out = tvm.extern((batch * rpn_post_nms_top_n, 5), [sorted_bbox, nms_remove_mask], - lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]), - dtype=sorted_bbox.dtype) + bbox = te.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info], lambda ins, outs: + predict_bbox_ir(ins[0], ins[1], ins[2], outs[0], scales, ratios, + feature_stride, rpn_min_size, iou_loss), + dtype=bbox_pred.dtype) + score = te.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score') + sorted_index = te.extern([score.shape], [score], + lambda ins, outs: argsort_ir(ins[0], outs[0]), + dtype='int32') + sorted_bbox = te.compute((batch, rpn_pre_nms_top_n, 5), + lambda b, i, j: bbox[b, sorted_index[b, i], j], tag='sorted_bbox') + nms_remove_mask = te.extern((batch, rpn_pre_nms_top_n), [sorted_bbox], + lambda ins, outs: nms_ir(ins[0], outs[0], threshold), + dtype='bool') + nms_out = te.extern((batch * rpn_post_nms_top_n, 5), [sorted_bbox, nms_remove_mask], + lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]), + dtype=sorted_bbox.dtype) return nms_out diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 0b9d5885375e..d885c09e3984 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -18,6 +18,7 @@ """Schedule for reduce operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. 
import tag from .injective import schedule_injective_from_existing @@ -39,13 +40,13 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): # without it, CL_INVALID_WORK_GROUP_SIZE occurred when running test_topi_reduce.py # don't know why num_thread = 16 - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") - thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis((0, num_thread), "threadIdx.x") + thread_y = te.thread_axis((0, num_thread), "threadIdx.y") else: all_reduce = True num_thread = tvm.target.Target.current(allow_none=False).max_num_threads - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") + thread_x = te.thread_axis((0, num_thread), "threadIdx.x") # Fuse and refactor the reduce axis fused_reduce = sch[data_out].fuse(*[sch[data_out].op.reduce_axis[i] @@ -79,7 +80,7 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): else: if is_idx_reduce: spatial_axis = sch[real_output].fuse(*(sch[real_output].op.axis)) - sch[real_output].bind(spatial_axis, tvm.thread_axis("blockIdx.x")) + sch[real_output].bind(spatial_axis, te.thread_axis("blockIdx.x")) sch[temp_idx_input].compute_at(sch[real_output], spatial_axis) sch[temp_val_input].compute_at(sch[real_output], @@ -102,13 +103,13 @@ def schedule_reduce(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - sch = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse_before_reduce(operator): """Internal traverse function""" - if isinstance(operator, tvm.tensor.PlaceholderOp): + if isinstance(operator, tvm.te.PlaceholderOp): return if tag.is_injective(operator.tag): sch[operator].compute_inline() diff --git a/topi/python/topi/cuda/softmax.py b/topi/python/topi/cuda/softmax.py index afd11ea0e71e..ded3ff9cfff8 100644 --- a/topi/python/topi/cuda/softmax.py +++ b/topi/python/topi/cuda/softmax.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" -import tvm +from tvm import te from .injective import schedule_injective_from_existing @@ -34,8 +34,8 @@ def schedule_softmax(outs): sch: Schedule The computation schedule for the op. 
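The reduce schedules below rely on `te` for reduction constructs as well. A minimal sketch of the renamed pieces, splitting and refactoring a sum the way `_schedule_reduce` does (the compute itself is illustrative):

.. code-block:: python

    from tvm import te

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    k = te.reduce_axis((0, n), name="k")             # was tvm.reduce_axis
    B = te.compute((1,), lambda _: te.sum(A[k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)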
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) softmax = outs[0] op_tag = softmax.op.tag @@ -60,8 +60,8 @@ def schedule_softmax(outs): s = schedule_injective_from_existing(s, op.output(0)) else: num_thread = 64 - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis((0, num_thread), "threadIdx.x") if exp is not None: s[exp].bind(exp.op.axis[0], block_x) diff --git a/topi/python/topi/cuda/sort.py b/topi/python/topi/cuda/sort.py index 88ca9d876abc..f9e535e133fa 100644 --- a/topi/python/topi/cuda/sort.py +++ b/topi/python/topi/cuda/sort.py @@ -17,8 +17,8 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Argsort operator """ import tvm +from tvm import te -from tvm import api from .injective import schedule_injective_from_existing from ..math import identity from ..transform import strided_slice @@ -38,8 +38,8 @@ def _schedule_sort(outs): s: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(op): @@ -86,7 +86,7 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): elif i > axis: axis_mul_after *= value max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) values_out = ib.buffer_ptr(values_out) if indices_out is not None: @@ -94,8 +94,8 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): nthread_tx = max_threads nthread_bx = shape[axis] // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("vthread") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("vthread") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "virtual_thread", nthread_bx) tid = bx * nthread_tx + tx @@ -110,12 +110,12 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): values_out[base_idx + tid * axis_mul_after] = data[base_idx + tid * axis_mul_after] if indices_out is not None: indices_out[base_idx + tid * axis_mul_after] = \ - tvm.generic.cast(tid, indices_out.dtype) - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) - idxd = tvm.indexdiv - idxm = tvm.indexmod + tvm.tir.generic.cast(tid, indices_out.dtype) + ib.emit(tvm.tir.Call(None, 'tvm_storage_sync', + tvm.runtime.convert(['shared']), + tvm.tir.Call.Intrinsic, None, 0)) + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod with ib.for_range(0, axis_mul_before) as i: with ib.for_range(0, axis_mul_after) as j: @@ -126,11 +126,11 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): with ib.if_scope(tid < idxd(current_sort_num + 1, 2)): offset = base_idx + (2 * tid + idxm(k, 2)) * axis_mul_after if is_ascend: - cond = tvm.all(2 * tid + idxm(k, 2) + 1 < current_sort_num, - values_out[offset] > values_out[offset + axis_mul_after]) + cond = tvm.tir.all(2 * tid + idxm(k, 2) + 1 < current_sort_num, + values_out[offset] > values_out[offset + axis_mul_after]) 
else: - cond = tvm.all(2 * tid + idxm(k, 2) + 1 < current_sort_num, - values_out[offset] < values_out[offset + axis_mul_after]) + cond = tvm.tir.all(2 * tid + idxm(k, 2) + 1 < current_sort_num, + values_out[offset] < values_out[offset + axis_mul_after]) with ib.if_scope(cond): temp_data[0] = values_out[offset] values_out[offset] = values_out[offset + axis_mul_after] @@ -139,9 +139,9 @@ def sort_ir(data, values_out, axis, is_ascend, indices_out=None): temp_index[0] = indices_out[offset] indices_out[offset] = indices_out[offset + axis_mul_after] indices_out[offset + axis_mul_after] = temp_index[0] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) + ib.emit(tvm.tir.Call(None, 'tvm_storage_sync', + tvm.runtime.convert(['shared']), + tvm.tir.Call.Intrinsic, None, 0)) return ib.get() @@ -185,23 +185,23 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): elif i > axis: axis_mul_after *= value max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) valid_count = ib.buffer_ptr(valid_count) output = ib.buffer_ptr(output) nthread_tx = max_threads nthread_bx = size // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("vthread") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("vthread") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "virtual_thread", nthread_bx) tid = bx * nthread_tx + tx temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") - is_ascend = tvm.make.node("IntImm", dtype="int32", value=is_ascend) + is_ascend = tvm.ir.make_node("IntImm", dtype="int32", value=is_ascend) - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod with ib.for_range(0, axis_mul_before) as i: with ib.for_range(0, axis_mul_after) as j: @@ -213,27 +213,27 @@ def sort_nms_ir(data, valid_count, output, axis, is_ascend): with ib.for_range(0, current_sort_num) as k: with ib.if_scope(tid < idxd(current_sort_num + 1, 2)): offset = base_idx + (2 * tid + idxm(k, 2)) * axis_mul_after - with ib.if_scope(tvm.all(is_ascend == 1, \ - 2 * tid + idxm(k, 2) + 1 < current_sort_num, \ - data[offset] > data[offset + axis_mul_after])): + with ib.if_scope(tvm.tir.all(is_ascend == 1, \ + 2 * tid + idxm(k, 2) + 1 < current_sort_num, \ + data[offset] > data[offset + axis_mul_after])): temp_data[0] = data[offset] data[offset] = data[offset + axis_mul_after] data[offset + axis_mul_after] = temp_data[0] temp_index[0] = output[offset] output[offset] = output[offset + axis_mul_after] output[offset + axis_mul_after] = temp_index[0] - with ib.if_scope(tvm.all(is_ascend == 0, \ - 2 * tid + idxm(k, 2) + 1 < current_sort_num, \ - data[offset] < data[offset + axis_mul_after])): + with ib.if_scope(tvm.tir.all(is_ascend == 0, \ + 2 * tid + idxm(k, 2) + 1 < current_sort_num, \ + data[offset] < data[offset + axis_mul_after])): temp_data[0] = data[offset] data[offset] = data[offset + axis_mul_after] data[offset + axis_mul_after] = temp_data[0] temp_index[0] = output[offset] output[offset] = output[offset + axis_mul_after] output[offset + axis_mul_after] = temp_index[0] - ib.emit(tvm.make.Call(None, 'tvm_storage_sync', - tvm.convert(['shared']), - tvm.expr.Call.Intrinsic, None, 0)) + ib.emit(tvm.tir.Call(None, 'tvm_storage_sync', + tvm.runtime.convert(['shared']), + 
tvm.tir.Call.Intrinsic, None, 0)) return ib.get() @@ -243,10 +243,10 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): Parameters ---------- - data: tvm.Tensor + data: tvm.te.Tensor The input array. - valid_count : tvm.Tensor, optional + valid_count : tvm.te.Tensor, optional The number of valid elements to be sorted. axis : int, optional @@ -260,35 +260,35 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor The output of this function. """ if valid_count is not None: sorted_data = identity(data) - sorted_data_buf = api.decl_buffer(data.shape, data.dtype, "sorted_data_buf", - data_alignment=8) - valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype, - "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4) - out = tvm.extern([data.shape], - [sorted_data, valid_count], - lambda ins, outs: sort_nms_ir( - ins[0], ins[1], outs[0], axis, is_ascend), - dtype="int32", - in_buffers=[sorted_data_buf, valid_count_buf], - out_buffers=[out_buf], - name="argsort_nms_gpu", - tag="argsort_nms_gpu") + sorted_data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "sorted_data_buf", + data_alignment=8) + valid_count_buf = tvm.tir.decl_buffer(valid_count.shape, valid_count.dtype, + "valid_count_buf", data_alignment=4) + out_buf = tvm.tir.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4) + out = te.extern([data.shape], + [sorted_data, valid_count], + lambda ins, outs: sort_nms_ir( + ins[0], ins[1], outs[0], axis, is_ascend), + dtype="int32", + in_buffers=[sorted_data_buf, valid_count_buf], + out_buffers=[out_buf], + name="argsort_nms_gpu", + tag="argsort_nms_gpu") else: - value_buf = api.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) - indices_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) - out = tvm.extern([data.shape, data.shape], - [data], - lambda ins, outs: sort_ir( - ins[0], outs[0], axis, is_ascend, indices_out=outs[1]), - out_buffers=[value_buf, indices_buf], - name="argsort_gpu", - tag="argsort_gpu")[1] + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) + out = te.extern([data.shape, data.shape], + [data], + lambda ins, outs: sort_ir( + ins[0], outs[0], axis, is_ascend, indices_out=outs[1]), + out_buffers=[value_buf, indices_buf], + name="argsort_gpu", + tag="argsort_gpu")[1] return out def schedule_argsort(outs): @@ -312,7 +312,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tensor. k : int, optional @@ -335,31 +335,31 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): Returns ------- - out : tvm.Tensor or List[tvm.Tensor] + out : tvm.te.Tensor or List[tvm.te.Tensor] The computed result. 
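The immediate-node constructors used by `sort_nms_ir` and the NMS kernels move from `tvm.make.node` to `tvm.ir.make_node`; the node type names and keyword arguments are unchanged. A short sketch:

.. code-block:: python

    import tvm

    # was tvm.make.node(...)
    is_ascend = tvm.ir.make_node("IntImm", dtype="int32", value=1)
    threshold = tvm.ir.make_node("FloatImm", dtype="float32", value=0.5)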
""" assert ret_type in ["both", "values", "indices"] ndim = len(data.shape) axis = axis + ndim if axis < 0 else axis assert 0 <= axis < ndim - values_buf = api.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8) - indices_buf = api.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8) + values_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8) + indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8) if ret_type == "values": - output = tvm.extern([data.shape], - [data], - lambda ins, outs: sort_ir( - ins[0], outs[0], axis, is_ascend), - out_buffers=[values_buf], - name="topk_gpu", - tag="topk_gpu") + output = te.extern([data.shape], + [data], + lambda ins, outs: sort_ir( + ins[0], outs[0], axis, is_ascend), + out_buffers=[values_buf], + name="topk_gpu", + tag="topk_gpu") else: - output = tvm.extern([data.shape, data.shape], - [data], - lambda ins, outs: sort_ir( - ins[0], outs[0], axis, is_ascend, indices_out=outs[1]), - out_buffers=[values_buf, indices_buf], - name="topk_gpu", - tag="topk_gpu") + output = te.extern([data.shape, data.shape], + [data], + lambda ins, outs: sort_ir( + ins[0], outs[0], axis, is_ascend, indices_out=outs[1]), + out_buffers=[values_buf, indices_buf], + name="topk_gpu", + tag="topk_gpu") if k < 1: if ret_type == "indices": return output[1] diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 0b3f50ba0031..30784f45a591 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -16,12 +16,10 @@ # under the License. # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args """SSD multibox operators""" -from __future__ import absolute_import as _abs import math import tvm - -from tvm import api -from tvm.intrin import if_then_else, exp +from tvm import te +from tvm.tir import if_then_else, exp import topi @@ -58,11 +56,11 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): """ max_threads = int(math.sqrt( tvm.target.Target.current(allow_none=False).max_num_threads)) - tx = tvm.thread_axis("threadIdx.x") - ty = tvm.thread_axis("threadIdx.y") - bx = tvm.thread_axis("blockIdx.x") - by = tvm.thread_axis("blockIdx.y") - ib = tvm.ir_builder.create() + tx = te.thread_axis("threadIdx.x") + ty = te.thread_axis("threadIdx.y") + bx = te.thread_axis("blockIdx.x") + by = te.thread_axis("blockIdx.y") + ib = tvm.tir.ir_builder.create() p_out = ib.buffer_ptr(out) in_height = data.shape[2] in_width = data.shape[3] @@ -115,7 +113,7 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, c_in, h_in, w_in]] sizes : tuple of float @@ -135,17 +133,17 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ num_sizes = len(sizes) num_ratios = len(ratios) oshape = ( 1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) - out = tvm.extern(oshape, [data], lambda ins, outs: - multibox_prior_ir( - ins[0], outs[0], sizes, ratios, steps, offsets), - tag="multibox_prior") + out = te.extern(oshape, [data], lambda ins, outs: + multibox_prior_ir( + ins[0], outs[0], sizes, ratios, steps, offsets), + tag="multibox_prior") if clip: out = topi.clip(out, 0, 1) return out @@ -182,7 +180,7 @@ def 
transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() cls_prob = ib.buffer_ptr(cls_prob) cls_id = ib.buffer_ptr(temp_cls_id) @@ -190,18 +188,18 @@ def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp temp_valid_count = ib.buffer_ptr(temp_valid_count) score = ib.buffer_ptr(temp_score) - threshold = tvm.make.node("FloatImm", dtype="float32", value=threshold) + threshold = tvm.ir.make_node("FloatImm", dtype="float32", value=threshold) max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = (batch_size * num_anchors) // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) @@ -212,8 +210,8 @@ def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp with ib.for_range(0, num_classes - 1) as k: temp = cls_prob[i * num_classes * num_anchors + (k + 1) * num_anchors + j] cls_id[tid] = if_then_else(temp > score[tid], k + 1, cls_id[tid]) - score[tid] = tvm.max(temp, score[tid]) - with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)): + score[tid] = tvm.te.max(temp, score[tid]) + with ib.if_scope(tvm.tir.all(cls_id[tid] > 0, score[tid] < threshold)): cls_id[tid] = 0 with ib.if_scope(cls_id[tid] > 0): temp_valid_count[tid] = 1 @@ -224,7 +222,7 @@ def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp with ib.for_range(0, num_anchors) as k: with ib.if_scope(k > 0): temp_valid_count[tid * num_anchors + k] += \ - temp_valid_count[tid * num_anchors + k - 1] + temp_valid_count[tid * num_anchors + k - 1] valid_count[i] = temp_valid_count[tid * num_anchors + num_anchors - 1] return ib.get() @@ -289,12 +287,12 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, oy = py * vy * ah + ay ow = exp(pw * vw) * aw / 2.0 oh = exp(ph * vh) * ah / 2.0 - return tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox - ow)), ox - ow), \ - tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy - oh)), oy - oh), \ - tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, ox + ow)), ox + ow), \ - tvm.if_then_else(clip, tvm.max(0.0, tvm.min(1.0, oy + oh)), oy + oh) + return tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, ox - ow)), ox - ow), \ + tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, oy - oh)), oy - oh), \ + tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, ox + ow)), ox + ow), \ + tvm.tir.if_then_else(clip, tvm.te.max(0.0, tvm.te.min(1.0, oy + oh)), oy + oh) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() loc_pred = ib.buffer_ptr(loc_pred) anchor = ib.buffer_ptr(anchor) @@ -306,14 +304,14 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = (batch_size * num_anchors) // max_threads + 1 - tx = tvm.thread_axis("threadIdx.x") - bx = tvm.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + bx = 
te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) @@ -348,13 +346,13 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, \ Parameters ---------- - cls_prob : tvm.Tensor + cls_prob : tvm.te.Tensor Class probabilities. - loc_pred : tvm.Tensor + loc_pred : tvm.te.Tensor Location regression predictions. - anchor : tvm.Tensor + anchor : tvm.te.Tensor Prior anchor boxes. clip : boolean @@ -368,12 +366,12 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, \ Returns ------- - ret : tuple of tvm.Tensor composed of + ret : tuple of tvm.te.Tensor composed of - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) - valid_count : tvm.Tensor + valid_count : tvm.te.Tensor 1-D tensor with shape (batch_size,), number of valid anchor boxes. """ batch_size = cls_prob.shape[0] @@ -383,40 +381,40 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, \ valid_count_dtype = "int32" out_loc_dtype = loc_pred.dtype - valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, - "valid_count_buf", data_alignment=4) - loc_pred_buf = api.decl_buffer(loc_pred.shape, loc_pred.dtype, - "loc_pred_buf", data_alignment=8) - anchor_buf = api.decl_buffer(anchor.shape, anchor.dtype, - "anchor_buf", data_alignment=8) + valid_count_buf = tvm.tir.decl_buffer((batch_size,), valid_count_dtype, + "valid_count_buf", data_alignment=4) + loc_pred_buf = tvm.tir.decl_buffer(loc_pred.shape, loc_pred.dtype, + "loc_pred_buf", data_alignment=8) + anchor_buf = tvm.tir.decl_buffer(anchor.shape, anchor.dtype, + "anchor_buf", data_alignment=8) - temp_valid_count_buf = api.decl_buffer( + temp_valid_count_buf = tvm.tir.decl_buffer( (batch_size, num_anchors,), valid_count_dtype, "temp_valid_count", data_alignment=8) - temp_cls_id_buf = api.decl_buffer( + temp_cls_id_buf = tvm.tir.decl_buffer( (batch_size, num_anchors,), valid_count_dtype, "temp_cls_id", data_alignment=8) - temp_score_buf = api.decl_buffer( + temp_score_buf = tvm.tir.decl_buffer( (batch_size, num_anchors,), cls_prob.dtype, "temp_score", data_alignment=8) valid_count, temp_valid_count, temp_cls_id, temp_score = \ - tvm.extern([(batch_size,), (batch_size, num_anchors,), (batch_size, num_anchors,), \ - (batch_size, num_anchors,)], [cls_prob], - lambda ins, outs: transform_loc_pre( - ins[0], outs[0], outs[1], outs[2], outs[3], threshold), - dtype=[valid_count_dtype, valid_count_dtype, valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, temp_valid_count_buf, \ - temp_cls_id_buf, temp_score_buf], - tag="multibox_transform_loc_phase_one") + te.extern([(batch_size,), (batch_size, num_anchors,), (batch_size, num_anchors,), \ + (batch_size, num_anchors,)], [cls_prob], + lambda ins, outs: transform_loc_pre( + ins[0], outs[0], outs[1], outs[2], outs[3], threshold), + dtype=[valid_count_dtype, valid_count_dtype, valid_count_dtype, cls_prob.dtype], + out_buffers=[valid_count_buf, temp_valid_count_buf, \ + temp_cls_id_buf, temp_score_buf], + tag="multibox_transform_loc_phase_one") out_loc = \ - tvm.extern([oshape], - [loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score], - lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, variances, \ - batch_size, num_anchors), - 
in_buffers=[loc_pred_buf, anchor_buf, temp_valid_count_buf, \ - temp_cls_id_buf, temp_score_buf], - dtype=[out_loc_dtype], - tag="multibox_transform_loc") + te.extern([oshape], + [loc_pred, anchor, temp_valid_count, temp_cls_id, temp_score], + lambda ins, outs: transform_loc_ir( + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, variances, \ + batch_size, num_anchors), + in_buffers=[loc_pred_buf, anchor_buf, temp_valid_count_buf, \ + temp_cls_id_buf, temp_score_buf], + dtype=[out_loc_dtype], + tag="multibox_transform_loc") return [out_loc, valid_count] @@ -427,13 +425,13 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm Parameters ---------- - cls_prob : tvm.Tensor + cls_prob : tvm.te.Tensor Class probabilities. - loc_pred : tvm.Tensor + loc_pred : tvm.te.Tensor Location regression predictions. - anchor : tvm.Tensor + anchor : tvm.te.Tensor Prior anchor boxes. clip : boolean @@ -456,7 +454,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py index 8f46d278d950..468e2cd21fa8 100644 --- a/topi/python/topi/cuda/tensor_intrin.py +++ b/topi/python/topi/cuda/tensor_intrin.py @@ -17,6 +17,7 @@ """Tensor intrinsics on CUDA.""" #pylint: disable=invalid-name import tvm +from tvm import te def dp4a(x_scope='local', y_scope='local', z_scope='local'): @@ -39,12 +40,12 @@ def dp4a(x_scope='local', y_scope='local', z_scope='local'): """ n = 4 # dp4a requires operands packed by 4 - x = tvm.placeholder((n,), name='x', dtype='int8') - y = tvm.placeholder((n,), name='y', dtype='int8') + x = te.placeholder((n,), name='x', dtype='int8') + y = te.placeholder((n,), name='y', dtype='int8') - k = tvm.reduce_axis((0, n), name='rc') + k = te.reduce_axis((0, n), name='rc') - z = tvm.compute((1,), lambda i: tvm.sum( + z = te.compute((1,), lambda i: te.sum( x[k].astype('int32') * y[k].astype('int32'), axis=[k])) def _intrin_func(ins, outs): @@ -55,24 +56,24 @@ def _instr(index): if index == 1: return zz.vstore(0, 0) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() vec_x = xx.vload(0, dtype='int8x4') vec_y = yy.vload(0, dtype='int8x4') prev_z = 0 if index == 0 else zz.vload(0) - new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) + new_z = tvm.tir.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z) ib.emit(zz.vstore(0, new_z)) return ib.get() return _instr(0), _instr(1), _instr(2) # body, reset, update - with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: + with tvm.target.build_config(data_alignment=4, offset_factor=1) as cfg: scopes = {x: x_scope, y: y_scope, z: z_scope} - binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, - data_alignment=cfg.data_alignment, - offset_factor=cfg.offset_factor, - scope=scopes[t]) for t in [x, y, z]} + binds = {t: tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor, + scope=scopes[t]) for t in [x, y, z]} - return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds) + return te.decl_tensor_intrin(z.op, _intrin_func, binds=binds) diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 8666c22774de..eb49328c3da3 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ 
-18,6 +18,7 @@ """Schedule for vision operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import cpp from .. import tag from .pooling import schedule_pool @@ -25,8 +26,8 @@ def _default_schedule(outs): """Default schedule for gpu.""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(op): if tag.is_broadcast(op.tag) or op.tag in ['bbox_score', 'sorted_bbox']: diff --git a/topi/python/topi/generic/__init__.py b/topi/python/topi/generic/__init__.py index b9db1560e588..bf45bc30a42d 100644 --- a/topi/python/topi/generic/__init__.py +++ b/topi/python/topi/generic/__init__.py @@ -28,7 +28,7 @@ # create schedule that dispatches to topi.cuda.schedule_injective with tvm.target.create("cuda"): - s = tvm.generic.schedule_injective(outs) + s = tvm.tir.generic.schedule_injective(outs) """ from __future__ import absolute_import as _abs diff --git a/topi/python/topi/generic/conv2d.py b/topi/python/topi/generic/conv2d.py index 08bb06c6f855..69984a169ac6 100644 --- a/topi/python/topi/generic/conv2d.py +++ b/topi/python/topi/generic/conv2d.py @@ -17,8 +17,7 @@ # pylint: disable=invalid-name, unused-variable, too-many-locals # pylint: disable=unused-argument, redefined-builtin """Generic convolution schedules""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..util import get_const_tuple @@ -123,7 +122,7 @@ def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data_vec, kernel_vec, conv_out, _, _, _, _, oc_bn = get_const_tuple(conv_out.shape) # schedule pad - if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + if isinstance(s[data_vec].op, te.tensor.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) @@ -136,7 +135,7 @@ def schedule_conv_NCHWc_cpu_common_int8(s, cfg, data_vec, kernel_vec, conv_out, # this part will be folded during Relay fold_constant pass. s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") - elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + elif isinstance(kernel_vec.op, te.tensor.ComputeOp) and \ kernel_vec.name == 'kernel_vec': # data and kernel are not pre-computed, schedule layout transform here. # this should only be used by x86 conv2d_nchw, which is for @@ -213,7 +212,7 @@ def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data_vec, kernel_vec, conv_out, _, _, _, _, oc_bn = get_const_tuple(conv_out.shape) # schedule pad - if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + if isinstance(s[data_vec].op, te.tensor.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) @@ -226,7 +225,7 @@ def schedule_conv_NCHWc_cpu_1x1_int8(s, cfg, data_vec, kernel_vec, conv_out, # this part will be folded during Relay fold_constant pass. 
s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") - elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + elif isinstance(kernel_vec.op, te.tensor.ComputeOp) and \ kernel_vec.name == 'kernel_vec': # data and kernel are not pre-computed, schedule layout transform here. # this should only be used by x86 conv2d_nchw, which is for diff --git a/topi/python/topi/generic/extern.py b/topi/python/topi/generic/extern.py index 977c53763a52..3b4feb771876 100644 --- a/topi/python/topi/generic/extern.py +++ b/topi/python/topi/generic/extern.py @@ -16,8 +16,6 @@ # under the License. # pylint: disable=invalid-name """generic declaration and schedules.""" -from __future__ import absolute_import as _abs - import tvm from .. import cpp diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 6f1013c06dbd..50de7988be10 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -55,10 +56,10 @@ def schedule_injective(outs): target = tvm.target.Target.current(allow_none=False) if target.target_name != "llvm": raise RuntimeError("schedule_injective not registered for '%s'" % target) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs x = outs[0] - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) schedule_injective_from_existing(s, x) return s diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index ba50a8b88cb4..25b550115200 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -16,19 +16,19 @@ # under the License. # pylint: disable=invalid-name,unused-argument """Generic nn operators""" -from __future__ import absolute_import as _abs import tvm +from tvm import te def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.Target.current(allow_none=False) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs if target.target_name not in ("llvm", "c"): raise RuntimeError("schedule not registered for '%s'" % target) - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) if auto_inline: x = outs[0] - tvm.schedule.AutoInlineInjective(s) + te.schedule.AutoInlineInjective(s) s[x].fuse(s[x].op.axis) return s @@ -187,7 +187,7 @@ def schedule_conv2d_winograd_weight_transform(outs): """ # Typically this is computed in PreCompute pass # so we make a schedule here for cpu llvm - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) output = outs[0] _, G = s[output].op.input_tensors s[G].compute_inline() @@ -230,7 +230,7 @@ def schedule_conv2d_winograd_nnpack_weight_transform(outs): The computation schedule for the op. 
""" # Typically this is computed in PreCompute pass - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) return s diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index d6e80df9b89d..3935250bcbbd 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -18,18 +18,19 @@ """Generic vision operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import cpp def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.Target.current(allow_none=False) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs if target.target_name != "llvm": raise RuntimeError("schedule not registered for '%s'" % target) - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) if auto_inline: x = outs[0] - tvm.schedule.AutoInlineInjective(s) + te.schedule.AutoInlineInjective(s) s[x].fuse(s[x].op.axis) return s diff --git a/topi/python/topi/generic_op_impl.py b/topi/python/topi/generic_op_impl.py index b4b719fb35d4..f4695d3db3ee 100644 --- a/topi/python/topi/generic_op_impl.py +++ b/topi/python/topi/generic_op_impl.py @@ -16,8 +16,8 @@ # under the License. """Implementation of generic operators in the presence of Tensor""" # pylint: disable=invalid-name, too-many-arguments -from __future__ import absolute_import as _abs import tvm +from tvm import te from . import broadcast as _broadcast from . import math as _math @@ -75,11 +75,11 @@ def _tensor_bop_impl(lhs, rhs): Returns ------- - ret : tvm.Tensor (if at least one operand is non-zero-rank Tensor) + ret : tvm.te.Tensor (if at least one operand is non-zero-rank Tensor) tvm.Expr (otherwise) The result of {op} operation. """ - if not isinstance(lhs, tvm.tensor.Tensor) and not isinstance(rhs, tvm.tensor.Tensor): + if not isinstance(lhs, te.tensor.Tensor) and not isinstance(rhs, te.tensor.Tensor): return orig_bop(lhs, rhs) return broadcast_bop(lhs, rhs) _tensor_bop_impl.__doc__ = _tensor_bop_impl.__doc__.format(op=name) @@ -90,12 +90,12 @@ def _bind_generic_ops(): """Bind generic operators for Tensor.""" # Check __op_priority__ to make sure the binding happens only once. 
__op_priority__ = 1 - if __op_priority__ > tvm.generic.__op_priority__: - tvm.generic.__op_priority__ = __op_priority__ - tvm.generic.add = _make_bop(_broadcast.add, tvm.generic.add) - tvm.generic.subtract = _make_bop(_broadcast.subtract, tvm.generic.subtract) - tvm.generic.multiply = _make_bop(_broadcast.multiply, tvm.generic.multiply) - tvm.generic.divide = _make_bop(_broadcast.divide, tvm.generic.divide) - tvm.generic.cast = _math.cast + if __op_priority__ > tvm.tir.generic.__op_priority__: + tvm.tir.generic.__op_priority__ = __op_priority__ + tvm.tir.generic.add = _make_bop(_broadcast.add, tvm.tir.generic.add) + tvm.tir.generic.subtract = _make_bop(_broadcast.subtract, tvm.tir.generic.subtract) + tvm.tir.generic.multiply = _make_bop(_broadcast.multiply, tvm.tir.generic.multiply) + tvm.tir.generic.divide = _make_bop(_broadcast.divide, tvm.tir.generic.divide) + tvm.tir.generic.cast = _math.cast _bind_generic_ops() diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py index d4ccf41ed26d..6d0c6f4928ec 100644 --- a/topi/python/topi/hls/injective.py +++ b/topi/python/topi/hls/injective.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" import tvm +from tvm import te def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -35,7 +36,7 @@ def schedule_injective_from_existing(sch, out): """ fused = sch[out].fuse(*sch[out].op.axis) px, x = sch[out].split(fused, nparts=1) - sch[out].bind(px, tvm.thread_axis("pipeline")) + sch[out].bind(px, te.thread_axis("pipeline")) return sch def schedule_injective(outs): @@ -52,9 +53,9 @@ def schedule_injective(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) for out in outs: schedule_injective_from_existing(s, out) return s diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py index 06cf3298682d..3d7ff82085c7 100644 --- a/topi/python/topi/hls/nn.py +++ b/topi/python/topi/hls/nn.py @@ -18,13 +18,14 @@ """HLS nn operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import tag def _schedule_conv2d(outs): - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) def traverse(OP): """Internal traverse function""" @@ -33,7 +34,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) # schedule conv2d elif OP.tag.find("conv2d") >= 0: @@ -47,7 +48,7 @@ def traverse(OP): traverse(outs[0].op) px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1) - s[outs[0]].bind(px, tvm.thread_axis("pipeline")) + s[outs[0]].bind(px, te.thread_axis("pipeline")) return s @@ -199,9 +200,9 @@ def schedule_reduce(outs): sch: Schedule The computation schedule for the op. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) def traverse(OP): """Internal traverse function""" @@ -210,7 +211,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) elif OP.tag in ["comm_reduce", "comm_reduce_idx"]: if OP.tag == "comm_reduce": @@ -227,7 +228,7 @@ def traverse(OP): fused = s[outs[0]].fuse() px, x = s[outs[0]].split(fused, nparts=1) - s[outs[0]].bind(px, tvm.thread_axis("pipeline")) + s[outs[0]].bind(px, te.thread_axis("pipeline")) return s @@ -245,9 +246,9 @@ def schedule_softmax(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) softmax = outs[0] @@ -271,7 +272,7 @@ def schedule_softmax(outs): s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1]) px, x = s[softmax].split(softmax.op.axis[0], nparts=1) - s[softmax].bind(px, tvm.thread_axis("pipeline")) + s[softmax].bind(px, te.thread_axis("pipeline")) return s @@ -289,9 +290,9 @@ def schedule_dense(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) def traverse(OP): """Internal traverse function""" @@ -300,7 +301,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) # schedule dense elif OP.tag == 'dense': @@ -314,7 +315,7 @@ def traverse(OP): traverse(outs[0].op) px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1) - s[outs[0]].bind(px, tvm.thread_axis("pipeline")) + s[outs[0]].bind(px, te.thread_axis("pipeline")) return s @@ -332,9 +333,9 @@ def schedule_pool(outs, layout): sch: Schedule The computation schedule for the op. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) def traverse(OP): """Internal traverse function""" @@ -343,7 +344,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) # schedule pool elif OP.tag.startswith('pool'): @@ -357,7 +358,7 @@ def traverse(OP): traverse(outs[0].op) px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1) - s[outs[0]].bind(px, tvm.thread_axis("pipeline")) + s[outs[0]].bind(px, te.thread_axis("pipeline")) return s @@ -375,9 +376,9 @@ def schedule_adaptive_pool(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + tvm.te.schedule.AutoInlineInjective(s) def traverse(OP): """Internal traverse function""" @@ -386,7 +387,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) # schedule global_pool elif OP.tag.startswith('adaptive_pool'): @@ -400,5 +401,5 @@ def traverse(OP): traverse(outs[0].op) px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1) - s[outs[0]].bind(px, tvm.thread_axis("pipeline")) + s[outs[0]].bind(px, te.thread_axis("pipeline")) return s diff --git a/topi/python/topi/image/resize.py b/topi/python/topi/image/resize.py index 0c02867ef54d..d901babc835b 100644 --- a/topi/python/topi/image/resize.py +++ b/topi/python/topi/image/resize.py @@ -18,6 +18,7 @@ """TVM operator input resize compute.""" from __future__ import absolute_import import tvm +from tvm import te from topi.util import nchw_pack_layout, nchw_xc_layout from .. import tag @@ -42,8 +43,8 @@ def get_2d_indices(indices, layout='NCHW'): def get_2d_pixel(data, layout, boxes, image_height, image_width, n, c, y, x, cc, ib, ic): """ Get 2d pixel """ if boxes is None: - y = tvm.max(tvm.min(y, image_height - 1), 0) - x = tvm.max(tvm.min(x, image_width - 1), 0) + y = tvm.te.max(tvm.te.min(y, image_height - 1), 0) + x = tvm.te.max(tvm.te.min(x, image_width - 1), 0) if layout == 'NHWC': return data(n, y, x, c).astype('float') if layout == 'NCHW': @@ -70,7 +71,7 @@ def resize_nearest_neighbor(indices, data, image_height, image_width, indices : tuple The indices of input data - data : tvm.Tensor + data : tvm.te.Tensor inputs is a 4-D tensor with shape [batch, channel, in_height, in_width] or [batch, in_height, in_width, channel] @@ -87,11 +88,11 @@ def resize_nearest_neighbor(indices, data, image_height, image_width, target_width : integer The target resized image width - boxes : tvm.Tensor, optional + boxes : tvm.te.Tensor, optional A 2-D tensor of shape [num_boxes, 4]. Each row of the tensor specifies the coordinates of a box. - box_indices : tvm.Tensor, optional + box_indices : tvm.te.Tensor, optional A 1-D tensor of shape [num_boxes], box_indices[i] specifies the data that the i-th box refers to. 
@@ -150,29 +151,29 @@ def _cast_output(value, data_dtype="float32", out_dtype=None): in_x = w_scale * x if coordinate_transformation_mode == "align_corners" or boxes is not None: - closest_x_index = tvm.round(in_x).astype("int32") - closest_y_index = tvm.round(in_y).astype("int32") + closest_x_index = te.round(in_x).astype("int32") + closest_y_index = te.round(in_y).astype("int32") else: # Add epsilon to floor to prevent gpu rounding errors. epsilon = 1e-5 - closest_y_index = tvm.floor(in_y + epsilon).astype('int32') - closest_x_index = tvm.floor(in_x + epsilon).astype('int32') + closest_y_index = te.floor(in_y + epsilon).astype('int32') + closest_x_index = te.floor(in_x + epsilon).astype('int32') value = get_2d_pixel(data, layout, boxes, image_height, image_width, box_idx, c, closest_y_index, closest_x_index, cc, inum, ic) if extrapolation_value is not None: - out = tvm.if_then_else(in_y < 0, - extrapolation_value, - tvm.if_then_else(in_y > image_height - 1, - extrapolation_value, - value)) + out = tvm.tir.if_then_else(in_y < 0, + extrapolation_value, + tvm.tir.if_then_else(in_y > image_height - 1, + extrapolation_value, + value)) # use extrapolation_value if in_x is out of boundary - value = tvm.if_then_else(in_x < 0, - extrapolation_value, - tvm.if_then_else(in_x > image_width - 1, - extrapolation_value, - out)) + value = tvm.tir.if_then_else(in_x < 0, + extrapolation_value, + tvm.tir.if_then_else(in_x > image_width - 1, + extrapolation_value, + out)) return _cast_output(value, data.dtype, out_dtype=out_dtype) @@ -191,7 +192,7 @@ def resize_bilinear(indices, data, image_height, image_width, indices : tuple The indices of input data - data : tvm.Tensor + data : tvm.te.Tensor inputs is a 4-D tensor with shape [batch, channel, in_height, in_width] or [batch, in_height, in_width, channel] @@ -208,11 +209,11 @@ def resize_bilinear(indices, data, image_height, image_width, target_width : integer The target resized image width - boxes : tvm.Tensor, optional + boxes : tvm.te.Tensor, optional A 2-D tensor of shape [num_boxes, 4]. Each row of the tensor specifies the coordinates of a box. - box_indices : tvm.Tensor, optional + box_indices : tvm.te.Tensor, optional A 1-D tensor of shape [num_boxes], box_indices[i] specifies the data that the i-th box refers to. 
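NOTE (illustrative, not part of the patch): the extrapolation logic above
keeps its shape; only the helper moves to tvm.tir.if_then_else, which selects
lazily in the lowered IR rather than branching in Python. A standalone sketch
of the same guard pattern on a 1-D tensor (data, fill, and the bounds are
invented for illustration):

    import tvm
    from tvm import te

    n = te.var("n")
    data = te.placeholder((n,), name="data", dtype="float32")
    fill = tvm.tir.const(-1.0, "float32")  # value used outside the bounds
    out = te.compute(
        (n,),
        lambda i: tvm.tir.if_then_else(
            i < 1, fill,
            tvm.tir.if_then_else(i > n - 2, fill, data[i])),
        name="guarded")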
@@ -279,12 +280,12 @@ def _lerp(A, B, t): in_y = h_scale * y in_x = w_scale * x - top_y_index = tvm.floor(in_y).astype('int32') - bottom_y_index = tvm.ceil(in_y).astype('int32') + top_y_index = te.floor(in_y).astype('int32') + bottom_y_index = te.ceil(in_y).astype('int32') y_lerp = in_y - top_y_index - left_x_index = tvm.floor(in_x).astype('int32') - right_x_index = tvm.ceil(in_x).astype('int32') + left_x_index = te.floor(in_x).astype('int32') + right_x_index = te.ceil(in_x).astype('int32') x_lerp = in_x - left_x_index top_left = get_2d_pixel(data, layout, boxes, image_height, image_width, @@ -302,16 +303,16 @@ def _lerp(A, B, t): # use extrapolation_value if in_y/in_x is out of boundary if extrapolation_value is not None: - out = tvm.if_then_else(in_y < 0, - extrapolation_value, - tvm.if_then_else(in_y > image_height - 1, - extrapolation_value, - value)) - value = tvm.if_then_else(in_x < 0, - extrapolation_value, - tvm.if_then_else(in_x > image_width - 1, - extrapolation_value, - out)) + out = tvm.tir.if_then_else(in_y < 0, + extrapolation_value, + tvm.tir.if_then_else(in_y > image_height - 1, + extrapolation_value, + value)) + value = tvm.tir.if_then_else(in_x < 0, + extrapolation_value, + tvm.tir.if_then_else(in_x > image_width - 1, + extrapolation_value, + out)) return _cast_output(value, data.dtype, out_dtype=out_dtype) @@ -329,7 +330,7 @@ def resize_bicubic(indices, data, image_height, image_width, indices : tuple The indices of input data - data : tvm.Tensor + data : tvm.te.Tensor inputs is a 4-D tensor with shape [batch, channel, in_height, in_width] or [batch, in_height, in_width, channel] @@ -346,11 +347,11 @@ def resize_bicubic(indices, data, image_height, image_width, target_width : integer The target resized image width - boxes : tvm.Tensor, optional + boxes : tvm.te.Tensor, optional A 2-D tensor of shape [num_boxes, 4]. Each row of the tensor specifies the coordinates of a box. - box_indices : tvm.Tensor, optional + box_indices : tvm.te.Tensor, optional A 1-D tensor of shape [num_boxes], box_indices[i] specifies the data that the i-th box refers to. 
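NOTE (illustrative, not part of the patch): the bilinear hunk above only
renames tvm.floor/tvm.ceil to te.floor/te.ceil; the interpolation itself is
the usual lerp between the two integer neighbours of a real coordinate,
sketched here with invented names (in_x, t):

    from tvm import te

    in_x = te.var("in_x", dtype="float32")
    left = te.floor(in_x).astype("int32")   # lower neighbour index
    right = te.ceil(in_x).astype("int32")   # upper neighbour index
    t = in_x - te.floor(in_x)               # fractional weight in [0, 1)

    def lerp(a, b, t):
        # returns a when t == 0 and b when t == 1
        return a * (1.0 - t) + b * t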
@@ -421,11 +422,11 @@ def _cast_output(value, data_dtype="float32", out_dtype=None): in_y = h_scale * y in_x = w_scale * x - xint = tvm.floor(in_x).astype('int32') - xfract = in_x - tvm.floor(in_x) + xint = te.floor(in_x).astype('int32') + xfract = in_x - te.floor(in_x) - yint = tvm.floor(in_y).astype('int32') - yfract = in_y - tvm.floor(in_y) + yint = te.floor(in_y).astype('int32') + yfract = in_y - te.floor(in_y) # 1st row p00 = _get_pixel(data, layout, boxes, image_height, image_width, @@ -476,16 +477,16 @@ def _cast_output(value, data_dtype="float32", out_dtype=None): # use extrapolation_value if in_y/in_x is out of boundary if extrapolation_value is not None: - out = tvm.if_then_else(in_y < 0, - extrapolation_value, - tvm.if_then_else(in_y > image_height - 1, - extrapolation_value, - value)) - value = tvm.if_then_else(in_x < 0, - extrapolation_value, - tvm.if_then_else(in_x > image_width - 1, - extrapolation_value, - out)) + out = tvm.tir.if_then_else(in_y < 0, + extrapolation_value, + tvm.tir.if_then_else(in_y > image_height - 1, + extrapolation_value, + value)) + value = tvm.tir.if_then_else(in_x < 0, + extrapolation_value, + tvm.tir.if_then_else(in_x > image_width - 1, + extrapolation_value, + out)) return _cast_output(value, data.dtype, out_dtype=out_dtype) @@ -495,7 +496,7 @@ def resize(data, size, layout="NCHW", method="bilinear", Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor inputs is a 4-D tensor with shape [batch, channel, in_height, in_width] or [batch, in_height, in_width, channel] @@ -520,7 +521,7 @@ def resize(data, size, layout="NCHW", method="bilinear", Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, channel, in_height*scale, in_width*scale] or [batch, in_height*scale, in_width*scale, channel] or 5-D with shape [batch, channel-major, in_height*scale, in_width*scale, channel-minor] @@ -548,21 +549,21 @@ def _nearest_neighbor(*indices): return resize_nearest_neighbor(indices, data, in_h, in_w, size[0], size[1], layout=layout, coordinate_transformation_mode= \ - coordinate_transformation_mode, + coordinate_transformation_mode, out_dtype=out_dtype) def _bilinear(*indices): return resize_bilinear(indices, data, in_h, in_w, size[0], size[1], layout=layout, coordinate_transformation_mode= \ - coordinate_transformation_mode, + coordinate_transformation_mode, out_dtype=out_dtype) def _bicubic(*indices): return resize_bicubic(indices, data, in_h, in_w, size[0], size[1], layout, coordinate_transformation_mode= \ - coordinate_transformation_mode, + coordinate_transformation_mode, out_dtype=out_dtype) # Determine which interpolation method to use then run it. @@ -575,7 +576,7 @@ def _bicubic(*indices): else: raise ValueError('%s method is not supported.' % method) - return tvm.compute(output_shape, compute_func, name='resize', tag=tag.INJECTIVE) + return te.compute(output_shape, compute_func, name='resize', tag=tag.INJECTIVE) def crop_and_resize(data, boxes, box_indices, crop_size, layout="NCHW", @@ -584,16 +585,16 @@ def crop_and_resize(data, boxes, box_indices, crop_size, layout="NCHW", Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor inputs is a 4-D tensor with shape [batch, channel, in_height, in_width] or [batch, in_height, in_width, channel] - boxes : tvm.Tensor + boxes : tvm.te.Tensor A 2-D tensor of shape [num_boxes, 4]. Each row of the tensor specifies the coordinates of a box. 
- box_indices : tvm.Tensor + box_indices : tvm.te.Tensor A 1-D tensor of shape [num_boxes], box_indices[i] specifies the data that the i-th box refers to. @@ -614,7 +615,7 @@ def crop_and_resize(data, boxes, box_indices, crop_size, layout="NCHW", Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [num_boxes, channel, crop_height, crop_width] or [num_boxes, crop_height, crop_width, channel] """ @@ -656,7 +657,7 @@ def _nearest_neighbor(*indices): else: raise ValueError('%s method is not supported.' % method) - return tvm.compute(output_shape, compute_func, name='crop_and_resize', tag=tag.INJECTIVE) + return te.compute(output_shape, compute_func, name='crop_and_resize', tag=tag.INJECTIVE) @@ -665,7 +666,7 @@ def resize3d(data, size, layout="NCDHW", method="nearest_neighbor", """Perform resize operation on the data. Parameters ---------- - inputs: tvm.Tensor + inputs: tvm.te.Tensor inputs is a 5-D tensor with shape [batch, channel, in_depth, in_height, in_width] or [batch, in_depth, in_height, in_width, channel] @@ -684,7 +685,7 @@ def resize3d(data, size, layout="NCDHW", method="nearest_neighbor", Type to return. If left None will be same as input type. Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, channel, in_depth*scale, in_height*scale, in_width*scale] or [batch, in_depth*scale, in_height*scale, in_width*scale, channel] or 5-D with shape [batch, channel-major, in_depth*scale, in_height*scale, in_width*scale, @@ -716,9 +717,9 @@ def resize3d(data, size, layout="NCDHW", method="nearest_neighbor", coordinate_transformation_mode)) def _get_pixel(n, c, z, y, x, cc): - z = tvm.max(tvm.min(z, in_d - 1), 0) - y = tvm.max(tvm.min(y, in_h - 1), 0) - x = tvm.max(tvm.min(x, in_w - 1), 0) + z = tvm.te.max(tvm.te.min(z, in_d - 1), 0) + y = tvm.te.max(tvm.te.min(y, in_h - 1), 0) + x = tvm.te.max(tvm.te.min(x, in_w - 1), 0) if layout == 'NDHWC': return data(n, z, y, x, c).astype('float') if layout == 'NCDHW': @@ -754,15 +755,15 @@ def _nearest_neighbor(*indices): in_x = x_ratio * x if coordinate_transformation_mode == "align_corners": - zint = tvm.round(in_z).astype('int32') - yint = tvm.round(in_y).astype('int32') - xint = tvm.round(in_x).astype('int32') + zint = te.round(in_z).astype('int32') + yint = te.round(in_y).astype('int32') + xint = te.round(in_x).astype('int32') elif coordinate_transformation_mode in ["asymmetric", "half_pixel"]: # Add epsilon to floor to prevent gpu rounding errors. 
epsilon = 1e-5 - zint = tvm.floor(in_z + epsilon).astype('int32') - yint = tvm.floor(in_y + epsilon).astype('int32') - xint = tvm.floor(in_x + epsilon).astype('int32') + zint = te.floor(in_z + epsilon).astype('int32') + yint = te.floor(in_y + epsilon).astype('int32') + xint = te.floor(in_x + epsilon).astype('int32') else: raise ValueError("Unsupported coordinate_transformation_mode: {}".format( coordinate_transformation_mode)) @@ -785,14 +786,14 @@ def _trilinear(*indices): in_y = y_ratio * y in_x = x_ratio * x - zint = tvm.floor(in_z).astype('int32') - zfract = in_z - tvm.floor(in_z) + zint = te.floor(in_z).astype('int32') + zfract = in_z - te.floor(in_z) - xint = tvm.floor(in_x).astype('int32') - xfract = in_x - tvm.floor(in_x) + xint = te.floor(in_x).astype('int32') + xfract = in_x - te.floor(in_x) - yint = tvm.floor(in_y).astype('int32') - yfract = in_y - tvm.floor(in_y) + yint = te.floor(in_y).astype('int32') + yfract = in_y - te.floor(in_y) p000 = _get_pixel(n, c, zint, yint, xint, cc) p001 = _get_pixel(n, c, zint, yint, xint + 1, cc) @@ -820,4 +821,4 @@ def _trilinear(*indices): else: raise ValueError('%s method is not supported.' % method) - return tvm.compute(output_shape, compute_func, name='resize3d', tag=tag.INJECTIVE) + return te.compute(output_shape, compute_func, name='resize3d', tag=tag.INJECTIVE) diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index 8993063b16e3..e4ea196ac84f 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -20,6 +20,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity @@ -132,14 +133,14 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None xo, xi = s[tensor].split(x, x_factor) s[tensor].reorder(zo, yo, xo, zi, yi, xi) - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) + thread_z = te.thread_axis((0, z_factor), "threadIdx.z") + thread_y = te.thread_axis((0, y_factor), "threadIdx.y") + thread_x = te.thread_axis((0, x_factor), "threadIdx.x") + s[tensor].bind(zo, te.thread_axis("blockIdx.z")) s[tensor].bind(zi, thread_z) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) s[tensor].bind(yi, thread_y) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) s[tensor].bind(xi, thread_x) return xi, thread_z, thread_y, thread_x @@ -151,11 +152,11 @@ def _pack_data(data, kernel, ic_bn, oc_bn): ic_chunk = ic // ic_bn oc_chunk = oc // oc_bn - data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), - lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], - name="data_vec") + data = te.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - kernel = tvm.compute( + kernel = te.compute( (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), lambda occ, icc, k_h, k_w, icb, ocb: kernel[occ * oc_bn + ocb, @@ -172,10 +173,10 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 5-D with shape [num_filter, in_channel, 
filter_height, filter_width, nnum_filter_vec] stride : int or a list/tuple of two ints @@ -189,7 +190,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ if len(data.shape) == 5: @@ -215,9 +216,9 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, _create_schedule_template(cfg, data_shape, kernel_shape, strides, padding, dilation) if cfg.is_fallback: - _get_default_config(cfg, tvm.placeholder((batch, in_channel, ih, iw), dtype=data.dtype), - tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), - dtype=kernel.dtype), + _get_default_config(cfg, te.placeholder((batch, in_channel, ih, iw), dtype=data.dtype), + te.placeholder((num_filter, in_channel, kernel_height, kernel_width), + dtype=kernel.dtype), strides, padding, out_dtype) ic_bn = cfg["tile_ic"].val if hasattr(cfg["tile_ic"], "val") else cfg["tile_ic"].size[-1] @@ -232,9 +233,9 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_width = simplify((iw - kernel_width + pad_left + pad_right) // stride_w + 1) oshape = (batch, out_channel // oc_bn, out_height, out_width, oc_bn) - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_height), name='ry') - rx = tvm.reduce_axis((0, kernel_width), name='rx') + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_height), name='ry') + rx = te.reduce_axis((0, kernel_width), name='rx') block_h = cfg["block_oh"].val block_w = cfg["block_ow"].val @@ -261,17 +262,17 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, else: temp = data - conv = tvm.compute( + conv = te.compute( cshape, lambda nn, ff, yy, xx, ff_v: \ - tvm.sum( - temp[nn, rc//ic_bn, yy * stride_h + ry, xx * stride_w + rx, rc%ic_bn]. \ - astype(out_dtype) * - kernel[ff, rc//ic_bn, ry, rx, rc%ic_bn, ff_v].astype(out_dtype), - axis=[rc, ry, rx]), tag="conv2d_NCHWc", name='conv2d_NCHWc') + te.sum( + temp[nn, rc//ic_bn, yy * stride_h + ry, xx * stride_w + rx, rc%ic_bn]. \ + astype(out_dtype) * + kernel[ff, rc//ic_bn, ry, rx, rc%ic_bn, ff_v].astype(out_dtype), + axis=[rc, ry, rx]), tag="conv2d_NCHWc", name='conv2d_NCHWc') if DOUNPACK: - output = tvm.compute( + output = te.compute( oshape, lambda nn, ff, yy, xx, ff_v: conv[nn][ff][yy][xx][ff_v], @@ -297,8 +298,8 @@ def schedule_conv2d_NCHWc(cfg, outs): s: Schedule The computation schedule for conv2d_nchw. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): """inline all one-to-one-mapping operators except the last stage (output)""" @@ -344,7 +345,7 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): # this part will be folded during Relay fold_constant pass. s[data].pragma(s[data].op.axis[0], "debug_skip_region") s[kernel].pragma(s[kernel].op.axis[0], "debug_skip_region") - elif isinstance(kernel.op, tvm.tensor.ComputeOp) and kernel.name == "kernel_vec": + elif isinstance(kernel.op, tvm.te.ComputeOp) and kernel.name == "kernel_vec": # data and kernel are not pre-computed, schedule layout transform here. 
# TODO(@Laurawly): Add schedule for data and kernel pack pass @@ -356,9 +357,9 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): z_factor = 1 y_factor = 1 x_factor = 16 - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + thread_z = te.thread_axis((0, z_factor), "threadIdx.z") + thread_y = te.thread_axis((0, y_factor), "threadIdx.y") + thread_x = te.thread_axis((0, x_factor), "threadIdx.x") _, co, oh, ow, vc = s[conv].op.axis ooh, ioh = s[conv].split(oh, factor=OUTPUT_BLOCK_HEIGHT) oow, iow = s[conv].split(ow, factor=OUTPUT_BLOCK_WIDTH) @@ -371,9 +372,9 @@ def _schedule_cl_spatialpack_NCHWc(cfg, s, op): s[conv].bind(oohi, thread_z) s[conv].bind(oowi, thread_y) s[conv].bind(vci, thread_x) - s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) - s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) - s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) + s[conv].bind(ooho, te.thread_axis("blockIdx.z")) + s[conv].bind(oowo, te.thread_axis("blockIdx.y")) + s[conv].bind(coi, te.thread_axis("blockIdx.x")) # schedule conv_L s[conv_L].compute_at(s[conv], vci) @@ -424,9 +425,9 @@ def conv2d_nchw(data, kernel, stride, padding, dilation, out_dtype='float32'): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] @@ -434,7 +435,7 @@ def conv2d_nchw(data, kernel, stride, padding, dilation, out_dtype='float32'): padding size, or [pad_height, pad_width] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ assert data.shape[0].value == 1, "only support batch size=1 convolution on intel gpu" @@ -456,8 +457,8 @@ def schedule_conv2d_nchw(outs): s: Schedule The computation schedule for conv2d_nchw. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): """inline all one-to-one-mapping operators except the last stage (output)""" @@ -483,9 +484,9 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1) oshape = (batch, out_channel, out_height, out_width) - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') if stride_h == 2: if num_filter + kernel_h == 515: @@ -529,20 +530,20 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, out_dtype='float16'): cshape = (batch, out_channel // nv, c_h, c_w, nv) kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv) - kernel_vec = tvm.compute( + kernel_vec = te.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[co*nv + vc][ci][kh][kw], name='kernel_vec') - conv = tvm.compute( + conv = te.compute( cshape, lambda nn, ff, yy, xx, vc: \ - tvm.sum( - temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * - kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), - axis=[rc, ry, rx]), name='conv', attrs=attrs) + te.sum( + temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) * + kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype), + axis=[rc, ry, rx]), name='conv', attrs=attrs) - output = tvm.compute( + output = te.compute( oshape, lambda nn, ff, yy, xx: conv[nn][ff//nv][yy][xx][ff%nv], @@ -573,9 +574,9 @@ def _schedule_cl_spatialpack(s, op): z_factor = 1 y_factor = 1 x_factor = 16 - thread_z = tvm.thread_axis((0, z_factor), "threadIdx.z") - thread_y = tvm.thread_axis((0, y_factor), "threadIdx.y") - thread_x = tvm.thread_axis((0, x_factor), "threadIdx.x") + thread_z = te.thread_axis((0, z_factor), "threadIdx.z") + thread_y = te.thread_axis((0, y_factor), "threadIdx.y") + thread_x = te.thread_axis((0, x_factor), "threadIdx.x") _, co, oh, ow, vc = s[conv].op.axis ooh, ioh = s[conv].split(oh, factor=OUTPUT_BLOCK_HEIGHT) oow, iow = s[conv].split(ow, factor=OUTPUT_BLOCK_WIDTH) @@ -588,9 +589,9 @@ def _schedule_cl_spatialpack(s, op): s[conv].bind(oohi, thread_z) s[conv].bind(oowi, thread_y) s[conv].bind(vci, thread_x) - s[conv].bind(ooho, tvm.thread_axis("blockIdx.z")) - s[conv].bind(oowo, tvm.thread_axis("blockIdx.y")) - s[conv].bind(coi, tvm.thread_axis("blockIdx.x")) + s[conv].bind(ooho, te.thread_axis("blockIdx.z")) + s[conv].bind(oowo, te.thread_axis("blockIdx.y")) + s[conv].bind(coi, te.thread_axis("blockIdx.x")) # schedule conv_L s[conv_L].compute_at(s[conv], vci) diff --git a/topi/python/topi/intel_graphics/conv2d_alter_op.py b/topi/python/topi/intel_graphics/conv2d_alter_op.py index e95e59f4c6d7..bbe5e7f296cf 100644 --- a/topi/python/topi/intel_graphics/conv2d_alter_op.py +++ b/topi/python/topi/intel_graphics/conv2d_alter_op.py @@ -18,6 +18,7 @@ """Conv2D alter op and legalize functions for x86""" import tvm +from tvm import te from tvm import relay from tvm import autotvm @@ -74,10 +75,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'] = 'NCHW%dc' % oc_bn # Store altered operator's config - new_data = tvm.placeholder((batch_size, 
in_channel//ic_bn, height, width, ic_bn), - dtype=data_dtype) - new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, - kh, kw, ic_bn, oc_bn), dtype=kernel_dtype) + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], new_attrs["out_layout"], out_dtype], "conv2d_NCHWc.intel_graphics") diff --git a/topi/python/topi/intel_graphics/depthwise_conv2d.py b/topi/python/topi/intel_graphics/depthwise_conv2d.py index 17f19435b62f..a54941315a1a 100644 --- a/topi/python/topi/intel_graphics/depthwise_conv2d.py +++ b/topi/python/topi/intel_graphics/depthwise_conv2d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """Schedule for depthwise_conv2d with auto fusion""" import tvm +from tvm import te from tvm import autotvm from ..util import traverse_inline from .. import tag @@ -44,8 +45,8 @@ def schedule_depthwise_conv2d_nchw(cfg, outs): s: Schedule The computation schedule for depthwise_conv2d nchw. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'depthwise_conv2d_nchw': @@ -75,7 +76,7 @@ def _callback(op): ##### space definition end ##### s[pad_data].compute_inline() - if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and 'dilate' in kernel.op.tag: s[kernel].compute_inline() if conv.op in s.outputs: @@ -100,15 +101,15 @@ def _callback(op): kernel_scope, n = s[output].split(n, nparts=1) bf = s[output].fuse(n, bf) - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(tf, tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) @@ -123,9 +124,9 @@ def _callback(op): fused, tx = s[load].split(fused, cfg["tile_x"].size[2]) fused, ty = s[load].split(fused, cfg["tile_y"].size[2]) fused, tz = s[load].split(fused, cfg["tile_f"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) s[output].pragma(kernel_scope, 
'unroll_explicit', cfg['unroll_explicit'].val) @@ -148,8 +149,8 @@ def schedule_depthwise_conv2d_nhwc(outs): s: Schedule The computation schedule for depthwise_conv2d nhwc. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(temp, Filter, DepthwiseConv2d): s[temp].compute_inline() @@ -161,13 +162,13 @@ def _schedule(temp, Filter, DepthwiseConv2d): Output = outs[0].op.output(0) s[DepthwiseConv2d].set_scope("local") - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") b, h, w, c = s[Output].op.axis # num_thread here could be 728, it is larger than cuda.max_num_threads - num_thread = tvm.ir_pass.Simplify(temp.shape[3]).value + num_thread = tvm.tir.ir_pass.Simplify(temp.shape[3]).value target = tvm.target.Target.current() if target and (target.target_name not in ["cuda", "nvptx"]): num_thread = target.max_num_threads @@ -206,7 +207,7 @@ def traverse(OP): if OP.tag == 'depthwise_conv2d_nhwc': PaddedInput = OP.input_tensors[0] Filter = OP.input_tensors[1] - if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag: + if isinstance(Filter.op, tvm.te.ComputeOp) and 'dilate' in Filter.op.tag: s[Filter].compute_inline() DepthwiseConv2d = OP.output(0) _schedule(PaddedInput, Filter, DepthwiseConv2d) @@ -232,14 +233,14 @@ def schedule_depthwise_conv2d_backward_input_nhwc(outs): The computation schedule for depthwise_conv2d backward wrt input with layout nhwc. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(Padded_out_grad, In_grad): s[Padded_out_grad].compute_inline() - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis("threadIdx.x") _, h, w, c = In_grad.op.axis fused_hwc = s[In_grad].fuse(h, w, c) @@ -277,13 +278,13 @@ def schedule_depthwise_conv2d_backward_weight_nhwc(outs): The computation schedule for depthwise_conv2d backward wrt weight with layout nhwc. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(Weight_grad): - block_x = tvm.thread_axis("blockIdx.x") - thread_y = tvm.thread_axis("threadIdx.y") - thread_x = tvm.thread_axis("threadIdx.x") + block_x = te.thread_axis("blockIdx.x") + thread_y = te.thread_axis("threadIdx.y") + thread_x = te.thread_axis("threadIdx.x") db, dh, dw = Weight_grad.op.reduce_axis diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py index f774e76c0ccd..d19592857086 100644 --- a/topi/python/topi/mali/conv2d.py +++ b/topi/python/topi/mali/conv2d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return """conv2d schedule on ARM Mali GPU""" import tvm +from tvm import te from tvm import relay from tvm import autotvm from tvm.autotvm.task.space import get_factors @@ -38,10 +39,10 @@ def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_ cfg: ConfigEntity The config for this template - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] or pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height, filter_width, num_filter_block] @@ -60,7 +61,7 @@ def conv2d_nchw_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_ Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ return conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, @@ -83,7 +84,7 @@ def schedule_conv2d_nchw_spatial_pack(cfg, outs): s: Schedule The computation schedule for conv2d """ - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): # schedule conv2d @@ -100,7 +101,7 @@ def _callback(op): kernel = kernel_vec.op.input_tensors[0] else: kernel = kernel_vec - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec) @@ -121,12 +122,12 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): BW, TW, VW = cfg["tile_ow"].size # schedule padding - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data s[data_pad].compute_inline() # schedule data packing - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and data_vec.op.name == 'data_vec_undilated': + if isinstance(data_vec.op, tvm.te.ComputeOp) and data_vec.op.name == 'data_vec_undilated': _, h, w, ci, _, _, vh, vw = s[data_vec].op.axis else: _, h, w, ci, vh, vw = s[data_vec].op.axis @@ -136,7 +137,7 @@ def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec): if vw.dom.extent.value < max_unroll: s[data_vec].unroll(vw) - if isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and kernel_vec.name == 'kernel_vec': + if isinstance(kernel_vec.op, tvm.te.ComputeOp) and kernel_vec.name == 'kernel_vec': if autotvm.GLOBAL_SCOPE.in_tuning: # kernel packing will be pre-computed during compilation, so we skip # this part to make tuning records correct @@ -147,8 +148,8 @@ def _schedule_spatial_pack(cfg, s, 
output, conv, data_vec, kernel_vec): fused = s[kernel_vec].fuse(co, ci, kh, kw, vc) fused, vec = s[kernel_vec].split(fused, VC) bb, tt = s[kernel_vec].split(fused, max_threads) - s[kernel_vec].bind(bb, tvm.thread_axis("blockIdx.x")) - s[kernel_vec].bind(tt, tvm.thread_axis("threadIdx.x")) + s[kernel_vec].bind(bb, te.thread_axis("blockIdx.x")) + s[kernel_vec].bind(tt, te.thread_axis("threadIdx.x")) if VC in vec_size: s[kernel_vec].vectorize(vec) @@ -199,7 +200,7 @@ def conv2d_nchw_winograd(cfg, data, kernel, strides, padding, dilation, out_dtyp @autotvm.register_topi_schedule("conv2d_nchw_winograd.mali") def schedule_conv2d_nchw_winograd(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'winograd_conv2d_output' in op.tag: @@ -271,54 +272,55 @@ def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, til assert CO % bna == 0 and P_round % bnb == 0 # pack input tile - input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \ - tvm.if_then_else( - b * bnb + bb < P, - data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps] - [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d') + input_tile = te.compute( + (CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \ + tvm.tir.if_then_else( + b * bnb + bb < P, + data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps] + [(b*bnb+bb) % nW * m + nu], tvm.tir.const(0, data_pad.dtype)), name='d') # transform kernel if pre_computed: U = kernel else: - r_kh = tvm.reduce_axis((0, KH), 'r_kh') - r_kw = tvm.reduce_axis((0, KW), 'r_kw') - U = tvm.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: - tvm.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], - axis=[r_kh, r_kw]), name='U') + r_kh = te.reduce_axis((0, KH), 'r_kh') + r_kw = te.reduce_axis((0, KW), 'r_kw') + U = te.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: + te.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), name='U') # transform image - r_a = tvm.reduce_axis((0, alpha), 'r_a') - r_b = tvm.reduce_axis((0, alpha), 'r_b') - V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: - tvm.sum(input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu], - axis=[r_a, r_b]), name='V') + r_a = te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_b') + V = te.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: + te.sum(input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu], + axis=[r_a, r_b]), name='V') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod # batch gemm - ci = tvm.reduce_axis((0, CI), name='c') - M = tvm.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p: - tvm.sum(U[eps][nu][idxdiv(co, bna)][ci][idxmod(co, bna)] * - V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)], axis=ci), name='M') + ci = te.reduce_axis((0, CI), name='c') + M = te.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p: + te.sum(U[eps][nu][idxdiv(co, bna)][ci][idxmod(co, bna)] * + V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)], axis=ci), name='M') - r_a = tvm.reduce_axis((0, alpha), 'r_a') - r_b = tvm.reduce_axis((0, alpha), 'r_b') - Y = tvm.compute((CO, P, m, m), lambda co, p, vh, vw: - tvm.sum(M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], - axis=[r_a, r_b]), name='Y') + r_a = 
te.reduce_axis((0, alpha), 'r_a') + r_b = te.reduce_axis((0, alpha), 'r_b') + Y = te.compute((CO, P, m, m), lambda co, p, vh, vw: + te.sum(M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], + axis=[r_a, r_b]), name='Y') # unpack output - output = tvm.compute((N, CO, H, W), lambda n, co, h, w: - Y[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - idxmod(h, m), idxmod(w, m)] - # The following hack term is used to make the padding in batch gemm ("M") - # effective, otherwise the padding will be eliminated by bound inference. - # Use `tvm.expr.Mul` instead of `*` to avoid issues in const folding. - + tvm.expr.Mul(tvm.const(0, out_dtype), - M[alpha-1][alpha-1][CO-1][P_round-1]), - name='output', tag='winograd_conv2d_output') + output = te.compute((N, CO, H, W), lambda n, co, h, w: + Y[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), + idxmod(h, m), idxmod(w, m)] + # The following hack term is used to make the padding in batch gemm ("M") + # effective, otherwise the padding will be eliminated by bound inference. + # Use `tvm.tir.Mul` instead of `*` to avoid issues in const folding. + + tvm.tir.Mul(tvm.tir.const(0, out_dtype), + M[alpha-1][alpha-1][CO-1][P_round-1]), + name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * CO * H * W * KH * KW * CI) @@ -339,7 +341,7 @@ def _schedule_winograd(cfg, s, op): s[data_pad].compute_inline() # transform kernel - if isinstance(U.op, tvm.tensor.ComputeOp): + if isinstance(U.op, tvm.te.ComputeOp): kernel, G = s[U].op.input_tensors s[G].compute_inline() eps, nu, co, ci, vco, = s[U].op.axis @@ -355,7 +357,7 @@ def _schedule_winograd(cfg, s, op): tile_and_bind(s, U, co, ci, 1, 256) # dilation - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() # transform image @@ -370,8 +372,8 @@ def _schedule_winograd(cfg, s, op): fused = s[V].fuse(p, ci) bb, tt = cfg['tile_t1'].apply(s, V, fused) - s[V].bind(bb, tvm.thread_axis('blockIdx.x')) - s[V].bind(tt, tvm.thread_axis('threadIdx.x')) + s[V].bind(bb, te.thread_axis('blockIdx.x')) + s[V].bind(tt, te.thread_axis('threadIdx.x')) eps, nu, p, ci, vp = s[VL].op.axis r_a, r_b = s[VL].op.reduce_axis @@ -416,8 +418,8 @@ def _schedule_winograd(cfg, s, op): s[output].unroll(wi) fused = s[output].fuse(n, co, h, w) bb, tt = cfg['tile_t2'].apply(s, output, fused) - s[output].bind(bb, tvm.thread_axis('blockIdx.x')) - s[output].bind(tt, tvm.thread_axis('threadIdx.x')) + s[output].bind(bb, te.thread_axis('blockIdx.x')) + s[output].bind(tt, te.thread_axis('threadIdx.x')) s[Y].compute_at(s[output], tt) @@ -451,7 +453,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): data, kernel = tinfos out_dtype = out_type.dtype - idxd = tvm.indexdiv + idxd = tvm.tir.indexdiv if topi_tmpl == "conv2d_nchw_spatial_pack.mali": assert data_layout == "NCHW" and kernel_layout == "OIHW" @@ -462,7 +464,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['kernel_layout'] = 'OIHW%do' % VC new_data = data - new_kernel = tvm.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) + new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], "conv2d_nchw_spatial_pack.mali") @@ -488,10 +490,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['tile_size'] = tile_size new_data = 
data - new_kernel = tvm.placeholder((KH + tile_size - 1, - KW + tile_size -1, - idxd(CO, VC), CI, VC), - kernel.dtype) + new_kernel = te.placeholder((KH + tile_size - 1, + KW + tile_size -1, + idxd(CO, VC), CI, VC), + kernel.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, out_dtype], 'conv2d_nchw_winograd.mali') @@ -508,10 +510,10 @@ def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None): """ tile and bind to GPU threads """ x_factor = x_factor or y_factor yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) + s[tensor].bind(xi, te.thread_axis("threadIdx.x")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) + s[tensor].bind(yi, te.thread_axis("threadIdx.y")) return yo, xo, yi, xi @@ -522,11 +524,11 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None zo, zi = s[tensor].split(z, z_factor) yo, yi = s[tensor].split(y, y_factor) xo, xi = s[tensor].split(x, x_factor) - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) - s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(zo, te.thread_axis("blockIdx.z")) + s[tensor].bind(zi, te.thread_axis("threadIdx.z")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) + s[tensor].bind(yi, te.thread_axis("threadIdx.y")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) + s[tensor].bind(xi, te.thread_axis("threadIdx.x")) s[tensor].reorder(zo, yo, xo, zi, yi, xi) return zo, yo, xo, zi, yi, xi diff --git a/topi/python/topi/mali/dense.py b/topi/python/topi/mali/dense.py index 3b233e92ba8a..8ec5d19c9fa0 100644 --- a/topi/python/topi/mali/dense.py +++ b/topi/python/topi/mali/dense.py @@ -16,10 +16,7 @@ # under the License. # pylint: disable=invalid-name,unused-variable """dense schedule on ARM Mali GPU""" - -from __future__ import absolute_import as _abs - -import tvm +from tvm import te from tvm import autotvm from .. import nn @@ -50,8 +47,8 @@ def schedule_dense(cfg, outs): s: Schedule The computation schedule for dense. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'dense': @@ -82,10 +79,10 @@ def _callback(op): by, ty, yi = cfg['tile_y'].apply(s, output, y) bx, tx, xi = cfg['tile_x'].apply(s, output, x) - s[output].bind(by, tvm.thread_axis('blockIdx.y')) - s[output].bind(bx, tvm.thread_axis('blockIdx.x')) - s[output].bind(ty, tvm.thread_axis('threadIdx.y')) - s[output].bind(tx, tvm.thread_axis('threadIdx.x')) + s[output].bind(by, te.thread_axis('blockIdx.y')) + s[output].bind(bx, te.thread_axis('blockIdx.x')) + s[output].bind(ty, te.thread_axis('threadIdx.y')) + s[output].bind(tx, te.thread_axis('threadIdx.x')) if cfg['tile_y'].size[-1] < max_unroll: s[output].unroll(yi) @@ -113,6 +110,6 @@ def fuse_and_bind(s, tensor, axis=None, num_thread=None): axis = axis or s[tensor].op.axis fused = s[tensor].fuse(*axis) bx, tx = s[tensor].split(fused, num_thread) - s[tensor].bind(bx, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(tx, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(bx, te.thread_axis("blockIdx.x")) + s[tensor].bind(tx, te.thread_axis("threadIdx.x")) return bx, tx diff --git a/topi/python/topi/mali/depthwise_conv2d.py b/topi/python/topi/mali/depthwise_conv2d.py index 4ff17e534feb..785128c84dfa 100644 --- a/topi/python/topi/mali/depthwise_conv2d.py +++ b/topi/python/topi/mali/depthwise_conv2d.py @@ -18,6 +18,7 @@ """depthwise_conv2d schedule on ARM Mali GPU""" import tvm +from tvm import te from tvm import autotvm from .. import nn @@ -47,8 +48,8 @@ def schedule_depthwise_conv2d_nchw(cfg, outs): s: Schedule The computation schedule for depthwise_conv2d nchw. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(pad_data, kernel, conv): """schedule depthwise_conv2d""" @@ -75,7 +76,7 @@ def _schedule(pad_data, kernel, conv): tile_and_bind3d(s, pad_data, c, y, x, cfg["tile_c"].size[1], 1, 1) # schedule dilation - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() # schedule conv @@ -93,12 +94,12 @@ def _schedule(pad_data, kernel, conv): bx, tx, xi = cfg['tile_x'].apply(s, output, x) bc = s[output].fuse(n, bc) - s[output].bind(bc, tvm.thread_axis("blockIdx.z")) - s[output].bind(tc, tvm.thread_axis("threadIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bc, te.thread_axis("blockIdx.z")) + s[output].bind(tc, te.thread_axis("threadIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) di, dj = s[OL].op.reduce_axis s[OL].unroll(di) @@ -134,10 +135,10 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None zo, zi = s[tensor].split(z, z_factor) yo, yi = s[tensor].split(y, y_factor) xo, xi = s[tensor].split(x, x_factor) - s[tensor].bind(zo, tvm.thread_axis("blockIdx.z")) - s[tensor].bind(zi, tvm.thread_axis("threadIdx.z")) - s[tensor].bind(yo, tvm.thread_axis("blockIdx.y")) - s[tensor].bind(yi, tvm.thread_axis("threadIdx.y")) - s[tensor].bind(xo, tvm.thread_axis("blockIdx.x")) - s[tensor].bind(xi, tvm.thread_axis("threadIdx.x")) + s[tensor].bind(zo, te.thread_axis("blockIdx.z")) + s[tensor].bind(zi, te.thread_axis("threadIdx.z")) + s[tensor].bind(yo, te.thread_axis("blockIdx.y")) + s[tensor].bind(yi, te.thread_axis("threadIdx.y")) + s[tensor].bind(xo, te.thread_axis("blockIdx.x")) + s[tensor].bind(xi, te.thread_axis("threadIdx.x")) return zo, zi, yo, yi, xo, xi diff --git a/topi/python/topi/math.py b/topi/python/topi/math.py index 148d53a54cfe..5b6b9ab8da75 100644 --- a/topi/python/topi/math.py +++ b/topi/python/topi/math.py @@ -16,181 +16,181 @@ # under the License. """Elementwise operators""" # pylint: disable=redefined-builtin -from __future__ import absolute_import as _abs import tvm +from tvm import te from . import tag from . import cpp -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def identity(x): """Take identity of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ # pylint: disable=unnecessary-lambda - return tvm.compute(x.shape, lambda *i: x(*i)) + return te.compute(x.shape, lambda *i: x(*i)) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def negative(x): """Take negation of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. 
""" # pylint: disable=unnecessary-lambda - return tvm.compute(x.shape, lambda *i: -x(*i)) + return te.compute(x.shape, lambda *i: -x(*i)) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def exp(x): """Take exponential of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.exp(x(*i))) + return te.compute(x.shape, lambda *i: te.exp(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def erf(x): """Take gauss error function of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.erf(x(*i))) + return te.compute(x.shape, lambda *i: te.erf(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def tanh(x): """Take hyperbolic tanh of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.tanh(x(*i))) + return te.compute(x.shape, lambda *i: te.tanh(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def cos(x): """Take cos of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.cos(x(*i))) + return te.compute(x.shape, lambda *i: te.cos(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def sin(x): """Take sin of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.sin(x(*i))) + return te.compute(x.shape, lambda *i: te.sin(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def atan(x): """Take atan of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.atan(x(*i))) + return te.compute(x.shape, lambda *i: te.atan(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def floor(x): """Take floor of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.floor(x(*i))) + return te.compute(x.shape, lambda *i: te.floor(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def ceil(x): """Take ceil of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.ceil(x(*i))) + return te.compute(x.shape, lambda *i: te.ceil(x(*i))) def sign(x): @@ -198,199 +198,199 @@ def sign(x): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ return cpp.sign(x) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def trunc(x): """Take truncated value of the input of x, element-wise. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. 
Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.trunc(x(*i))) + return te.compute(x.shape, lambda *i: te.trunc(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def abs(x): """Take absolute value of the input of x, element-wise. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.abs(x(*i))) + return te.compute(x.shape, lambda *i: te.abs(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def isnan(x): """Check if value of x is NaN, element-wise. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.isnan(x(*i))) + return te.compute(x.shape, lambda *i: te.isnan(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def round(x): """Round elements of x to nearest integer. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.round(x(*i))) + return te.compute(x.shape, lambda *i: te.round(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def log(x): """Take logarithm of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.log(x(*i))) + return te.compute(x.shape, lambda *i: te.log(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def sqrt(x): """Take square root of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.sqrt(x(*i))) + return te.compute(x.shape, lambda *i: te.sqrt(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def rsqrt(x): """Take inverse square root of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.rsqrt(x(*i))) + return te.compute(x.shape, lambda *i: te.rsqrt(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def sigmoid(x): """Take sigmoid tanh of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.sigmoid(x(*i))) + return te.compute(x.shape, lambda *i: te.sigmoid(x(*i))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def left_shift(x, n): """Take n bits left shift of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. n : int Number of bits. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: x(*i) << n) + return te.compute(x.shape, lambda *i: x(*i) << n) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def right_shift(x, n): """Take n bits right shift of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. n : int Number of bits. 
     Returns
     -------
-    y : tvm.Tensor
+    y : tvm.te.Tensor
         The result.
     """
-    return tvm.compute(x.shape, lambda *i: x(*i) >> n)
+    return te.compute(x.shape, lambda *i: x(*i) >> n)
 
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
+@tvm.te.tag_scope(tag=tag.ELEMWISE)
 def clip(x, a_min, a_max):
     """Clip (limit) the values in an array.
     Given an interval, values outside the interval
     are clipped to the interval edges.
 
     Parameters
     ----------
-    x : tvm.Tensor
+    x : tvm.te.Tensor
         Input argument.
     a_min : int or float
         Minimum value.
@@ -399,15 +399,15 @@
 
     Returns
     -------
-    y : tvm.Tensor
+    y : tvm.te.Tensor
         The result.
     """
     def _compute(*indices):
         value = x(*indices)
-        const_min = tvm.const(a_min, value.dtype)
-        const_max = tvm.const(a_max, value.dtype)
-        return tvm.max(tvm.min(value, const_max), const_min)
-    return tvm.compute(x.shape, _compute)
+        const_min = tvm.tir.const(a_min, value.dtype)
+        const_max = tvm.tir.const(a_max, value.dtype)
+        return tvm.te.max(tvm.te.min(value, const_max), const_min)
+    return te.compute(x.shape, _compute)
 
 
 def cast(x, dtype):
@@ -415,7 +415,7 @@
 
     Parameters
     ----------
-    x : tvm.Tensor or Expr
+    x : tvm.te.Tensor or Expr
         Input argument.
 
     dtype : str
@@ -423,11 +423,11 @@ def cast(x, dtype):
 
     Returns
    -------
-    y : tvm.Tensor
+    y : tvm.te.Tensor
         The result.
     """
-    if isinstance(x, tvm.tensor.Tensor):
-        return tvm.compute(
+    if isinstance(x, te.tensor.Tensor):
+        return te.compute(
             x.shape, lambda *i: x(*i).astype(dtype), tag=tag.ELEMWISE)
     # pylint: disable=import-outside-toplevel
     from tvm.tir import _ffi_api
@@ -439,7 +439,7 @@ def reinterpret(x, dtype):
 
     Parameters
     ----------
-    x : tvm.Tensor
+    x : tvm.te.Tensor
         Input argument.
 
     dtype : str
@@ -447,7 +447,7 @@ def reinterpret(x, dtype):
 
     Returns
     -------
-    y : tvm.Tensor
+    y : tvm.te.Tensor
         The result.
     """
     return cpp.reinterpret(x, dtype)
@@ -458,12 +458,12 @@ def fast_exp(x):
 
     Parameters
     ----------
-    x : tvm.Tensor
+    x : tvm.te.Tensor
         Input argument.
 
     Returns
     -------
-    y : tvm.Tensor
+    y : tvm.te.Tensor
         The result.
     """
     return cpp.fast_exp(x, x.dtype, tag.ELEMWISE)
diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py
index d69562c4daf6..0d9f3510d097 100644
--- a/topi/python/topi/nn/batch_matmul.py
+++ b/topi/python/topi/nn/batch_matmul.py
@@ -16,8 +16,7 @@
 # under the License.
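As a quick orientation to the elementwise idiom migrated above, here is a standalone sketch (shapes, names, and the ReLU6-style bounds are illustrative only, and it assumes a TVM build that ships the unified `tvm.te`/`tvm.tir` namespaces this series targets):

import tvm
from tvm import te

def clip_sketch(x, a_min, a_max):
    # Clamp each element of x into [a_min, a_max], mirroring the migrated clip above.
    def _compute(*indices):
        value = x(*indices)
        const_min = tvm.tir.const(a_min, value.dtype)  # tvm.tir.const replaces tvm.const
        const_max = tvm.tir.const(a_max, value.dtype)
        return tvm.te.max(tvm.te.min(value, const_max), const_min)
    return te.compute(x.shape, _compute, name="clip")

A = te.placeholder((16,), name="A")         # float32 by default
B = clip_sketch(A, 0.0, 6.0)                # ReLU6-style clamp
s = te.create_schedule(B.op)                # te.create_schedule replaces tvm.create_schedule
func = tvm.build(s, [A, B], target="llvm")  # host-only build, no GPU required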
"""Binary Neural Network (BNN) Operators""" # pylint: disable=invalid-name -from __future__ import absolute_import as _abs -import tvm +from tvm import te from ..util import get_const_tuple def batch_matmul(x, y): @@ -26,15 +25,15 @@ def batch_matmul(x, y): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor 3-D with shape [batch, M, K] - y : tvm.Tensor + y : tvm.te.Tensor 3-D with shape [batch, N, K] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 3-D with shape [batch, M, N] """ assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul" @@ -44,7 +43,7 @@ def batch_matmul(x, y): assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant" batch, M, K = x.shape N = y.shape[1] - k = tvm.reduce_axis((0, K), name='k') - return tvm.compute((batch, M, N), - lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), - tag='batch_matmul') + k = te.reduce_axis((0, K), name='k') + return te.compute((batch, M, N), + lambda b, i, j: te.sum(x[b, i, k] * y[b, j, k], axis=k), + tag='batch_matmul') diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index f18a5aae7eed..e1a7697ca4de 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -17,8 +17,8 @@ # pylint: disable=invalid-name, too-many-locals, too-many-arguments # pylint: disable=unused-argument, redefined-builtin """Bitserial Conv2D operators""" -from __future__ import absolute_import as _abs import tvm +from tvm import te from .pad import pad from .util import get_pad_tuple from .bitserial_util import bitpack @@ -30,10 +30,10 @@ def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - filter : tvm.Tensor + filter : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints @@ -59,7 +59,7 @@ def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ assert isinstance(stride, int) or len(stride) == 2 @@ -88,32 +88,32 @@ def bitserial_conv2d_nchw(data, kernel, stride, padding, activation_bits, weight out_height = (in_height - kernel_h + TPAD + DPAD) // stride_h + 1 out_width = (in_width - kernel_w + LPAD + RPAD) // stride_w + 1 - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - b1 = tvm.reduce_axis((0, activation_bits), name='b1') - b2 = tvm.reduce_axis((0, weight_bits), name='b2') + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + b1 = te.reduce_axis((0, activation_bits), name='b1') + b2 = te.reduce_axis((0, weight_bits), name='b2') if unipolar: def _conv(nn, ff, yy, xx): b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum( - ((tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & - Filter_q[ff, rc, ry, rx, b2]) - - tvm.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & - ~Filter_q[ff, rc, ry, rx, b2])) + return te.sum( + ((tvm.tir.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & + Filter_q[ff, rc, ry, rx, b2]) - + tvm.tir.popcount(PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] 
& + ~Filter_q[ff, rc, ry, rx, b2])) << (b1b2)).astype(out_dtype), axis=[rc, ry, rx, b2, b1]).astype(out_dtype) else: def _conv(nn, ff, yy, xx): b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum((tvm.popcount( + return te.sum((tvm.tir.popcount( PadInput_q[nn, rc, b1, yy * stride_h + ry, xx * stride_w + rx] & Filter_q[ff, rc, ry, rx, b2])<< (b1b2)).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]).astype(out_dtype) + axis=[rc, ry, rx, b2, b1]).astype(out_dtype) - return tvm.compute((batch, out_channel, out_height, out_width), _conv, - name="Conv2dOutput", tag="bitserial_conv2d_nchw") + return te.compute((batch, out_channel, out_height, out_width), _conv, + name="Conv2dOutput", tag="bitserial_conv2d_nchw") def bitserial_conv2d_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype='uint32', out_dtype='int16', unipolar=True): @@ -121,10 +121,10 @@ def bitserial_conv2d_nhwc(data, kernel, stride, padding, activation_bits, weight Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - filter : tvm.Tensor + filter : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints @@ -150,7 +150,7 @@ def bitserial_conv2d_nhwc(data, kernel, stride, padding, activation_bits, weight Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 2 @@ -180,33 +180,33 @@ def bitserial_conv2d_nhwc(data, kernel, stride, padding, activation_bits, weight out_width = (in_width - kernel_w + LPAD + RPAD) // stride_w + 1 PadInput_q = pad(Input_q, pad_before, pad_after, name="PaddedInput") - rc = tvm.reduce_axis((0, in_channel_q), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - b1 = tvm.reduce_axis((0, activation_bits), name='b1') - b2 = tvm.reduce_axis((0, weight_bits), name='b2') + rc = te.reduce_axis((0, in_channel_q), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + b1 = te.reduce_axis((0, activation_bits), name='b1') + b2 = te.reduce_axis((0, weight_bits), name='b2') if unipolar: def _conv(nn, yy, xx, ff): b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum( - ((tvm.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & - Filter_q[ry, rx, rc, ff, b2]) - - tvm.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & - ~Filter_q[ry, rx, rc, ff, b2])) + return te.sum( + ((tvm.tir.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & + Filter_q[ry, rx, rc, ff, b2]) - + tvm.tir.popcount(PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & + ~Filter_q[ry, rx, rc, ff, b2])) << b1b2).astype(out_dtype), axis=[rc, ry, rx, b2, b1]) else: def _conv(nn, yy, xx, ff): b1b2 = (b1+b2).astype(out_dtype) - return tvm.sum((tvm.popcount( + return te.sum((tvm.tir.popcount( PadInput_q[nn, yy * stride_h + ry, xx * stride_w + rx, rc, b1] & Filter_q[ry, rx, rc, ff, b2]) << b1b2).astype(out_dtype), - axis=[rc, ry, rx, b2, b1]) + axis=[rc, ry, rx, b2, b1]) - conv = tvm.compute((batch, out_height, out_width, out_channel), _conv, - name="Conv2dOutput", tag="bitserial_conv2d_nhwc") + conv = te.compute((batch, out_height, out_width, out_channel), _conv, + name="Conv2dOutput", tag="bitserial_conv2d_nhwc") return conv diff --git a/topi/python/topi/nn/bitserial_dense.py 
b/topi/python/topi/nn/bitserial_dense.py index fa1b5df7d066..10635d8e9f2c 100644 --- a/topi/python/topi/nn/bitserial_dense.py +++ b/topi/python/topi/nn/bitserial_dense.py @@ -18,6 +18,7 @@ """Bitserial Dense operator.""" from __future__ import absolute_import import tvm +from tvm import te from topi.util import get_const_tuple from .bitserial_util import bitpack @@ -27,14 +28,14 @@ def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] or 3-D with shape [out_dim, weight_bits, in_dim] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) @@ -46,18 +47,18 @@ def bitserial_dense(data, weight, data_bits, weight_bits, pack_dtype='uint32', X, WB, _ = get_const_tuple(weight_packed.shape) oshape = (Y, X) - k = tvm.reduce_axis((0, K), name='k') - db = tvm.reduce_axis((0, DB), name='db') - wb = tvm.reduce_axis((0, WB), name='wb') + k = te.reduce_axis((0, K), name='k') + db = te.reduce_axis((0, DB), name='db') + wb = te.reduce_axis((0, WB), name='wb') - matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( - (tvm.popcount(weight_packed[j, wb, k] & data_packed[i, db, k]) - - tvm.popcount(~weight_packed[j, wb, k] & data_packed[i, db, k])).astype(out_dtype) + matmul_unipolar = te.compute(oshape, lambda i, j: te.sum( + (tvm.tir.popcount(weight_packed[j, wb, k] & data_packed[i, db, k]) - + tvm.tir.popcount(~weight_packed[j, wb, k] & data_packed[i, db, k])).astype(out_dtype) << (db+wb).astype(out_dtype), axis=[wb, db, k]), - tag='bitserial_dense_unipolar') + tag='bitserial_dense_unipolar') - matmul = tvm.compute(oshape, lambda i, j: tvm.sum( - tvm.popcount(weight_packed[j, wb, k] & data_packed[i, db, k]).astype(out_dtype) + matmul = te.compute(oshape, lambda i, j: te.sum( + tvm.tir.popcount(weight_packed[j, wb, k] & data_packed[i, db, k]).astype(out_dtype) << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') diff --git a/topi/python/topi/nn/bitserial_util.py b/topi/python/topi/nn/bitserial_util.py index def5b5e2e193..a25aa91198d8 100644 --- a/topi/python/topi/nn/bitserial_util.py +++ b/topi/python/topi/nn/bitserial_util.py @@ -18,6 +18,7 @@ """Utility functions for bitserial operators""" import numpy as np import tvm +from tvm import te from topi.transform import concatenate from ..util import get_const_int @@ -52,7 +53,7 @@ def bitpack(data, bits, pack_axis, bit_axis, pack_type, name="QuantizeInput"): pack_axis += 1 def _bitpack(*indices): - packed_data = [tvm.const(0, pack_type)] * bits + packed_data = [tvm.tir.const(0, pack_type)] * bits for k in range(data_width): # Translate indices for packed data back to original idx = [0] * n @@ -68,7 +69,8 @@ def _bitpack(*indices): element = data(*idx) for b in range(bits): - extracted_bit = ((element & tvm.const(masks[b], "int32")) >> b).astype(pack_type) + extracted_bit = ( + (element & tvm.tir.const(masks[b], "int32")) >> b).astype(pack_type) packed_data[b] = (packed_data[b] | extracted_bit) if k < data_width - 1: packed_data[b] = packed_data[b] << 1 @@ -77,7 +79,7 @@ def _bitpack(*indices): return tuple(packed_data) return tuple(packed_data) - output_tuple = tvm.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') + output_tuple = te.compute(bitserial_oshape, _bitpack, name=name, tag='bitpack') if 
bits > 1:
        return concatenate(output_tuple, axis=bit_axis)
diff --git a/topi/python/topi/nn/bnn.py b/topi/python/topi/nn/bnn.py
index e3b841e0b673..d7355fb0c297 100644
--- a/topi/python/topi/nn/bnn.py
+++ b/topi/python/topi/nn/bnn.py
@@ -17,6 +17,7 @@
 """Binary Neural Network (BNN) Operators"""
 from __future__ import absolute_import as _abs
 import tvm
+from tvm import te
 from .. import tag
 from ..util import simplify, get_const_int
 
@@ -26,7 +27,7 @@ def binarize_pack(data, axis=None, name="PackedInput"):
 
     Parameters
     ----------
-    data : tvm.Tensor
+    data : tvm.te.Tensor
         n-D input, can be any layout.
 
     axis : None or int
@@ -38,7 +39,7 @@ def binarize_pack(data, axis=None, name="PackedInput"):
 
     Returns
     -------
-    output : tvm.Tensor
+    output : tvm.te.Tensor
         n-D, the same layout as input, dtype is uint32.
     """
     ishape = data.shape
@@ -47,11 +48,11 @@ def binarize_pack(data, axis=None, name="PackedInput"):
     assert get_const_int(ishape[axis]) % 32 == 0
     n = len(ishape)
     oshape = tuple(simplify(ishape[i] // 32) if i == axis \
-        else ishape[i] for i in range(n))
+                   else ishape[i] for i in range(n))
 
     def _binarize_pack(*indices):
         start_idx = [indices[i] * 32 if i == axis else indices[i] for i in range(n)]
-        packed = tvm.const(0, 'uint32')
+        packed = tvm.tir.const(0, 'uint32')
         for j in range(32):
             idx = [start_idx[i] + j if i == axis else start_idx[i] for i in range(n)]
             sign = (data(*idx) >= 0).astype("uint32")
@@ -61,7 +62,7 @@ def _binarize_pack(*indices):
             packed = packed << 1
         raise RuntimeError("not resach")
 
-    return tvm.compute(oshape, _binarize_pack, name=name, tag='binarize_pack')
+    return te.compute(oshape, _binarize_pack, name=name, tag='binarize_pack')
 
 
 def binary_dense(data, weight):
@@ -69,15 +70,15 @@ def binary_dense(data, weight):
 
     Parameters
     ----------
-    data : tvm.Tensor
+    data : tvm.te.Tensor
         2-D with shape [batch, in_dim], dtype is uint32.
 
-    weight : tvm.Tensor
+    weight : tvm.te.Tensor
         2-D with shape [out_dim, in_dim], dtype is uint32.
 
     Returns
     -------
-    output : tvm.Tensor
+    output : tvm.te.Tensor
         2-D with shape [batch, out_dim], dtype is float32.
     """
     assert data.dtype == 'uint32' and weight.dtype == 'uint32', \
@@ -86,11 +87,11 @@ def binary_dense(data, weight):
         "only support 2-dim binary dense"
     batch, in_dim = data.shape
     out_dim, _ = weight.shape
-    k = tvm.reduce_axis((0, in_dim), name='k')
-    matmul = tvm.compute((batch, out_dim), lambda i, j: \
-        tvm.sum(tvm.popcount(data[i, k] ^ weight[j, k]), axis=k), \
-        tag='binary_dense')
+    k = te.reduce_axis((0, in_dim), name='k')
+    matmul = te.compute((batch, out_dim), lambda i, j: \
+                        te.sum(tvm.tir.popcount(data[i, k] ^ weight[j, k]), axis=k), \
+                        tag='binary_dense')
 
-    return tvm.compute((batch, out_dim), lambda i, j: \
-        32 * in_dim - 2. * matmul(i, j), \
-        tag=tag.ELEMWISE)
+    return te.compute((batch, out_dim), lambda i, j: \
+                      32 * in_dim - 2. * matmul(i, j), \
+                      tag=tag.ELEMWISE)
diff --git a/topi/python/topi/nn/conv1d.py b/topi/python/topi/nn/conv1d.py
index 4565fd2f5a46..8049dff01ffa 100644
--- a/topi/python/topi/nn/conv1d.py
+++ b/topi/python/topi/nn/conv1d.py
@@ -16,8 +16,7 @@
 # under the License.
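The migrated binary_dense reduction above is easiest to see end to end in a small standalone sketch (shapes and names are illustrative, and it assumes the unified `tvm.te`/`tvm.tir` namespaces):

import tvm
from tvm import te

batch, in_dim, out_dim = 8, 4, 10   # in_dim counts packed uint32 words, i.e. 128 bits
data = te.placeholder((batch, in_dim), dtype="uint32", name="data")
weight = te.placeholder((out_dim, in_dim), dtype="uint32", name="weight")

k = te.reduce_axis((0, in_dim), name="k")
# XOR then popcount counts the mismatching bits between two packed rows;
# te.reduce_axis/te.sum and tvm.tir.popcount replace the old tvm.* spellings.
matmul = te.compute(
    (batch, out_dim),
    lambda i, j: te.sum(tvm.tir.popcount(data[i, k] ^ weight[j, k]), axis=k),
    name="binary_matmul")
s = te.create_schedule(matmul.op)
print(tvm.lower(s, [data, weight, matmul], simple_mode=True))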
# pylint: disable=invalid-name, unused-variable, unused-argument """1D convolution operators.""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from .pad import pad from ..util import simplify from .util import get_pad_tuple1d @@ -34,11 +33,11 @@ def conv1d(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 3-D input shape [batch, in_channel, in_width] for layout == 'NCW' and [batch, in_width, in_channel] for layout == 'NWC' - kernel : tvm.Tensor + kernel : tvm.te.Tensor 3-D kernel with shape [num_filter, in_channel, filter_size] for layout == 'NCW' and [filter_size, in_channel, num_filter] for layout == 'NWC' @@ -81,10 +80,10 @@ def conv1d_ncw(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 3-D with shape [batch, in_channel, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 3-D with shape [num_filter, in_channel, filter_size] strides : int or tuple @@ -123,12 +122,12 @@ def conv1d_ncw(data, temp = pad(data, pad_before, pad_after, name='pad_temp') # Compute graph - rc = tvm.reduce_axis((0, in_channels), name='rc') - rw = tvm.reduce_axis((0, kernel_size), name='rw') + rc = te.reduce_axis((0, in_channels), name='rc') + rw = te.reduce_axis((0, kernel_size), name='rw') - return tvm.compute( + return te.compute( (batch, out_channels, out_width), - lambda b, c, w: tvm.sum( + lambda b, c, w: te.sum( temp[b, rc, w * strides + rw * dilation].astype(out_dtype) * kernel[c, rc, rw].astype(out_dtype), axis=[rc, rw]), @@ -145,10 +144,10 @@ def conv1d_nwc(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 3-D with shape [batch, in_width, in_channel] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 3-D with shape [filter_size, in_channel, num_filter] strides : int or tuple @@ -187,12 +186,12 @@ def conv1d_nwc(data, temp = pad(data, pad_before, pad_after, name='pad_temp') # Compute graph - rc = tvm.reduce_axis((0, in_channels), name='rc') - rw = tvm.reduce_axis((0, kernel_size), name='rw') + rc = te.reduce_axis((0, in_channels), name='rc') + rw = te.reduce_axis((0, kernel_size), name='rw') - return tvm.compute( + return te.compute( (batch, out_width, out_channels), - lambda b, w, c: tvm.sum( + lambda b, w, c: te.sum( temp[b, w * strides + rw * dilation, rc].astype(out_dtype) * kernel[rw, rc, c].astype(out_dtype), axis=[rc, rw]), diff --git a/topi/python/topi/nn/conv1d_transpose.py b/topi/python/topi/nn/conv1d_transpose.py index 8d224247db01..1895b1f04a1f 100644 --- a/topi/python/topi/nn/conv1d_transpose.py +++ b/topi/python/topi/nn/conv1d_transpose.py @@ -16,8 +16,7 @@ # under the License. 
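Before the transposed variant below, a condensed sketch of the 1-D convolution pattern just migrated (illustrative shapes, no padding or dilation, plus a hypothetical GPU binding; assumes the unified `tvm.te` namespace and, for the final build, a CUDA toolchain):

import tvm
from tvm import te

batch, in_c, in_w = 1, 3, 34
out_c, k_w, stride = 8, 3, 1
out_w = (in_w - k_w) // stride + 1   # no padding/dilation, for brevity

data = te.placeholder((batch, in_c, in_w), name="data")
kernel = te.placeholder((out_c, in_c, k_w), name="kernel")

rc = te.reduce_axis((0, in_c), name="rc")   # te.reduce_axis replaces tvm.reduce_axis
rw = te.reduce_axis((0, k_w), name="rw")
conv = te.compute(
    (batch, out_c, out_w),
    lambda b, c, w: te.sum(data[b, rc, w * stride + rw] * kernel[c, rc, rw],
                           axis=[rc, rw]),
    name="conv1d_ncw")

s = te.create_schedule(conv.op)
fused = s[conv].fuse(*s[conv].op.axis)
bx, tx = s[conv].split(fused, factor=64)
s[conv].bind(bx, te.thread_axis("blockIdx.x"))   # te.thread_axis replaces tvm.thread_axis
s[conv].bind(tx, te.thread_axis("threadIdx.x"))
mod = tvm.build(s, [data, kernel, conv], target="cuda")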
# pylint: disable=invalid-name, unused-variable, unused-argument """Transposed 1D convolution operators (sometimes called Deconvolution).""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from .dilate import dilate from .pad import pad from ..util import simplify @@ -29,10 +28,10 @@ def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 3-D with shape [batch, in_channel, in_width] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 3-D with shape [in_channel, num_filter, filter_width] stride : ints @@ -46,7 +45,7 @@ def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 3-D with shape [batch, out_channel, out_width] """ @@ -63,18 +62,18 @@ def conv1d_transpose_ncw(data, kernel, stride, padding, out_dtype): data = pad(data, [0, 0, pad_left], [0, 0, pad_right], name='data_pad') # transpose kernel, switch kernel layout to IOW - kernel = tvm.compute((channels_out, channels_in, kernel_width), \ - lambda o, i, w: kernel[i][o][kernel_width-1-w],\ - name='kernel') + kernel = te.compute((channels_out, channels_in, kernel_width), \ + lambda o, i, w: kernel[i][o][kernel_width-1-w],\ + name='kernel') # convolution _, _, data_width = data.shape out_w = simplify(data_width - kernel_width + 1) - dc = tvm.reduce_axis((0, channels_in), name='dc') - dw = tvm.reduce_axis((0, kernel_width), name='dw') - output = tvm.compute( + dc = te.reduce_axis((0, channels_in), name='dc') + dw = te.reduce_axis((0, kernel_width), name='dw') + output = te.compute( (batch, channels_out, out_w), - lambda b, c, w: tvm.sum( + lambda b, c, w: te.sum( data[b, dc, w+dw].astype(out_dtype) * kernel[c, dc, dw].astype(out_dtype), axis=[dc, dw]), tag="conv1d_transpose_ncw") diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index a7a75ed0ef0c..4c7941b49692 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -20,6 +20,7 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm +from tvm import te from .pad import pad from .util import get_pad_tuple @@ -36,10 +37,10 @@ def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=N Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - filter : tvm.Tensor + filter : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] strides : int or a list/tuple of two ints @@ -58,7 +59,7 @@ def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=N Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ # search platform specific declaration first @@ -170,10 +171,10 @@ def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints @@ -189,7 +190,7 @@ def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ if out_dtype is None: @@ -220,12 +221,12 @@ def conv2d_nchw(Input, Filter, stride, padding, 
dilation, out_dtype=None): pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(Input, pad_before, pad_after, name="pad_temp") - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - return tvm.compute( + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + return te.compute( (batch, out_channel, out_height, out_width), - lambda nn, ff, yy, xx: tvm.sum( + lambda nn, ff, yy, xx: te.sum( temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[ff, rc, ry, rx].astype(out_dtype), @@ -237,10 +238,10 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [in_height, in_width, in_channel, batch] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints @@ -256,7 +257,7 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [out_height, out_width, out_channel, batch] """ if out_dtype is None: @@ -287,12 +288,12 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None): pad_before = [pad_top, pad_left, 0, 0] pad_after = [pad_down, pad_right, 0, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - Output = tvm.compute( + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + Output = te.compute( (out_height, out_width, out_channel, batch), - lambda yy, xx, ff, nn: tvm.sum( + lambda yy, xx, ff, nn: te.sum( PaddedInput[yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc, nn].astype(out_dtype) * Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), @@ -305,10 +306,10 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints @@ -324,7 +325,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 2 @@ -353,12 +354,12 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - Output = tvm.compute( + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + Output = te.compute( (batch, out_height, out_width, out_channel), - lambda nn, yy, 
xx, ff: tvm.sum( + lambda nn, yy, xx, ff: te.sum( PaddedInput[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), @@ -371,10 +372,10 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block, num_filter_block] @@ -401,7 +402,7 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ @@ -441,27 +442,27 @@ def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, ou else: data_pad = data - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - - return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, - idxdiv(ic, ic_bn), - oh * HSTR + kh * dilation_h, - ow * WSTR + kw * dilation_w, - idxmod(ic, ic_bn)].astype(out_dtype) - * kernel[oc_chunk, - idxdiv(ic, ic_bn), - kh, - kw, - idxmod(ic, ic_bn), - oc_block], - axis=[ic, kh, kw]), - name='conv2d_NCHWc', tag="conv2d_NCHWc") + ic = te.reduce_axis((0, in_channel), name='ic') + kh = te.reduce_axis((0, kernel_height), name='kh') + kw = te.reduce_axis((0, kernel_width), name='kw') + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + + return te.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + te.sum(data_pad[n, + idxdiv(ic, ic_bn), + oh * HSTR + kh * dilation_h, + ow * WSTR + kw * dilation_w, + idxmod(ic, ic_bn)].astype(out_dtype) + * kernel[oc_chunk, + idxdiv(ic, ic_bn), + kh, + kw, + idxmod(ic, ic_bn), + oc_block], + axis=[ic, kh, kw]), + name='conv2d_NCHWc', tag="conv2d_NCHWc") def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layout, @@ -470,10 +471,10 @@ def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layou Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - kernel : tvm.Tensor + kernel : tvm.te.Tensor 7-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block/4, num_filter_block, 4] @@ -500,7 +501,7 @@ def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layou Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ @@ -540,53 +541,53 @@ def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layou else: data_pad = data - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') + ic = te.reduce_axis((0, in_channel), name='ic') + kh = te.reduce_axis((0, kernel_height), name='kh') + kw = te.reduce_axis((0, kernel_width), name='kw') if groups == 1: n_elems = 4 - ic_outer = tvm.reduce_axis((0, in_channel//ic_bn), name='ic_outer') - ic_f_inner = tvm.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner') - ic_s_inner = tvm.reduce_axis((0, 
n_elems), name='ic_s_inner') - return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: - tvm.sum(data_pad[n, - ic_outer, - oh * HSTR + kh * dilation_h, - ow * WSTR + kw * dilation_w, - ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) - * kernel[oc_chunk, - ic_outer, - kh, - kw, - ic_f_inner, - oc_block, - ic_s_inner].astype(out_dtype), - axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), - name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") + ic_outer = te.reduce_axis((0, in_channel//ic_bn), name='ic_outer') + ic_f_inner = te.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner') + ic_s_inner = te.reduce_axis((0, n_elems), name='ic_s_inner') + return te.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block: + te.sum(data_pad[n, + ic_outer, + oh * HSTR + kh * dilation_h, + ow * WSTR + kw * dilation_w, + ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) + * kernel[oc_chunk, + ic_outer, + kh, + kw, + ic_f_inner, + oc_block, + ic_s_inner].astype(out_dtype), + axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), + name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") # for int8 group conv support n_elems = 4 ic_chunk = in_channel//ic_bn - ic_outer = tvm.reduce_axis((0, ic_chunk//groups), name='ic_outer') - ic_f_inner = tvm.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner') - ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner') + ic_outer = te.reduce_axis((0, ic_chunk//groups), name='ic_outer') + ic_f_inner = te.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner') + ic_s_inner = te.reduce_axis((0, n_elems), name='ic_s_inner') oshape = (n, oc_chunk, out_height, out_width, oc_bn) - return tvm.compute(oshape, lambda n, occ, oh, ow, oc_block: - tvm.sum(data_pad[n, - (occ * oc_bn // (oc_chunk * oc_bn // groups)) - * (ic_chunk // groups) + ic_outer, - oh * HSTR + kh, - ow * WSTR + kw, - ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) - * kernel[occ, - ic_outer, - kh, - kw, - ic_f_inner, - oc_block, - ic_s_inner].astype(out_dtype), - axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), - name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") + return te.compute(oshape, lambda n, occ, oh, ow, oc_block: + te.sum(data_pad[n, + (occ * oc_bn // (oc_chunk * oc_bn // groups)) + * (ic_chunk // groups) + ic_outer, + oh * HSTR + kh, + ow * WSTR + kw, + ic_f_inner * n_elems + ic_s_inner].astype(out_dtype) + * kernel[occ, + ic_outer, + kh, + kw, + ic_f_inner, + oc_block, + ic_s_inner].astype(out_dtype), + axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]), + name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8") def conv2d_winograd_weight_transform(kernel, tile_size): @@ -601,7 +602,7 @@ def conv2d_winograd_weight_transform(kernel, tile_size): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [alpha, alpha, CO, CI] """ shape = get_const_tuple(kernel.shape) @@ -613,12 +614,12 @@ def conv2d_winograd_weight_transform(kernel, tile_size): _, _, G = winograd_transform_matrices(tile_size, K, kernel.dtype) - r_kh = tvm.reduce_axis((0, K), name='r_kh') - r_kw = tvm.reduce_axis((0, K), name='r_kw') - return tvm.compute(shape, lambda eps, nu, co, ci: - tvm.sum(kernel[co][ci][r_kh][r_kw] * - G[eps][r_kh] * G[nu][r_kw], - axis=[r_kh, r_kw]), name='transform_weight') + r_kh = te.reduce_axis((0, K), name='r_kh') + r_kw = te.reduce_axis((0, K), name='r_kw') + return te.compute(shape, lambda eps, nu, co, ci: + te.sum(kernel[co][ci][r_kh][r_kw] * + G[eps][r_kh] * G[nu][r_kw], + axis=[r_kh, r_kw]), name='transform_weight') def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, 
out_dtype): @@ -631,7 +632,7 @@ def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_d The convolution algorithm for Winograd NNPACK. Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [alpha, alpha, CO, CI] """ # pylint: disable=import-outside-toplevel @@ -645,10 +646,10 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] stride : int or a list/tuple of two ints @@ -670,7 +671,7 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ if out_dtype is None: @@ -705,12 +706,12 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp pad_before = [0, 0, pad_top, pad_left] pad_after = [0, 0, pad_down, pad_right] temp = pad(Input, pad_before, pad_after, name="pad_temp") - rc = tvm.reduce_axis((0, in_channel // groups), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - return tvm.compute( + rc = te.reduce_axis((0, in_channel // groups), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + return te.compute( (batch, out_channel, out_height, out_width), - lambda nn, ff, yy, xx: tvm.sum( + lambda nn, ff, yy, xx: te.sum( temp[nn, ff // (num_filter//groups) * (in_channel//groups) + rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(out_dtype) * @@ -723,7 +724,7 @@ def unpack_NCHWc_to_nchw(packed_out, out_dtype): Parameters ----------- - packed_out : tvm.Tensor + packed_out : tvm.te.Tensor The output tensor of conv2d_NCHWc. out_dtype : str @@ -731,20 +732,20 @@ def unpack_NCHWc_to_nchw(packed_out, out_dtype): Returns ------- - unpacked_out : tvm.Tensor + unpacked_out : tvm.te.Tensor The unpacked output tensor in NCHW layout. """ n, oc_chunk, oh, ow, oc_bn = get_const_tuple(packed_out.shape) - idxmod = tvm.indexmod - idxdiv = tvm.indexdiv + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv oshape = (n, oc_chunk * oc_bn, oh, ow) unpacked_out = \ - tvm.compute(oshape, - lambda n, c, h, w: - packed_out[n, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)] - .astype(out_dtype), - name='output_unpack', - tag=tag.INJECTIVE+",unpack_nchwc") + te.compute(oshape, + lambda n, c, h, w: + packed_out[n, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)] + .astype(out_dtype), + name='output_unpack', + tag=tag.INJECTIVE+",unpack_nchwc") return unpacked_out diff --git a/topi/python/topi/nn/conv2d_transpose.py b/topi/python/topi/nn/conv2d_transpose.py index db132fc81f13..3563112ed244 100644 --- a/topi/python/topi/nn/conv2d_transpose.py +++ b/topi/python/topi/nn/conv2d_transpose.py @@ -16,8 +16,8 @@ # under the License. 
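The tvm.tir.indexdiv/tvm.tir.indexmod pair used by unpack_NCHWc_to_nchw above is worth a standalone look; this sketch (illustrative shapes and names, assuming the unified namespaces) unpacks a blocked NCHWc tensor back to plain NCHW:

import tvm
from tvm import te

n, oc_chunk, oh, ow, oc_bn = 1, 4, 14, 14, 16
packed = te.placeholder((n, oc_chunk, oh, ow, oc_bn), name="packed_out")

idxdiv = tvm.tir.indexdiv   # replaces tvm.indexdiv
idxmod = tvm.tir.indexmod   # replaces tvm.indexmod
unpacked = te.compute(
    (n, oc_chunk * oc_bn, oh, ow),
    lambda b, c, h, w: packed[b, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)],
    name="output_unpack")
s = te.create_schedule(unpacked.op)
print(tvm.lower(s, [packed, unpacked], simple_mode=True))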
# pylint: disable=invalid-name, unused-variable, unused-argument """Transposed 2D convolution operators (sometimes called Deconvolution).""" -from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import relay from .dilate import dilate from .pad import pad @@ -30,10 +30,10 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [in_channel, num_filter, filter_height, filter_width] strides : tuple of two ints @@ -47,7 +47,7 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ return declaration_conv2d_transpose_impl(Input, Filter, strides, padding, out_dtype) @@ -72,9 +72,9 @@ def conv2d_transpose_nchw_preprocess(data, kernel, strides, padding, out_dtype): [0, 0, bpad_bottom, bpad_right], \ name='data_pad') # transform kernel layout from IOHW to OIHW, and rotate kernel by 180 degrees - kernel_transform = tvm.compute((out_c, in_c, filter_h, filter_w), \ - lambda o, i, h, w: kernel[i][o][filter_h-1-h][filter_w-1-w], \ - name='kernel_transform') + kernel_transform = te.compute((out_c, in_c, filter_h, filter_w), \ + lambda o, i, h, w: kernel[i][o][filter_h-1-h][filter_w-1-w], \ + name='kernel_transform') return data_pad, kernel_transform @@ -90,13 +90,13 @@ def declaration_conv2d_transpose_impl(data, kernel, strides, padding, out_dtype) out_c = simplify(out_c) out_h = simplify(in_h - filter_h + 1) out_w = simplify(in_w - filter_w + 1) - dc = tvm.reduce_axis((0, in_c), name='dc') - dh = tvm.reduce_axis((0, filter_h), name='dh') - dw = tvm.reduce_axis((0, filter_w), name='dw') + dc = te.reduce_axis((0, in_c), name='dc') + dh = te.reduce_axis((0, filter_h), name='dh') + dw = te.reduce_axis((0, filter_w), name='dw') - Output = tvm.compute( + Output = te.compute( (batch, out_c, out_h, out_w), - lambda b, c, h, w: tvm.sum( + lambda b, c, h, w: te.sum( data_pad[b, dc, h+dh, w+dw].astype(out_dtype) * kernel_transform[c, dc, dh, dw].astype(out_dtype), axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") diff --git a/topi/python/topi/nn/conv3d.py b/topi/python/topi/nn/conv3d.py index 88c7c6a3ed90..d6bd6424a947 100644 --- a/topi/python/topi/nn/conv3d.py +++ b/topi/python/topi/nn/conv3d.py @@ -17,8 +17,7 @@ # pylint: disable=invalid-name, unused-variable, too-many-locals # pylint: disable=unused-argument, redefined-builtin, no-else-return """Conv3D operators""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from .pad import pad from .util import get_pad_tuple3d @@ -30,10 +29,10 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 5-D with shape [batch, in_channel, in_depth, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 5-D with shape [num_filter, in_channel, filter_depth, filter_height, filter_width] stride : int or a list/tuple of three ints @@ -47,7 +46,7 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_depth, out_height, out_width] """ if out_dtype is None: @@ -80,14 +79,14 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, out_dtype=None): 
pad_before = [0, 0, pad_front, pad_top, pad_left] pad_after = [0, 0, pad_back, pad_down, pad_right] temp = pad(Input, pad_before, pad_after, name="pad_temp") - rc = tvm.reduce_axis((0, in_channel), name='rc') - rz = tvm.reduce_axis((0, kernel_d), name='rz') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') + rc = te.reduce_axis((0, in_channel), name='rc') + rz = te.reduce_axis((0, kernel_d), name='rz') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') - return tvm.compute( + return te.compute( (batch, out_channel, out_depth, out_height, out_width), - lambda nn, ff, zz, yy, xx: tvm.sum( + lambda nn, ff, zz, yy, xx: te.sum( temp[nn, rc, zz * stride_d + rz * dilation_d, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(out_dtype) * Filter[ff, rc, rz, ry, rx].astype(out_dtype), @@ -99,10 +98,10 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 5-D with shape [batch, in_depth, in_height, in_width, in_channel] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 5-D with shape [filter_depth, filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of three ints @@ -116,7 +115,7 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 5-D with shape [batch, out_depth, out_height, out_width, out_channel] """ assert isinstance(stride, int) or len(stride) == 3 @@ -148,13 +147,13 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): pad_before = [0, pad_front, pad_top, pad_left, 0] pad_after = [0, pad_back, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") - rd = tvm.reduce_axis((0, kernel_d), name='rd') - rh = tvm.reduce_axis((0, kernel_h), name='rh') - rw = tvm.reduce_axis((0, kernel_w), name='rw') - rc = tvm.reduce_axis((0, in_channel), name='rc') - Output = tvm.compute( + rd = te.reduce_axis((0, kernel_d), name='rd') + rh = te.reduce_axis((0, kernel_h), name='rh') + rw = te.reduce_axis((0, kernel_w), name='rw') + rc = te.reduce_axis((0, in_channel), name='rc') + Output = te.compute( (batch, out_depth, out_height, out_width, out_channel), - lambda nn, dd, hh, ww, cc: tvm.sum( + lambda nn, dd, hh, ww, cc: te.sum( PaddedInput[nn, dd * stride_d + rd * dilation_d, hh * stride_h + rh * dilation_h, ww * stride_w + rw * dilation_w, rc].astype(out_dtype) * Filter[rd, rh, rw, rc, cc].astype(out_dtype), axis=[rd, rh, rw, rc]), diff --git a/topi/python/topi/nn/deformable_conv2d.py b/topi/python/topi/nn/deformable_conv2d.py index 251f68aa8c25..9f95fd1ae790 100644 --- a/topi/python/topi/nn/deformable_conv2d.py +++ b/topi/python/topi/nn/deformable_conv2d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, too-many-locals, too-many-arguments """Deformable Conv2D operators""" import tvm +from tvm import te from .util import get_pad_tuple from ..util import get_const_tuple @@ -30,14 +31,14 @@ def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, def Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - offset : tvm.Tensor + offset : tvm.te.Tensor 4-D with shape [batch, deformable_groups * filter_height * filter_width * 2, out_height, out_width]. 
- kernel : tvm.Tensor + kernel : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] strides : int or a list/tuple of two ints @@ -57,7 +58,7 @@ def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, def Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ if out_dtype is None: @@ -85,30 +86,30 @@ def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation, def dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, _, _ = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') - zero = tvm.const(0.0, data.dtype) + zero = tvm.tir.const(0.0, data.dtype) def _bilinear(n, c, h, w): - outside = tvm.any(h < 0, w < 0, h >= in_height, w >= in_width) + outside = tvm.tir.any(h < 0, w < 0, h >= in_height, w >= in_width) val = bilinear_sample_nchw(data, (n, c, h, w), in_height - 1, in_width - 1) - return tvm.if_then_else(outside, zero, val) + return tvm.tir.if_then_else(outside, zero, val) data_deform = \ - tvm.compute((batch, in_channel, kernel_h, kernel_w, out_height, out_width), - lambda n, c, kh, kw, y, x: - _bilinear(n, c, - y * stride_h - pad_top + kh * dilation_h + - offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) + - (kh * kernel_w + kw) * 2, y, x], - x * stride_w - pad_left + kw * dilation_w + - offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) + - (kh * kernel_w + kw) * 2 + 1, y, x])) - return tvm.compute( + te.compute((batch, in_channel, kernel_h, kernel_w, out_height, out_width), + lambda n, c, kh, kw, y, x: + _bilinear(n, c, + y * stride_h - pad_top + kh * dilation_h + + offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) + + (kh * kernel_w + kw) * 2, y, x], + x * stride_w - pad_left + kw * dilation_w + + offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) + + (kh * kernel_w + kw) * 2 + 1, y, x])) + return te.compute( (batch, out_channel, out_height, out_width), - lambda n, f, y, x: tvm.sum( + lambda n, f, y, x: te.sum( data_deform[n, rc, ry, rx, y, x].astype(out_dtype) * kernel[f, rc, ry, rx].astype(out_dtype), axis=[rc, ry, rx]), tag="deformable_conv2d_nchw") diff --git a/topi/python/topi/nn/dense.py b/topi/python/topi/nn/dense.py index fe21e7417bda..7d7ef6c23d56 100644 --- a/topi/python/topi/nn/dense.py +++ b/topi/python/topi/nn/dense.py @@ -15,8 +15,7 @@ # specific language governing permissions and limitations # under the License. """TVM operator fully connected compute.""" -from __future__ import absolute_import -import tvm +from tvm import te from .. 
import tag def dense(data, weight, bias=None, out_dtype=None): @@ -24,13 +23,13 @@ def dense(data, weight, bias=None, out_dtype=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [out_dim] out_dtype : str @@ -38,7 +37,7 @@ def dense(data, weight, bias=None, out_dtype=None): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ assert len(data.shape) == 2 and len(weight.shape) == 2, \ @@ -49,13 +48,13 @@ def dense(data, weight, bias=None, out_dtype=None): out_dtype = data.dtype batch, in_dim = data.shape out_dim, _ = weight.shape - k = tvm.reduce_axis((0, in_dim), name='k') - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: tvm.sum(data[i, k].astype(out_dtype) * \ - weight[j, k].astype(out_dtype), axis=k), \ - name='T_dense', tag='dense') + k = te.reduce_axis((0, in_dim), name='k') + matmul = te.compute((batch, out_dim), \ + lambda i, j: te.sum(data[i, k].astype(out_dtype) * \ + weight[j, k].astype(out_dtype), axis=k), \ + name='T_dense', tag='dense') if bias is not None: - matmul = tvm.compute((batch, out_dim), \ - lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ - tag=tag.BROADCAST) + matmul = te.compute((batch, out_dim), \ + lambda i, j: matmul[i, j] + bias[j].astype(out_dtype), \ + tag=tag.BROADCAST) return matmul diff --git a/topi/python/topi/nn/depth_to_space.py b/topi/python/topi/nn/depth_to_space.py index d847c08daf27..a9fbfea186cb 100644 --- a/topi/python/topi/nn/depth_to_space.py +++ b/topi/python/topi/nn/depth_to_space.py @@ -18,6 +18,7 @@ """TVM operator depth_to_space compute.""" from __future__ import absolute_import import tvm +from tvm import te from .. import tag @@ -26,7 +27,7 @@ def depth_to_space(data, block_size, layout='NCHW', mode='DCR'): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D tensor in either NCHW or NHWC layout. 
block_size : int @@ -42,17 +43,17 @@ def depth_to_space(data, block_size, layout='NCHW', mode='DCR'): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor Output of shape [N, C / block_size**2, H * block_size, W * block_size] """ if layout == 'NCHW': in_n, in_c, in_h, in_w = data.shape - channel_factor = tvm.truncdiv(in_c, (block_size * block_size)) + channel_factor = tvm.tir.truncdiv(in_c, (block_size * block_size)) output_shape = [in_n, channel_factor, in_h * block_size, in_w * block_size] elif layout == 'NHWC': in_n, in_h, in_w, in_c = data.shape - channel_factor = tvm.truncdiv(in_c, (block_size * block_size)) + channel_factor = tvm.tir.truncdiv(in_c, (block_size * block_size)) output_shape = [in_n, in_h * block_size, in_w * block_size, channel_factor] else: @@ -66,10 +67,10 @@ def _get_indices(*indices): return n, c, y, x def _get_pixel(n, c, y, x): - block_x = tvm.truncdiv(x, block_size) - block_y = tvm.truncdiv(y, block_size) - idx_x = tvm.truncmod(x, block_size) - idx_y = tvm.truncmod(y, block_size) + block_x = tvm.tir.truncdiv(x, block_size) + block_y = tvm.tir.truncdiv(y, block_size) + idx_x = tvm.tir.truncmod(x, block_size) + idx_y = tvm.tir.truncmod(y, block_size) if mode == "DCR": channel_idx = channel_factor * ((block_size * idx_y) + idx_x) + c else: @@ -85,4 +86,4 @@ def _compute(*indices): n, c, y, x = _get_indices(*indices) return _get_pixel(n, c, y, x) - return tvm.compute(output_shape, _compute, name='depth_to_space', tag=tag.INJECTIVE) + return te.compute(output_shape, _compute, name='depth_to_space', tag=tag.INJECTIVE) diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py index 49aaace0f833..32a92585c8cb 100644 --- a/topi/python/topi/nn/depthwise_conv2d.py +++ b/topi/python/topi/nn/depthwise_conv2d.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm +from tvm import te from .dilate import dilate from .pad import pad @@ -52,10 +53,10 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [in_channel, channel_multiplier, filter_height, filter_width] stride : tuple of two ints @@ -72,7 +73,7 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ out_dtype = Input.dtype if out_dtype is None else out_dtype @@ -104,13 +105,13 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=No pad_after = [0, 0, pad_down, pad_right] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") # depthconv stage - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - di = tvm.reduce_axis((0, filter_height), name='di') - dj = tvm.reduce_axis((0, filter_width), name='dj') - Output = tvm.compute( + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + di = te.reduce_axis((0, filter_height), name='di') + dj = te.reduce_axis((0, filter_width), name='dj') + Output = te.compute( (batch, out_channel, out_height, out_width), - lambda b, c, i, j: tvm.sum( + lambda b, c, i, j: te.sum( (PaddedInput[b, idxdiv(c, channel_multiplier), i*stride_h+di*dilation_h, j*stride_w+dj*dilation_w].astype(out_dtype) * Filter[idxdiv(c, channel_multiplier), @@ -125,10 +126,10 @@ def depthwise_conv2d_nhwc(Input, 
Filter, stride, padding, dilation, out_dtype=No Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, channel_multiplier] stride : tuple of two ints @@ -145,7 +146,7 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=No Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ out_dtype = Input.dtype if out_dtype is None else out_dtype @@ -177,14 +178,14 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=No pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") # depthconv stage - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod - di = tvm.reduce_axis((0, filter_height), name='di') - dj = tvm.reduce_axis((0, filter_width), name='dj') - Output = tvm.compute( + di = te.reduce_axis((0, filter_height), name='di') + dj = te.reduce_axis((0, filter_width), name='dj') + Output = te.compute( (batch, out_height, out_width, out_channel), - lambda b, i, j, c: tvm.sum( + lambda b, i, j, c: te.sum( (PaddedInput[b, i*stride_h + di*dilation_h, j*stride_w + dj*dilation_w, idxdiv(c, channel_multiplier)].astype(out_dtype) * Filter[di, dj, @@ -199,10 +200,10 @@ def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, strid Parameters ---------- - Filter : tvm.Tensor + Filter : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, channel_multiplier] - Out_grad : tvm.Tensor + Out_grad : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] stride : tuple of two ints @@ -213,7 +214,7 @@ def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, strid Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] """ batch, in_h, in_w, in_c = ishape @@ -235,19 +236,19 @@ def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, strid bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1) padded_out_grad = pad(dilated_out_grad, \ - [0, bpad_top, bpad_left, 0], \ - [0, bpad_bottom, bpad_right, 0], \ - name='padded_out_grad') + [0, bpad_top, bpad_left, 0], \ + [0, bpad_bottom, bpad_right, 0], \ + name='padded_out_grad') - dh = tvm.reduce_axis((0, filter_h), name='dh') - dw = tvm.reduce_axis((0, filter_w), name='dw') - dc = tvm.reduce_axis((0, channel_multiplier), name='dc') + dh = te.reduce_axis((0, filter_h), name='dh') + dw = te.reduce_axis((0, filter_w), name='dw') + dc = te.reduce_axis((0, channel_multiplier), name='dc') - In_grad = tvm.compute( + In_grad = te.compute( (batch, in_h, in_w, in_c), - lambda b, h, w, c: tvm.sum(padded_out_grad[b, h+dh, w+dw, c*channel_multiplier + dc] * \ - Filter[filter_h-1-dh, filter_w-1-dw, c, dc], - axis=[dh, dw, dc]), tag='depthwise_conv2d_backward_input_nhwc') + lambda b, h, w, c: te.sum(padded_out_grad[b, h+dh, w+dw, c*channel_multiplier + dc] * \ + Filter[filter_h-1-dh, filter_w-1-dw, c, dc], + axis=[dh, dw, dc]), tag='depthwise_conv2d_backward_input_nhwc') return In_grad @@ -257,10 +258,10 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] - Out_grad : tvm.Tensor + Out_grad : 
tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] stride : tuple of two ints @@ -271,7 +272,7 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, channel_multiplier] """ batch, out_h, out_w, out_c = oshape @@ -285,19 +286,19 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (filter_h, filter_w)) padded_in = pad(Input, \ - [0, pad_top, pad_left, 0], \ - [0, pad_bottom, pad_right, 0], \ - name='padded_in') + [0, pad_top, pad_left, 0], \ + [0, pad_bottom, pad_right, 0], \ + name='padded_in') - dh = tvm.reduce_axis((0, Out_grad.shape[1].value), name='dh') - dw = tvm.reduce_axis((0, Out_grad.shape[2].value), name='dw') - db = tvm.reduce_axis((0, batch), name='db') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + dh = te.reduce_axis((0, Out_grad.shape[1].value), name='dh') + dw = te.reduce_axis((0, Out_grad.shape[2].value), name='dw') + db = te.reduce_axis((0, batch), name='db') + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod - Weight_grad = tvm.compute( + Weight_grad = te.compute( (filter_h, filter_w, in_c, channel_multiplier), - lambda fh, fw, c, m: tvm.sum( + lambda fh, fw, c, m: te.sum( Out_grad[db, dh, dw, c*channel_multiplier+idxmod(m, channel_multiplier)] * padded_in[db, fh+dh*stride_h, fw+dw*stride_w, c], axis=[db, dh, dw]), tag='depthwise_conv2d_backward_weight_nhwc') @@ -311,10 +312,10 @@ def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation, Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] - Filter : tvm.Tensor + Filter : tvm.te.Tensor 6-D with shape [out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block] In NCHWc depthwise convolution, we group kernel's in_channel and channel_multiplier together then do the tiling. @@ -339,7 +340,7 @@ def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation, Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc") diff --git a/topi/python/topi/nn/dilate.py b/topi/python/topi/nn/dilate.py index d95245395a11..f628fadee96e 100644 --- a/topi/python/topi/nn/dilate.py +++ b/topi/python/topi/nn/dilate.py @@ -16,18 +16,18 @@ # under the License. # pylint: disable=invalid-name """Dilation operators""" -from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import util from .. import tag -@tvm.tag_scope(tag=tag.INJECTIVE+",dilate") +@te.tag_scope(tag=tag.INJECTIVE+",dilate") def dilate(data, strides, name="DilatedInput"): """Dilate data with zeros. Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D, can be any layout. strides : list / tuple of n ints @@ -38,7 +38,7 @@ def dilate(data, strides, name="DilatedInput"): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor n-D, the same layout as data. 
""" n = len(data.shape) @@ -47,13 +47,13 @@ def dilate(data, strides, name="DilatedInput"): n, len(strides))) out_shape = tuple( - tvm.ir_pass.Simplify((data.shape[i] - 1) * strides[i] + 1) for i in range(n)) + tvm.tir.ir_pass.Simplify((data.shape[i] - 1) * strides[i] + 1) for i in range(n)) def _dilate(*indices): not_zero = [] index_tuple = [] - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod for i in range(n): if not util.equal_const_int(strides[i], 1): index_tuple.append(idxdiv(indices[i], strides[i])) @@ -61,8 +61,9 @@ def _dilate(*indices): else: index_tuple.append(indices[i]) if not_zero: - not_zero = tvm.all(*not_zero) - return tvm.if_then_else(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype)) + not_zero = tvm.tir.all(*not_zero) + return tvm.tir.if_then_else( + not_zero, data(*index_tuple), tvm.tir.const(0.0, data.dtype)) return data(*index_tuple) - return tvm.compute(out_shape, _dilate, name=name) + return te.compute(out_shape, _dilate, name=name) diff --git a/topi/python/topi/nn/elemwise.py b/topi/python/topi/nn/elemwise.py index e9f301942aa3..1315a48cc0ef 100644 --- a/topi/python/topi/nn/elemwise.py +++ b/topi/python/topi/nn/elemwise.py @@ -17,33 +17,34 @@ """Elementwise operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import tag from ..util import get_const_int -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def relu(x): """Take relu of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ - return tvm.compute(x.shape, lambda *i: tvm.max(x(*i), tvm.const(0, x.dtype))) + return te.compute(x.shape, lambda *i: tvm.te.max(x(*i), tvm.tir.const(0, x.dtype))) -@tvm.tag_scope(tag=tag.ELEMWISE) +@tvm.te.tag_scope(tag=tag.ELEMWISE) def leaky_relu(x, alpha): """Take leaky relu of input x. Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. alpha : float @@ -51,16 +52,16 @@ def leaky_relu(x, alpha): Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ def _compute(*indices): value = x(*indices) - calpha = tvm.const(alpha, value.dtype) - return tvm.expr.Select(value > 0, value, value * calpha) - return tvm.compute(x.shape, _compute) + calpha = tvm.tir.const(alpha, value.dtype) + return tvm.tir.Select(value > 0, value, value * calpha) + return te.compute(x.shape, _compute) -@tvm.tag_scope(tag=tag.BROADCAST) +@tvm.te.tag_scope(tag=tag.BROADCAST) def prelu(x, slope, axis=1): """ PReLU. It accepts two arguments: an input ``x`` and a weight array ``W`` @@ -68,17 +69,17 @@ def prelu(x, slope, axis=1): where :math:`*` is an elementwise multiplication for each sample in the batch. Arguments: - x : tvm.Tensor + x : tvm.te.Tensor Input argument. - slope : tvm.Tensor + slope : tvm.te.Tensor Channelised slope tensor for prelu axis : int The axis where the channel data needs to be applied Returns: - y : tvm.Tensor + y : tvm.te.Tensor The result. 
Links: @@ -91,5 +92,5 @@ def prelu(x, slope, axis=1): def _compute_channelwise(*indices): xval = x(*indices) - return tvm.expr.Select(xval > 0, xval, xval * slope(indices[axis])) - return tvm.compute(x.shape, _compute_channelwise) + return tvm.tir.Select(xval > 0, xval, xval * slope(indices[axis])) + return te.compute(x.shape, _compute_channelwise) diff --git a/topi/python/topi/nn/fifo_buffer.py b/topi/python/topi/nn/fifo_buffer.py index 946b8d1e3180..de283e0de4eb 100644 --- a/topi/python/topi/nn/fifo_buffer.py +++ b/topi/python/topi/nn/fifo_buffer.py @@ -18,10 +18,11 @@ """FIFO buffer op""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import tag from ..transform import concatenate, strided_slice -@tvm.tag_scope(tag=tag.INJECTIVE+",fifo_buffer") +@tvm.te.tag_scope(tag=tag.INJECTIVE+",fifo_buffer") def fifo_buffer(data, buffer, axis): """ FIFO buffer to enable computation reuse in CNNs with sliding indow input @@ -42,16 +43,16 @@ def fifo_buffer(data, buffer, axis): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input data - buffer : tvm.Tensor + buffer : tvm.te.Tensor Previous value of the FIFO buffer axis : int Specify which axis should be used for buffering Returns ------- - result : tvm.Tensor + result : tvm.te.Tensor Updated value for the buffer """ assert len(data.shape) == len(buffer.shape), \ @@ -70,80 +71,80 @@ def fifo_buffer(data, buffer, axis): # Explicitly write out formula up to 4D, and then use concat+slice combo for 5D and higher if len(buffer.shape) == 1: - return tvm.compute(buffer.shape, - lambda i: - tvm.if_then_else(i < buflen - data_size, - buffer[i + data_size], - data[i - buflen + data_size]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i: + tvm.tir.if_then_else(i < buflen - data_size, + buffer[i + data_size], + data[i - buflen + data_size]), + name='new_buffer') if len(buffer.shape) == 2: if axis == 0: - return tvm.compute(buffer.shape, - lambda i, j: - tvm.if_then_else(i < buflen - data_size, - buffer[i + data_size, j], - data[i - buflen + data_size, j]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j: + tvm.tir.if_then_else(i < buflen - data_size, + buffer[i + data_size, j], + data[i - buflen + data_size, j]), + name='new_buffer') if axis == 1: - return tvm.compute(buffer.shape, - lambda i, j: - tvm.if_then_else(j < buflen - data_size, - buffer[i, j + data_size], - data[i, j - buflen + data_size]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j: + tvm.tir.if_then_else(j < buflen - data_size, + buffer[i, j + data_size], + data[i, j - buflen + data_size]), + name='new_buffer') assert False, 'Invalid value for axis; it should be at most {}'.format(len(buffer.shape)) elif len(buffer.shape) == 3: if axis == 0: - return tvm.compute(buffer.shape, - lambda i, j, k: - tvm.if_then_else(i < buflen - data_size, - buffer[i + data_size, j, k], - data[i - buflen + data_size, j, k]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k: + tvm.tir.if_then_else(i < buflen - data_size, + buffer[i + data_size, j, k], + data[i - buflen + data_size, j, k]), + name='new_buffer') if axis == 1: - return tvm.compute(buffer.shape, - lambda i, j, k: - tvm.if_then_else(j < buflen - data_size, - buffer[i, j + data_size, k], - data[i, j - buflen + data_size, k]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k: + tvm.tir.if_then_else(j < buflen - data_size, + buffer[i, j + data_size, k], + data[i, j - buflen + data_size, k]), + 
name='new_buffer') if axis == 2: - return tvm.compute(buffer.shape, - lambda i, j, k: - tvm.if_then_else(k < buflen - data_size, - buffer[i, j, k + data_size], - data[i, j, k - buflen + data_size]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k: + tvm.tir.if_then_else(k < buflen - data_size, + buffer[i, j, k + data_size], + data[i, j, k - buflen + data_size]), + name='new_buffer') assert False, 'Invalid value for axis; it should be at most {}'.format(len(buffer.shape)) elif len(buffer.shape) == 4: if axis == 0: - return tvm.compute(buffer.shape, - lambda i, j, k, l: - tvm.if_then_else(i < buflen - data_size, - buffer[i + data_size, j, k, l], - data[i - buflen + data_size, j, k, l]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k, l: + tvm.tir.if_then_else(i < buflen - data_size, + buffer[i + data_size, j, k, l], + data[i - buflen + data_size, j, k, l]), + name='new_buffer') if axis == 1: - return tvm.compute(buffer.shape, - lambda i, j, k, l: - tvm.if_then_else(j < buflen - data_size, - buffer[i, j + data_size, k, l], - data[i, j - buflen + data_size, k, l]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k, l: + tvm.tir.if_then_else(j < buflen - data_size, + buffer[i, j + data_size, k, l], + data[i, j - buflen + data_size, k, l]), + name='new_buffer') if axis == 2: - return tvm.compute(buffer.shape, - lambda i, j, k, l: - tvm.if_then_else(k < buflen - data_size, - buffer[i, j, k + data_size, l], - data[i, j, k - buflen + data_size, l]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k, l: + tvm.tir.if_then_else(k < buflen - data_size, + buffer[i, j, k + data_size, l], + data[i, j, k - buflen + data_size, l]), + name='new_buffer') if axis == 3: - return tvm.compute(buffer.shape, - lambda i, j, k, l: - tvm.if_then_else(l < buflen - data_size, - buffer[i, j, k, l + data_size], - data[i, j, k, l - buflen + data_size]), - name='new_buffer') + return te.compute(buffer.shape, + lambda i, j, k, l: + tvm.tir.if_then_else(l < buflen - data_size, + buffer[i, j, k, l + data_size], + data[i, j, k, l - buflen + data_size]), + name='new_buffer') assert False, 'Invalid value for axis; it should be at most {}'.format(len(buffer.shape)) else: # Implement FIFO buffer as combination of concat and slice diff --git a/topi/python/topi/nn/flatten.py b/topi/python/topi/nn/flatten.py index dba9b7cd6005..11fe0d854cb2 100644 --- a/topi/python/topi/nn/flatten.py +++ b/topi/python/topi/nn/flatten.py @@ -17,20 +17,21 @@ """TVM operator flatten compute.""" from __future__ import absolute_import import tvm +from tvm import te from .. import tag -@tvm.tag_scope(tag=tag.INJECTIVE) +@tvm.te.tag_scope(tag=tag.INJECTIVE) def flatten(data): """Flattens the input array into a 2-D array by collapsing the higher dimensions. Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor Input array. Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D array with collapsed higher dimensions. 
""" ishape = data.shape @@ -38,8 +39,8 @@ def flatten(data): for i in range(1, len(ishape)): dim = dim * ishape[i] oshape = [ishape[0], dim] - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod def unwrap(idx, shape): index = [] @@ -48,4 +49,4 @@ def unwrap(idx, shape): idx = idxdiv(idx, s) return list(reversed(index)) - return tvm.compute(oshape, lambda i, j: data(i, *unwrap(j, ishape[1:]))) + return te.compute(oshape, lambda i, j: data(i, *unwrap(j, ishape[1:]))) diff --git a/topi/python/topi/nn/local_response_norm.py b/topi/python/topi/nn/local_response_norm.py index 1b41c7dbfb5e..35c76d2b0a6a 100644 --- a/topi/python/topi/nn/local_response_norm.py +++ b/topi/python/topi/nn/local_response_norm.py @@ -31,7 +31,7 @@ def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, channel, height, width] size : int @@ -52,7 +52,7 @@ def lrn(data, size, axis=1, alpha=0.0001, beta=0.75, bias=2): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D output with same shape """ return cpp.nn.lrn(data, size, axis, alpha, beta, bias) diff --git a/topi/python/topi/nn/mapping.py b/topi/python/topi/nn/mapping.py index b2222bdeb87d..12558a8c33a2 100644 --- a/topi/python/topi/nn/mapping.py +++ b/topi/python/topi/nn/mapping.py @@ -18,49 +18,50 @@ """Operators of one-to-one-mapping on the first input""" from __future__ import absolute_import as _abs import tvm +from tvm import te from .. import tag -@tvm.tag_scope(tag=tag.BROADCAST) +@tvm.te.tag_scope(tag=tag.BROADCAST) def scale_shift_nchw(Input, Scale, Shift): """Batch normalization operator in inference. Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor Input tensor, layout is NCHW - Scale : tvm.Tensor + Scale : tvm.te.Tensor Scale tensor, 1-D of size channel number - Shift : tvm.Tensor + Shift : tvm.te.Tensor Shift tensor, 1-D of size channel number Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor Output tensor, layout is NCHW """ - return tvm.compute(Input.shape, lambda b, c, i, j: Input[b, c, i, j] * Scale[c] + Shift[c], name='ScaleShift') + return te.compute(Input.shape, lambda b, c, i, j: Input[b, c, i, j] * Scale[c] + Shift[c], name='ScaleShift') -@tvm.tag_scope(tag=tag.BROADCAST) +@tvm.te.tag_scope(tag=tag.BROADCAST) def scale_shift_nhwc(Input, Scale, Shift): """Batch normalization operator in inference. Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor Input tensor, layout is NHWC - Scale : tvm.Tensor + Scale : tvm.te.Tensor Scale tensor, 1-D of size channel number - Shift : tvm.Tensor + Shift : tvm.te.Tensor Shift tensor, 1-D of size channel number Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor Output tensor, layout is NHWC """ - return tvm.compute(Input.shape, lambda b, i, j, c: Input[b, i, j, c] * Scale[c] + Shift[c], name='ScaleShift') + return te.compute(Input.shape, lambda b, i, j, c: Input[b, i, j, c] * Scale[c] + Shift[c], name='ScaleShift') diff --git a/topi/python/topi/nn/pad.py b/topi/python/topi/nn/pad.py index 13f8e720288b..8fe53374f2b5 100644 --- a/topi/python/topi/nn/pad.py +++ b/topi/python/topi/nn/pad.py @@ -17,16 +17,17 @@ """Pad the data by constant value """ from __future__ import absolute_import as _abs import tvm +from tvm import te from ..util import equal_const_int from .. 
import tag -@tvm.tag_scope(tag=tag.INJECTIVE+",pad") +@tvm.te.tag_scope(tag=tag.INJECTIVE+",pad") def pad(data, pad_before, pad_after=None, pad_value=0.0, name="PadInput"): """Pad Input with zeros. Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D input, can be any layout. pad_before : list / tuple of n ints @@ -43,7 +44,7 @@ def pad(data, pad_before, pad_after=None, pad_value=0.0, name="PadInput"): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor n-D, the same layout as Input. """ n = len(data.shape) @@ -55,10 +56,10 @@ def pad(data, pad_before, pad_after=None, pad_value=0.0, name="PadInput"): raise ValueError("Input dimension and pad_after dismatch : %d vs %d" % ( n, len(pad_before))) out_shape = tuple( - tvm.ir_pass.Simplify( + tvm.tir.ir_pass.Simplify( (data.shape[i] + pad_before[i] + pad_after[i])) for i in range(n)) - pad_value = (pad_value if isinstance(pad_value, tvm.expr.PrimExpr) - else tvm.const(pad_value, data.dtype)) + pad_value = (pad_value if isinstance(pad_value, tvm.tir.PrimExpr) + else tvm.tir.const(pad_value, data.dtype)) def _pad(*indices): not_zero = [] index_tuple = [] @@ -70,13 +71,13 @@ def _pad(*indices): not_zero.append(indices[i] >= pad_before[i]) not_zero.append(indices[i] < data.shape[i] + pad_before[i]) if not_zero: - not_zero = tvm.all(*not_zero) - return tvm.if_then_else(not_zero, data(*index_tuple), pad_value) + not_zero = tvm.tir.all(*not_zero) + return tvm.tir.if_then_else(not_zero, data(*index_tuple), pad_value) return data(*index_tuple) - return tvm.compute(out_shape, _pad, name=name) + return te.compute(out_shape, _pad, name=name) -@tvm.tag_scope(tag=tag.INJECTIVE + ",pad") +@tvm.te.tag_scope(tag=tag.INJECTIVE + ",pad") def mirror_pad(data, pad_before, pad_after=None, @@ -86,7 +87,7 @@ def mirror_pad(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D input, can be any layout. pad_before : list / tuple of n ints @@ -103,7 +104,7 @@ def mirror_pad(data, Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor n-D, the same layout as Input. 
""" n = len(data.shape) @@ -115,7 +116,7 @@ def mirror_pad(data, raise ValueError("Input dimension and pad_after dismatch : %d vs %d" % (n, len(pad_before))) out_shape = tuple( - tvm.ir_pass.Simplify((data.shape[i] + pad_before[i] + pad_after[i])) + tvm.tir.ir_pass.Simplify((data.shape[i] + pad_before[i] + pad_after[i])) for i in range(n)) assert mode in ('SYMMETRIC', 'REFLECT') mode = int(mode == 'SYMMETRIC') @@ -136,10 +137,10 @@ def _pad(*indices): below.append(indices[i] < pad_before[i]) mapped_tuple = [] for i, axis in enumerate(index_tuple): - mapped_axis = tvm.if_then_else(below[i], -axis - mode, axis) - mapped_axis = tvm.if_then_else( + mapped_axis = tvm.tir.if_then_else(below[i], -axis - mode, axis) + mapped_axis = tvm.tir.if_then_else( above[i], (2 * (data.shape[i] - 1)) - axis + mode, mapped_axis) mapped_tuple.append(mapped_axis) return data(*mapped_tuple) - return tvm.compute(out_shape, _pad, name=name) + return te.compute(out_shape, _pad, name=name) diff --git a/topi/python/topi/nn/pooling.py b/topi/python/topi/nn/pooling.py index 5fd2dedf9619..e3d57ce8ed78 100644 --- a/topi/python/topi/nn/pooling.py +++ b/topi/python/topi/nn/pooling.py @@ -34,7 +34,7 @@ def global_pool(data, pool_type, layout="NCHW"): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D with shape of layout pool_type : str @@ -51,7 +51,7 @@ def global_pool(data, pool_type, layout="NCHW"): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor n-D in same layout with height and width dimension size of 1. e.g., for NCHW, the output shape will be [batch, channel, 1, 1] """ @@ -76,7 +76,7 @@ def pool(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D with shape of layout kernel : list/tuple of two ints @@ -108,7 +108,7 @@ def pool(data, Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor n-D in the same layout """ return cpp.nn.pool(data, kernel, stride, padding, @@ -133,10 +133,10 @@ def pool_grad(grads, Parameters ---------- - grads : tvm.Tensor + grads : tvm.te.Tensor n-D with shape of layout - data : tvm.Tensor + data : tvm.te.Tensor n-D with shape of layout kernel : list/tuple of two ints @@ -168,7 +168,7 @@ def pool_grad(grads, Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor n-D in the same layout """ return cpp.nn.pool_grad(grads, data, kernel, @@ -192,7 +192,7 @@ def adaptive_pool(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D with shape of layout output_size : tuple of int @@ -212,7 +212,7 @@ def adaptive_pool(data, Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor n-D in the same layout """ return cpp.nn.adaptive_pool(data, output_size, POOL_TYPE_CODE[pool_type], layout) @@ -236,7 +236,7 @@ def pool1d(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D with shape of layout kernel : list/tuple of one int or int @@ -268,7 +268,7 @@ def pool1d(data, Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor n-D in the same layout """ if isinstance(kernel, int): @@ -297,7 +297,7 @@ def pool3d(data, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor n-D with shape of layout kernel : list/tuple of three ints @@ -329,7 +329,7 @@ def pool3d(data, Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor n-D in the same layout """ return cpp.nn.pool3d(data, kernel, stride, padding, diff --git a/topi/python/topi/nn/softmax.py b/topi/python/topi/nn/softmax.py index 16ffd797aafc..c414372ade93 100644 --- a/topi/python/topi/nn/softmax.py +++ 
b/topi/python/topi/nn/softmax.py @@ -18,14 +18,15 @@ """TVM operator for softmax and log_softmax compute.""" from __future__ import absolute_import import tvm +from tvm import te -@tvm.tag_scope(tag='softmax_output') +@tvm.te.tag_scope(tag='softmax_output') def softmax(x, axis=-1): """Perform softmax activation on the data Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor can be any dimension axis : int @@ -33,7 +34,7 @@ def softmax(x, axis=-1): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor output shape is the same as input """ shape = x.shape @@ -42,8 +43,8 @@ def softmax(x, axis=-1): if axis >= len(shape): ValueError("axis parameter should be less than input dim") - k1 = tvm.reduce_axis((0, shape[axis]), name='k') - k2 = tvm.reduce_axis((0, shape[axis]), name='k') + k1 = te.reduce_axis((0, shape[axis]), name='k') + k2 = te.reduce_axis((0, shape[axis]), name='k') def insert_reduce_index(indices, reduce_index): return indices[:axis] + (reduce_index,) + indices[axis:] @@ -53,51 +54,51 @@ def get_non_reduce_indices(indices): def _compute_max(*indices): eval_range = insert_reduce_index(indices, k1) - return tvm.max(x[eval_range], axis=k1) + return tvm.te.max(x[eval_range], axis=k1) def _compute_exp(max_elem, *indices): non_reduce_indices = get_non_reduce_indices(indices) - return tvm.exp(x[indices] - max_elem[non_reduce_indices]) + return te.exp(x[indices] - max_elem[non_reduce_indices]) def _compute_expsum(exp, *indices): eval_range = insert_reduce_index(indices, k2) - return tvm.sum(exp[eval_range], axis=k2) + return te.sum(exp[eval_range], axis=k2) def _normalize(exp, expsum, *indices): non_reduce_indices = get_non_reduce_indices(indices) return exp[indices] / expsum[non_reduce_indices] reduced_shape = tuple([dim for (i, dim) in enumerate(shape) if i != axis]) - max_elem = tvm.compute(reduced_shape, _compute_max, name='T_softmax_maxelem') - exp = tvm.compute(shape, lambda *indices: _compute_exp(max_elem, *indices), - name='T_softmax_exp') - expsum = tvm.compute(reduced_shape, lambda *indices: _compute_expsum(exp, *indices), - name='T_softmax_expsum') - return tvm.compute(shape, lambda *indices: _normalize(exp, expsum, *indices), - name='T_softmax_norm', attrs={"axis" : axis}) + max_elem = te.compute(reduced_shape, _compute_max, name='T_softmax_maxelem') + exp = te.compute(shape, lambda *indices: _compute_exp(max_elem, *indices), + name='T_softmax_exp') + expsum = te.compute(reduced_shape, lambda *indices: _compute_expsum(exp, *indices), + name='T_softmax_expsum') + return te.compute(shape, lambda *indices: _normalize(exp, expsum, *indices), + name='T_softmax_norm', attrs={"axis" : axis}) -@tvm.tag_scope(tag='log_softmax_output') +@tvm.te.tag_scope(tag='log_softmax_output') def log_softmax(x): """Perform log softmax activation on the data Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D input data Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D output with same shape """ assert len(x.shape) == 2, "only support 2-dim log softmax" m, n = x.shape - k = tvm.reduce_axis((0, n), name='k') - max_elem = tvm.compute((m, ), lambda i: tvm.max(x[i, k], axis=k)) - k = tvm.reduce_axis((0, n), name='k') - expsum = tvm.compute( - (m, ), lambda i: tvm.sum(tvm.exp(x[i, k] - max_elem[i]), axis=k)) - return tvm.compute( - x.shape, lambda i, j: x[i, j] - max_elem[i] - tvm.log(expsum[i])) + k = te.reduce_axis((0, n), name='k') + max_elem = te.compute((m, ), lambda i: tvm.te.max(x[i, k], axis=k)) + k = te.reduce_axis((0, n), name='k') + expsum 
= te.compute( + (m, ), lambda i: te.sum(te.exp(x[i, k] - max_elem[i]), axis=k)) + return te.compute( + x.shape, lambda i, j: x[i, j] - max_elem[i] - te.log(expsum[i])) diff --git a/topi/python/topi/nn/space_to_depth.py b/topi/python/topi/nn/space_to_depth.py index 6ed7cd64a448..b90bd118287b 100644 --- a/topi/python/topi/nn/space_to_depth.py +++ b/topi/python/topi/nn/space_to_depth.py @@ -18,6 +18,7 @@ """TVM operator space_to_depth compute.""" from __future__ import absolute_import import tvm +from tvm import te from .. import tag @@ -26,7 +27,7 @@ def space_to_depth(data, block_size, layout='NCHW'): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D tensor in either NCHW or NHWC layout. block_size : int @@ -37,17 +38,17 @@ def space_to_depth(data, block_size, layout='NCHW'): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor Output of shape [N, C * block_size**2, H / block_size, W / block_size] """ if layout == 'NCHW': in_n, in_c, in_h, in_w = data.shape output_shape = [in_n, in_c * block_size * block_size, - tvm.truncdiv(in_h, block_size), tvm.truncdiv(in_w, block_size)] + tvm.tir.truncdiv(in_h, block_size), tvm.tir.truncdiv(in_w, block_size)] elif layout == 'NHWC': in_n, in_h, in_w, in_c = data.shape - output_shape = [in_n, tvm.truncdiv(in_h, block_size), tvm.truncdiv( + output_shape = [in_n, tvm.tir.truncdiv(in_h, block_size), tvm.tir.truncdiv( in_w, block_size), in_c * block_size * block_size] else: raise ValueError("Only NCHW and NHWC layouts are currently supported.") @@ -60,10 +61,10 @@ def _get_indices(*indices): return n, c, y, x def _get_pixel(n, c, y, x): - block_offset = tvm.truncdiv(c, in_c) - channel_idx = tvm.truncmod(c, in_c) - x_idx = tvm.truncmod(block_offset, block_size) - y_idx = tvm.truncdiv(block_offset, block_size) + block_offset = tvm.tir.truncdiv(c, in_c) + channel_idx = tvm.tir.truncmod(c, in_c) + x_idx = tvm.tir.truncmod(block_offset, block_size) + y_idx = tvm.tir.truncdiv(block_offset, block_size) if layout == 'NCHW': output = data(n, channel_idx, y_idx + @@ -77,4 +78,4 @@ def _compute(*indices): n, c, y, x = _get_indices(*indices) return _get_pixel(n, c, y, x) - return tvm.compute(output_shape, _compute, name='space_to_depth', tag=tag.INJECTIVE) + return te.compute(output_shape, _compute, name='space_to_depth', tag=tag.INJECTIVE) diff --git a/topi/python/topi/nn/sparse.py b/topi/python/topi/nn/sparse.py index 6974ff4a13ab..b37bac2a213a 100644 --- a/topi/python/topi/nn/sparse.py +++ b/topi/python/topi/nn/sparse.py @@ -18,6 +18,7 @@ """Sparse operators""" from __future__ import absolute_import import tvm +from tvm import te from ..util import get_const_tuple @@ -29,24 +30,24 @@ def sparse_dense(data, weight_data, weight_indices, weight_indptr): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor 2-D with shape [M, K], float32 - weight_data : tvm.Tensor + weight_data : tvm.te.Tensor 1-D with shape [nnz] (CSR) or 3-D with shape [num_blocks, bs_r, bs_c] (BSR) - weight_indices : tvm.Tensor + weight_indices : tvm.te.Tensor 1-D with shape [nnz] (CSR) or 1-D with shape [num_blocks] (BSR) - weight_indptr : tvm.Tensor + weight_indptr : tvm.te.Tensor 1-D with shape [N + 1] (CSR) or 1-D with shape [(N + 1) // bs_r] (BSR) Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [M, N] """ assert len(weight_data.shape) in (1, 3) @@ -66,12 +67,12 @@ def f(i, row): row_start = weight_indptr[row] row_end = weight_indptr[row + 1] row_elems = row_end - row_start - elem_idx = tvm.reduce_axis((0, row_elems), name="elem_idx") 
+ elem_idx = te.reduce_axis((0, row_elems), name="elem_idx") elem = row_start + elem_idx a_val = weight_data[elem] weight_val = data[i, weight_indices[elem]] - return tvm.sum(a_val * weight_val, axis=elem_idx) - return tvm.compute(oshape, f, tag="sparse_dense_csrmm") + return te.sum(a_val * weight_val, axis=elem_idx) + return te.compute(oshape, f, tag="sparse_dense_csrmm") def _sparse_dense_bsrmm(data, weight_data, weight_indices, weight_indptr): @@ -84,22 +85,22 @@ def _compute_block(i, nb_j, j): row_start = weight_indptr[nb_j] row_end = weight_indptr[nb_j + 1] row_elems = row_end - row_start - elem_idx = tvm.reduce_axis( + elem_idx = te.reduce_axis( (0, row_elems), name="elem_idx") block_offset = row_start + elem_idx - c = tvm.reduce_axis((0, bs_c), name="c") + c = te.reduce_axis((0, bs_c), name="c") block_j = weight_indices[block_offset] block_ij_val = weight_data[block_offset][j][c] x_val = data[i, bs_c * block_j + c] - return tvm.sum(block_ij_val * x_val, axis=[elem_idx, c]) + return te.sum(block_ij_val * x_val, axis=[elem_idx, c]) - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod - bsrmm_block = tvm.compute( + bsrmm_block = te.compute( (m, num_blocks, bs_r), _compute_block, tag="sparse_dense_bsrmm_block") - return tvm.compute( + return te.compute( (m, num_blocks * bs_r), lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)], tag="sparse_dense_bsrmm") @@ -113,24 +114,24 @@ def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): Parameters ---------- - sparse_data : tvm.Tensor + sparse_data : tvm.te.Tensor 1-D with shape [nonzeros], dtype of 'float32' - sparse_indices : tvm.Tensor + sparse_indices : tvm.te.Tensor 1-D with shape [nonzeros], dtype of 'int32' - sparse_indptr : tvm.Tensor + sparse_indptr : tvm.te.Tensor 1-D with shape [n+1], dtype of 'int32' Returns ------- - out_data : tvm.Tensor + out_data : tvm.te.Tensor 1-D with shape [nonzeros], dtype of 'float32' - out_indices : tvm.Tensor + out_indices : tvm.te.Tensor 1-D with shape [nonzeros], dtype of 'int32' - out_indptr : tvm.Tensor + out_indptr : tvm.te.Tensor 1-D with shape [n+1], dtype of 'int32' """ assert len(sparse_data.shape) == 1, "error in data dimension" @@ -143,7 +144,7 @@ def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): # TODO: Add BSR transpose support - output_data, output_indices, output_indptr = tvm.extern( + output_data, output_indices, output_indptr = te.extern( shape=output_shape, inputs=[sparse_data, sparse_indices, sparse_indptr], fcompute=lambda ins, outs: @@ -157,7 +158,7 @@ def sparse_transpose(sparse_data, sparse_indices, sparse_indptr): def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): """define ir for csr_transpose""" - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() data_ptr = irb.buffer_ptr(data) indices_ptr = irb.buffer_ptr(indices) diff --git a/topi/python/topi/nn/upsampling.py b/topi/python/topi/nn/upsampling.py index c816bbb3c04e..008e52e337ae 100644 --- a/topi/python/topi/nn/upsampling.py +++ b/topi/python/topi/nn/upsampling.py @@ -15,9 +15,8 @@ # specific language governing permissions and limitations # under the License. 
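sparse_transpose above keeps its ir_builder-based lowering and only moves to the relocated tvm.tir.ir_builder and te.extern entry points. A minimal sketch of that extern pattern (a toy 1-D copy with an assumed length and names; not code from the patch):

import tvm
from tvm import te

def _copy_ir(src, dst):
    # previously tvm.ir_builder.create()
    irb = tvm.tir.ir_builder.create()
    src_ptr = irb.buffer_ptr(src)
    dst_ptr = irb.buffer_ptr(dst)
    # Element-by-element copy of the input buffer into the output buffer.
    with irb.for_range(0, src.shape[0], name="i") as i:
        dst_ptr[i] = src_ptr[i]
    return irb.get()

x = te.placeholder((32,), "float32", name="x")
# previously tvm.extern
y = te.extern(shape=(32,), inputs=[x],
              fcompute=lambda ins, outs: _copy_ir(ins[0], outs[0]),
              dtype="float32", name="copy")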
"""TVM operator upsampling compute.""" -from __future__ import absolute_import import topi -import tvm +from tvm import te from ..util import simplify @@ -28,7 +27,7 @@ def upsampling(data, scale_h, scale_w, layout="NCHW", method='nearest_neighbor', Parameters ---------- - inputs : tvm.Tensor + inputs : tvm.te.Tensor inputs is a 4-D tensor with shape [batch, channel, in_height, in_width] or [batch, in_height, in_width, channel] @@ -47,17 +46,17 @@ def upsampling(data, scale_h, scale_w, layout="NCHW", method='nearest_neighbor', Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, channel, in_height*scale_h, in_width*scale_w] or [batch, in_height*scale, in_width*scale, channel] """ base_layout = layout[0:4] if base_layout == "NCHW": - out_shape = (simplify(topi.cast(tvm.round(data.shape[2] * scale_h), data.shape[2].dtype)), - simplify(topi.cast(tvm.round(data.shape[3] * scale_w), data.shape[3].dtype))) + out_shape = (simplify(topi.cast(te.round(data.shape[2] * scale_h), data.shape[2].dtype)), + simplify(topi.cast(te.round(data.shape[3] * scale_w), data.shape[3].dtype))) elif layout == "NHWC": - out_shape = (simplify(topi.cast(tvm.round(data.shape[1] * scale_h), data.shape[1].dtype)), - simplify(topi.cast(tvm.round(data.shape[2] * scale_w), data.shape[2].dtype))) + out_shape = (simplify(topi.cast(te.round(data.shape[1] * scale_h), data.shape[1].dtype)), + simplify(topi.cast(te.round(data.shape[2] * scale_w), data.shape[2].dtype))) else: raise ValueError("not support this layout {} yet".format(layout)) @@ -73,7 +72,7 @@ def upsampling3d(data, scale_d, scale_h, scale_w, layout="NCDHW", method='neares Parameters ---------- - inputs : tvm.Tensor + inputs : tvm.te.Tensor inputs is a 5-D tensor with shape [batch, channel, in_depth, in_height, in_width] or [batch, in_depth, in_height, in_width, channel] @@ -101,19 +100,19 @@ def upsampling3d(data, scale_d, scale_h, scale_w, layout="NCDHW", method='neares Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, channel, in_depth*scale, in_height*scale, in_width*scale] or [batch, in_depth*scale, in_height*scale, in_width*scale, channel] """ base_layout = layout[0:5] if base_layout == "NCDHW": - out_shape = (simplify(topi.cast(tvm.round(data.shape[2] * scale_d), data.shape[2].dtype)), - simplify(topi.cast(tvm.round(data.shape[3] * scale_h), data.shape[3].dtype)), - simplify(topi.cast(tvm.round(data.shape[4] * scale_w), data.shape[4].dtype))) + out_shape = (simplify(topi.cast(te.round(data.shape[2] * scale_d), data.shape[2].dtype)), + simplify(topi.cast(te.round(data.shape[3] * scale_h), data.shape[3].dtype)), + simplify(topi.cast(te.round(data.shape[4] * scale_w), data.shape[4].dtype))) elif layout == "NDHWC": - out_shape = (simplify(topi.cast(tvm.round(data.shape[1] * scale_d), data.shape[1].dtype)), - simplify(topi.cast(tvm.round(data.shape[2] * scale_h), data.shape[2].dtype)), - simplify(topi.cast(tvm.round(data.shape[3] * scale_w), data.shape[3].dtype))) + out_shape = (simplify(topi.cast(te.round(data.shape[1] * scale_d), data.shape[1].dtype)), + simplify(topi.cast(te.round(data.shape[2] * scale_h), data.shape[2].dtype)), + simplify(topi.cast(te.round(data.shape[3] * scale_w), data.shape[3].dtype))) else: raise ValueError("not support this layout {} yet".format(layout)) diff --git a/topi/python/topi/nn/util.py b/topi/python/topi/nn/util.py index f0cdd9a0d3c2..5a9b49e3eceb 100644 --- a/topi/python/topi/nn/util.py +++ b/topi/python/topi/nn/util.py @@ -107,8 +107,8 @@ def infer_stride(data, 
kernel, out): _, _, IH, IW = data.shape _, _, KH, KW = kernel.shape _, _, OH, OW = out.shape - hstride = (IH - KH) // tvm.make.Max(OH - 1, 1) + tvm.expr.Select(OH == 1, 1, 0) - wstride = (IW - KW) // tvm.make.Max(OW - 1, 1) + tvm.expr.Select(OW == 1, 1, 0) + hstride = (IH - KH) // tvm.te.max(OH - 1, 1) + tvm.tir.Select(OH == 1, 1, 0) + wstride = (IW - KW) // tvm.te.max(OW - 1, 1) + tvm.tir.Select(OW == 1, 1, 0) return get_const_int(hstride), get_const_int(wstride) diff --git a/topi/python/topi/nn/winograd_util.py b/topi/python/topi/nn/winograd_util.py index 464b63301b40..d967431719ff 100644 --- a/topi/python/topi/nn/winograd_util.py +++ b/topi/python/topi/nn/winograd_util.py @@ -55,7 +55,7 @@ def _B_m(a, n): f = lambda j, i: reduce(mul, ((a[i]-a[k] if k != i else 1) for k in range(0, n-1)), 1) Ff = np.fromfunction(np.vectorize(f), (1, n-1), dtype=int) f = lambda i, nth: (reduce(mul, [(np.poly1d([1, -a[k]]) if k != i else 1) \ - for k in range(0, n-1)], 1)).coef[n-1-nth-1]/Ff[0, i] + for k in range(0, n-1)], 1)).coef[n-1-nth-1]/Ff[0, i] F = np.fromfunction(np.vectorize(f), (n-1, n-1), dtype=int) f = lambda i, j: -a[i]**(n-1) t = np.fromfunction(np.vectorize(f), (n-1, 1), dtype=int) diff --git a/topi/python/topi/opengl/conv2d_nchw.py b/topi/python/topi/opengl/conv2d_nchw.py index 52ed11972e6f..c93bcc25daef 100644 --- a/topi/python/topi/opengl/conv2d_nchw.py +++ b/topi/python/topi/opengl/conv2d_nchw.py @@ -17,6 +17,7 @@ #pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long """Schedule for conv2d_nchw with auto fusion""" import tvm +from tvm import te from .. import tag def schedule_conv2d_nchw(outs): @@ -33,8 +34,8 @@ def schedule_conv2d_nchw(outs): s: Schedule The computation schedule for conv2d_nchw. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def _schedule(conv2d, data): @@ -53,14 +54,14 @@ def traverse(OP): if OP not in s.outputs: s[OP].opengl() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, tvm.te.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule conv2d_nchw elif OP.tag.startswith('conv2d_nchw'): conv2d = OP.output(0) data = OP.input_tensors[0] kernel = OP.input_tensors[1] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() _schedule(conv2d, data) else: diff --git a/topi/python/topi/opengl/dense.py b/topi/python/topi/opengl/dense.py index db2c4a677904..715f713d56d6 100644 --- a/topi/python/topi/opengl/dense.py +++ b/topi/python/topi/opengl/dense.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable """Schedule for dense operator""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from .. import tag def schedule_dense(outs): @@ -34,8 +33,8 @@ def schedule_dense(outs): s: Schedule The computation schedule for dense. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def _schedule(Dense): @@ -53,7 +52,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule dense elif OP.tag == 'dense': diff --git a/topi/python/topi/opengl/injective.py b/topi/python/topi/opengl/injective.py index 28dc87d1a5fb..3d45247413d2 100644 --- a/topi/python/topi/opengl/injective.py +++ b/topi/python/topi/opengl/injective.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable, """Schedule for composition of injective operator""" -import tvm +from tvm import te def schedule_injective_from_existing(sch, out): """Schedule for injective op from existing schedule. @@ -50,10 +50,10 @@ def schedule_injective(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + te.schedule.AutoInlineInjective(s) for out in outs: schedule_injective_from_existing(s, out) return s diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 3226422048e5..c30389c7b72c 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for pooling operators""" -import tvm +from tvm import te from .. import tag def schedule_adaptive_pool(outs): @@ -33,8 +33,8 @@ def schedule_adaptive_pool(outs): s: Schedule The computation schedule for adaptive pool. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def _schedule(Pool): @@ -52,7 +52,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].opengl() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule global_pool elif OP.tag.startswith('adaptive_pool'): @@ -84,12 +84,12 @@ def schedule_pool(outs, layout): s: Schedule The computation schedule for pool. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def _schedule(PaddedInput, Pool): - if isinstance(PaddedInput.op, tvm.tensor.ComputeOp): + if isinstance(PaddedInput.op, te.tensor.ComputeOp): s[PaddedInput].opengl() if Pool.op in s.outputs: Out = Pool @@ -105,7 +105,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if tensor.op not in scheduled_ops and isinstance(tensor.op, tvm.tensor.ComputeOp): + if tensor.op not in scheduled_ops and isinstance(tensor.op, te.tensor.ComputeOp): traverse(tensor.op) # schedule pool elif OP.tag.startswith('pool'): diff --git a/topi/python/topi/opengl/softmax.py b/topi/python/topi/opengl/softmax.py index ff218d13c2b1..e725134494fc 100644 --- a/topi/python/topi/opengl/softmax.py +++ b/topi/python/topi/opengl/softmax.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable, trailing-whitespace """Schedule for softmax operator""" -import tvm +from tvm import te def schedule_softmax(outs): """Schedule for softmax op. @@ -32,8 +32,8 @@ def schedule_softmax(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) softmax = outs[0] op_tag = softmax.op.tag diff --git a/topi/python/topi/reduction.py b/topi/python/topi/reduction.py index 7c4e059d8334..74ba68848353 100644 --- a/topi/python/topi/reduction.py +++ b/topi/python/topi/reduction.py @@ -45,7 +45,7 @@ def sum(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm tensor axis : None or int or tuple of int @@ -60,7 +60,7 @@ def sum(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.sum(data, axis, keepdims) @@ -70,7 +70,7 @@ def all(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm boolean tensor axis : None or int or tuple of int @@ -85,7 +85,7 @@ def all(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.all(data, axis, keepdims) @@ -95,7 +95,7 @@ def any(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm boolean tensor axis : None or int or tuple of int @@ -110,7 +110,7 @@ def any(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.any(data, axis, keepdims) @@ -120,7 +120,7 @@ def max(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm tensor axis : None or int or tuple of int @@ -135,7 +135,7 @@ def max(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.max(data, axis, keepdims) @@ -145,7 +145,7 @@ def min(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm tensor axis : None or int or tuple of int @@ -160,7 +160,7 @@ def min(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.min(data, axis, keepdims) @@ -170,7 +170,7 @@ def argmax(data, 
axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm tensor axis : None or int or tuple of int @@ -185,7 +185,7 @@ def argmax(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.argmax(data, axis, keepdims) @@ -195,7 +195,7 @@ def argmin(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm tensor axis : None or int or tuple of int @@ -210,7 +210,7 @@ def argmin(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.argmin(data, axis, keepdims) @@ -220,7 +220,7 @@ def prod(data, axis=None, keepdims=False): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tvm tensor axis : None or int or tuple of int @@ -235,6 +235,6 @@ def prod(data, axis=None, keepdims=False): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.prod(data, axis, keepdims) diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py index ce56dc4e0847..713647e4ca8a 100644 --- a/topi/python/topi/rocm/conv2d.py +++ b/topi/python/topi/rocm/conv2d.py @@ -32,10 +32,10 @@ def conv2d_nchw_miopen(cfg, data, kernel, strides, padding, dilation, out_dtype= cfg: ConfigEntity The config for this template - input : tvm.Tensor + input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] - filter : tvm.Tensor + filter : tvm.te.Tensor 4-D with shape [num_filter, in_channel, filter_height, filter_width] strides : int or a list/tuple of two ints @@ -51,7 +51,7 @@ def conv2d_nchw_miopen(cfg, data, kernel, strides, padding, dilation, out_dtype= Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ @@ -67,7 +67,7 @@ def conv2d_nchw_miopen(cfg, data, kernel, strides, padding, dilation, out_dtype= OH = (H + 2 * pad_h - KH) // stride_h + 1 OW = (W + 2 * pad_w - KW) // stride_w + 1 cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\ - ((KW - 1) * dilation_w + 1)) + ((KW - 1) * dilation_w + 1)) return miopen.conv2d_forward(data, kernel, diff --git a/topi/python/topi/rocm/dense.py b/topi/python/topi/rocm/dense.py index 8729a62bd677..097120da88d6 100644 --- a/topi/python/topi/rocm/dense.py +++ b/topi/python/topi/rocm/dense.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for dense operator""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from tvm import autotvm from tvm.contrib import rocblas from .. import generic, nn @@ -30,13 +29,13 @@ def dense(cfg, data, weight, bias=None, out_dtype=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [out_dim] out_dtype : str @@ -44,7 +43,7 @@ def dense(cfg, data, weight, bias=None, out_dtype=None): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ assert len(data.shape) == 2 and len(weight.shape) == 2, \ @@ -71,8 +70,8 @@ def schedule_dense(cfg, outs): s: Schedule The computation schedule for dense. 
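# [Editor's illustrative sketch, not part of the patch] The output-shape
# arithmetic the MIOpen path above uses, extracted as a plain helper.
def conv_out_dim(size, pad, kernel, stride):
    return (size + 2 * pad - kernel) // stride + 1

assert conv_out_dim(32, 1, 3, 1) == 32  # same-padded 3x3 conv, stride 1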
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if op.tag == 'dense': @@ -87,11 +86,11 @@ def _callback(op): else: Out = outs[0].op.output(0) s[Dense].compute_at(s[Out], s[Out].op.axis[1]) - s[Out].bind(s[Out].op.axis[0], tvm.thread_axis("blockIdx.y")) - s[Out].bind(s[Out].op.axis[1], tvm.thread_axis("blockIdx.x")) + s[Out].bind(s[Out].op.axis[0], te.thread_axis("blockIdx.y")) + s[Out].bind(s[Out].op.axis[1], te.thread_axis("blockIdx.x")) tx = s[Dense].op.reduce_axis[0] - thread_x = tvm.thread_axis("threadIdx.x") + thread_x = te.thread_axis("threadIdx.x") s[Dense].bind(tx, thread_x) s[DenseF].compute_at(s[Dense], tx) s[Dense].set_store_predicate(thread_x.var.equal(0)) @@ -107,13 +106,13 @@ def dense_rocblas(cfg, data, weight, bias=None, out_dtype=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [out_dim] out_dtype : str @@ -121,7 +120,7 @@ def dense_rocblas(cfg, data, weight, bias=None, out_dtype=None): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ assert out_dtype == data.dtype, "Mixed precision not supported." @@ -130,9 +129,9 @@ def dense_rocblas(cfg, data, weight, bias=None, out_dtype=None): out_dim, _ = weight.shape cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: - matmul = tvm.compute((batch, out_dim), - lambda i, j: matmul[i, j] + bias[j], - tag=tag.BROADCAST) + matmul = te.compute((batch, out_dim), + lambda i, j: matmul[i, j] + bias[j], + tag=tag.BROADCAST) return matmul diff --git a/topi/python/topi/sort.py b/topi/python/topi/sort.py index 96a088923d2d..744da622adc2 100644 --- a/topi/python/topi/sort.py +++ b/topi/python/topi/sort.py @@ -17,7 +17,7 @@ # pylint: disable=too-many-arguments """Argsort operator""" import tvm -from tvm import api +from tvm import te from .util import get_const_tuple def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): @@ -27,14 +27,14 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tensor. - valid_count : tvm.Tensor, optional + valid_count : tvm.te.Tensor, optional 1-D tensor for valid number of boxes only for ssd. axis : int, optional - Axis along which to sort the input tensor. + Axis along which to sort the input tensor. By default the flattened array is used. is_ascend : boolean, optional @@ -45,7 +45,7 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor Sorted index tensor. 
Example @@ -54,7 +54,7 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): # An example to use argsort dshape = (1, 5, 6) - data = tvm.placeholder(dshape, name="data") + data = te.placeholder(dshape, name="data") axis = 0 is_ascend = False out = argsort(data, axis=axis, is_ascend=is_ascend) @@ -66,35 +66,36 @@ def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f(tvm_data, tvm_out) """ - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) if valid_count is not None: - valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype, - "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=8) + valid_count_buf = tvm.tir.decl_buffer( + valid_count.shape, valid_count.dtype, + "valid_count_buf", data_alignment=4) + out_buf = tvm.tir.decl_buffer(data.shape, "int32", "out_buf", data_alignment=8) out = \ - tvm.extern(data.shape, - [data, valid_count], - lambda ins, outs: tvm.call_packed( - "tvm.contrib.sort.argsort_nms", ins[0], ins[1], - outs[0], axis, is_ascend), - dtype="int32", - in_buffers=[data_buf, valid_count_buf], - out_buffers=out_buf, - name="argsort_nms_cpu", - tag="argsort_nms_cpu") + te.extern(data.shape, + [data, valid_count], + lambda ins, outs: tvm.tir.call_packed( + "tvm.contrib.sort.argsort_nms", ins[0], ins[1], + outs[0], axis, is_ascend), + dtype="int32", + in_buffers=[data_buf, valid_count_buf], + out_buffers=out_buf, + name="argsort_nms_cpu", + tag="argsort_nms_cpu") else: - out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) + out_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) out = \ - tvm.extern(data.shape, - [data], - lambda ins, outs: tvm.call_packed( - "tvm.contrib.sort.argsort", ins[0], - outs[0], axis, is_ascend), - dtype=dtype, - in_buffers=[data_buf], - out_buffers=out_buf, - name="argsort_cpu", - tag="argsort_cpu") + te.extern(data.shape, + [data], + lambda ins, outs: tvm.tir.call_packed( + "tvm.contrib.sort.argsort", ins[0], + outs[0], axis, is_ascend), + dtype=dtype, + in_buffers=[data_buf], + out_buffers=out_buf, + name="argsort_cpu", + tag="argsort_cpu") return out @@ -103,7 +104,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor The input tensor. k : int, optional @@ -126,27 +127,27 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): Returns ------- - out : tvm.Tensor or List[tvm.Tensor] + out : tvm.te.Tensor or List[tvm.te.Tensor] The computed result. 
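# [Editor's note: the final form of the argsort hunk above, extracted from the
# diff for readability; not new code.] te.extern wires the packed sort function
# in through explicitly declared buffers.
import tvm
from tvm import te

def argsort_extern(data, axis=-1, is_ascend=1, dtype="int32"):
    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf",
                                   data_alignment=8)
    out_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
    return te.extern(data.shape, [data],
                     lambda ins, outs: tvm.tir.call_packed(
                         "tvm.contrib.sort.argsort", ins[0], outs[0],
                         axis, is_ascend),
                     dtype=dtype, in_buffers=[data_buf], out_buffers=out_buf,
                     name="argsort_cpu", tag="argsort_cpu")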
""" assert ret_type in ["both", "values", "indices"] - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) out_shape = list(get_const_tuple(data.shape)) if k >= 1: out_shape[axis] = k out_bufs = [] if ret_type in ["both", "values"]: - out_bufs.append(api.decl_buffer(out_shape, data.dtype, "value_buf", data_alignment=8)) + out_bufs.append(tvm.tir.decl_buffer(out_shape, data.dtype, "value_buf", data_alignment=8)) if ret_type in ["both", "indices"]: - out_bufs.append(api.decl_buffer(out_shape, dtype, "indices_buf", data_alignment=8)) + out_bufs.append(tvm.tir.decl_buffer(out_shape, dtype, "indices_buf", data_alignment=8)) out_shapes = [out_shape] * len(out_bufs) - out = tvm.extern(out_shapes, - [data], - lambda ins, outs: tvm.call_packed( - "tvm.contrib.sort.topk", ins[0], *outs, k, axis, ret_type, is_ascend), - in_buffers=[data_buf], - out_buffers=out_bufs, - name="topk_cpu", - tag="topk_cpu") + out = te.extern(out_shapes, + [data], + lambda ins, outs: tvm.tir.call_packed( + "tvm.contrib.sort.topk", ins[0], *outs, k, axis, ret_type, is_ascend), + in_buffers=[data_buf], + out_buffers=out_bufs, + name="topk_cpu", + tag="topk_cpu") return out diff --git a/topi/python/topi/sparse/csrmm.py b/topi/python/topi/sparse/csrmm.py index 29f9cb4dbaa6..8dc08949505d 100644 --- a/topi/python/topi/sparse/csrmm.py +++ b/topi/python/topi/sparse/csrmm.py @@ -17,6 +17,7 @@ """TVM operator compute SpMM in CSR format.""" from __future__ import absolute_import import tvm +from tvm import te from .. import tag from ..util import simplify @@ -26,37 +27,37 @@ def csrmm_default(data, indices, indptr, weight, bias=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 1-D with shape [nonzeros] - indices : tvm.Tensor + indices : tvm.te.Tensor 1-D with shape [nonzeros] - indptr : tvm.Tensor + indptr : tvm.te.Tensor 1-D with shape [m+1] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [k, n] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [m] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [m, n] """ assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \ and len(weight.shape) == 2, "only support 2-dim csrmm" - assert isinstance(weight, tvm.tensor.Tensor), \ - "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight)) + assert isinstance(weight, te.tensor.Tensor), \ + "weight matrix is assumed to be tvm.te.Tensor, but weight is `%s`" % (type(weight)) if bias is not None: assert len(bias.shape) == 1 M = simplify(indptr.shape[0]-1) _, N = weight.shape def csrmm_default_ir(data, indices, indptr, weight, out): """define ir for csrmm""" - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() data_ptr = irb.buffer_ptr(data) indices_ptr = irb.buffer_ptr(indices) indptr_ptr = irb.buffer_ptr(indptr) @@ -78,12 +79,12 @@ def csrmm_default_ir(data, indices, indptr, weight, out): out_ptr[row*N+n] += dot[0] return irb.get() oshape = (M, N) - matmul = tvm.extern(oshape, [data, indices, indptr, weight], - lambda ins, outs: csrmm_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), - tag="csrmm", dtype='float32', name='out') + matmul = te.extern(oshape, [data, indices, indptr, weight], + lambda ins, outs: csrmm_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="csrmm", dtype='float32', name='out') if bias is not None: - matmul = tvm.compute(oshape, lambda i, j: 
matmul[i, j] + bias[i], \ - tag=tag.BROADCAST) + matmul = te.compute(oshape, lambda i, j: matmul[i, j] + bias[i], \ + tag=tag.BROADCAST) return matmul @@ -96,15 +97,15 @@ def csrmm(a, b, c=None): a : tvm.contrib.sparse.CSRNDArray 2-D sparse matrix with shape [m, k] - b : tvm.Tensor + b : tvm.te.Tensor 2-D dense matrix with shape [k, n] - c : tvm.Tensor, optional + c : tvm.te.Tensor, optional 1-D dense vector with shape [n] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [m, n] """ return csrmm_default(a.data, a.indices, a.indptr, b, c) diff --git a/topi/python/topi/sparse/csrmv.py b/topi/python/topi/sparse/csrmv.py index 8a21f0db6d96..c0aa1b41449c 100644 --- a/topi/python/topi/sparse/csrmv.py +++ b/topi/python/topi/sparse/csrmv.py @@ -17,6 +17,7 @@ """TVM operator compute SpMV in CSR format.""" from __future__ import absolute_import import tvm +from tvm import te from .. import tag def csrmv_default(data, indices, indptr, weight, bias=None): @@ -24,36 +25,36 @@ def csrmv_default(data, indices, indptr, weight, bias=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 1-D with shape [nonzeros] - indices : tvm.Tensor + indices : tvm.te.Tensor 1-D with shape [nonzeros] - indptr : tvm.Tensor + indptr : tvm.te.Tensor 1-D with shape [m+1] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [k, 1] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [1] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [m, 1] """ assert len(data.shape) == 1 and len(weight.shape) == 2, \ "only support 2-dim csrmv" - assert isinstance(weight, tvm.tensor.Tensor), \ - "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight)) + assert isinstance(weight, te.tensor.Tensor), \ + "weight matrix is assumed to be tvm.te.Tensor, but weight is `%s`" % (type(weight)) if bias is not None: assert len(bias.shape) == 1 batch = indptr.shape[0]-1 def csrmv_default_ir(data, indices, indptr, weight, out): """define ir for csrmv""" - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() data_ptr = irb.buffer_ptr(data) indices_ptr = irb.buffer_ptr(indices) indptr_ptr = irb.buffer_ptr(indptr) @@ -73,12 +74,12 @@ def csrmv_default_ir(data, indices, indptr, weight, out): out_ptr[row] += dot[0] return irb.get() oshape = (batch, 1) - matmul = tvm.extern(oshape, [data, indices, indptr, weight], - lambda ins, outs: csrmv_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), - tag="csrmv", dtype='float32', name='csrmv') + matmul = te.extern(oshape, [data, indices, indptr, weight], + lambda ins, outs: csrmv_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="csrmv", dtype='float32', name='csrmv') if bias is not None: - matmul = tvm.compute((batch, 1), lambda i, j: matmul[i, 0] + bias[i], \ - tag=tag.BROADCAST) + matmul = te.compute((batch, 1), lambda i, j: matmul[i, 0] + bias[i], \ + tag=tag.BROADCAST) return matmul @@ -91,15 +92,15 @@ def csrmv(a, x, y=None): a : tvm.contrib.sparse.CSRNDArray 2-D sparse matrix with shape [m, k] - x : tvm.Tensor + x : tvm.te.Tensor 2-D dense matrix with shape [k, 1] - y : tvm.Tensor, optional + y : tvm.te.Tensor, optional 1-D dense vector with shape [1] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D dense matrix with shape [m, 1] """ return csrmv_default(a.data, a.indices, a.indptr, x, y) diff --git a/topi/python/topi/sparse/dense.py b/topi/python/topi/sparse/dense.py index fe21e2fdf90e..9f01405b14f3 100644 --- 
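# [Editor's illustrative sketch, not part of the patch] The bias epilogue both
# CSR kernels above end with, spelled with te.compute; note csrmm adds a
# per-row bias (bias[i]) while the dense kernels add a per-column one (bias[j]).
from tvm import te
from topi import tag

def add_row_bias(matmul, bias, oshape):
    return te.compute(oshape, lambda i, j: matmul[i, j] + bias[i],
                      tag=tag.BROADCAST)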
a/topi/python/topi/sparse/dense.py +++ b/topi/python/topi/sparse/dense.py @@ -17,6 +17,7 @@ """TVM operator compute Dense in CSR format.""" from __future__ import absolute_import import tvm +from tvm import te from .. import tag from ..util import simplify @@ -26,30 +27,30 @@ def dense_si(data, indices, indptr, weight, bias=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 1-D with shape [num_nonzeros] - indices : tvm.Tensor + indices : tvm.te.Tensor 1-D with shape [num_nonzeros] - indptr : tvm.Tensor + indptr : tvm.te.Tensor 1-D with shape [m+1] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [k, n] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [m] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [m, n] """ assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \ and len(weight.shape) == 2, "only support 2-dim dense" - assert isinstance(weight, tvm.tensor.Tensor), \ - "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight)) + assert isinstance(weight, te.tensor.Tensor), \ + "weight matrix is assumed to be tvm.te.Tensor, but weight is `%s`" % (type(weight)) if bias is not None: assert len(bias.shape) == 1 dtype = data.dtype @@ -58,7 +59,7 @@ def dense_si(data, indices, indptr, weight, bias=None): def dense_default_ir(data, indices, indptr, weight, out): """Define IR for Dense""" dtype = data.dtype - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() data_ptr = irb.buffer_ptr(data) indices_ptr = irb.buffer_ptr(indices) indptr_ptr = irb.buffer_ptr(indptr) @@ -69,8 +70,8 @@ def dense_default_ir(data, indices, indptr, weight, out): with irb.for_range(0, N, for_type="vectorize", name='n') as n: with irb.for_range(0, M, for_type="parallel", name='m') as m: dot = irb.allocate(dtype, (1,), name='dot', scope='local') - out_ptr[m*N+n] = tvm.const(0, dtype) - dot[0] = tvm.const(0, dtype) + out_ptr[m*N+n] = tvm.tir.const(0, dtype) + dot[0] = tvm.tir.const(0, dtype) row_start = indptr_ptr[m] row_elems = indptr_ptr[m+1]-row_start with irb.for_range(0, row_elems, name='k') as k: @@ -79,12 +80,12 @@ def dense_default_ir(data, indices, indptr, weight, out): out_ptr[m*N+n] += dot[0] return irb.get() oshape = (M, N) - matmul = tvm.extern(oshape, [data, indices, indptr, weight], - lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), - tag="dense", dtype=dtype, name='out') + matmul = te.extern(oshape, [data, indices, indptr, weight], + lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="dense", dtype=dtype, name='out') if bias is not None: - matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \ - tag=tag.BROADCAST) + matmul = te.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \ + tag=tag.BROADCAST) return matmul @@ -94,30 +95,30 @@ def dense_sw(data, w_data, w_indices, w_indptr, bias=None): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [m, k] - w_data : tvm.Tensor + w_data : tvm.te.Tensor 1-D with shape [nonzeros] - w_indices : tvm.Tensor + w_indices : tvm.te.Tensor 1-D with shape [nonzeros] - w_indptr : tvm.Tensor + w_indptr : tvm.te.Tensor 1-D with shape [n+1] - bias : tvm.Tensor, optional + bias : tvm.te.Tensor, optional 1-D with shape [n] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [m, n] """ assert len(w_data.shape) == 1 and len(w_indices.shape) == 1 and len(w_indptr.shape) == 1 \ and len(data.shape) == 
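# [Editor's illustrative sketch, not part of the patch] Zero-initialising an
# accumulator with tvm.tir.const inside an IR builder body, as dense_default_ir
# above does.
import tvm

def zero_fill_ir(out, m, n, dtype="float32"):
    irb = tvm.tir.ir_builder.create()
    out_ptr = irb.buffer_ptr(out)
    with irb.for_range(0, m, name="i") as i:
        with irb.for_range(0, n, name="j") as j:
            out_ptr[i * n + j] = tvm.tir.const(0, dtype)
    return irb.get()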
2, "only support 2-dim dense" - assert isinstance(data, tvm.tensor.Tensor), \ - "data matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(data)) + assert isinstance(data, te.tensor.Tensor), \ + "data matrix is assumed to be tvm.te.Tensor, but weight is `%s`" % (type(data)) if bias is not None: assert len(bias.shape) == 1 dtype = data.dtype @@ -126,7 +127,7 @@ def dense_sw(data, w_data, w_indices, w_indptr, bias=None): def dense_default_ir(data, w_data, w_indices, w_indptr, out): """Define IR for Dense""" dtype = data.dtype - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() data_ptr = irb.buffer_ptr(data) w_data_ptr = irb.buffer_ptr(w_data) w_indices_ptr = irb.buffer_ptr(w_indices) @@ -137,8 +138,8 @@ def dense_default_ir(data, w_data, w_indices, w_indptr, out): with irb.for_range(0, M, for_type="vectorize", name='m') as m: with irb.for_range(0, N, for_type="parallel", name='n') as n: dot = irb.allocate(dtype, (1,), name='dot', scope='local') - out_ptr[m*N+n] = tvm.const(0, dtype) - dot[0] = tvm.const(0, dtype) + out_ptr[m*N+n] = tvm.tir.const(0, dtype) + dot[0] = tvm.tir.const(0, dtype) row_start = w_indptr_ptr[n] row_elems = w_indptr_ptr[n+1]-row_start with irb.for_range(0, row_elems, name='k') as k: @@ -147,12 +148,12 @@ def dense_default_ir(data, w_data, w_indices, w_indptr, out): out_ptr[m*N+n] += dot[0] return irb.get() oshape = (M, N) - matmul = tvm.extern(oshape, [data, w_data, w_indices, w_indptr], - lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), - tag="dense", dtype=dtype, name='out') + matmul = te.extern(oshape, [data, w_data, w_indices, w_indptr], + lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="dense", dtype=dtype, name='out') if bias is not None: - matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \ - tag=tag.BROADCAST) + matmul = te.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \ + tag=tag.BROADCAST) return matmul @@ -162,26 +163,26 @@ def dense(data, weight, bias=None): Parameters ---------- - data : tvm.contrib.sparse.CSRNDArray or tvm.tensor.Tensor + data : tvm.contrib.sparse.CSRNDArray or te.tensor.Tensor 2-D with shape [batch, in_dim] - weight : tvm.tensor.Tensor or tvm.contrib.sparse.CSRNDArray + weight : te.tensor.Tensor or tvm.contrib.sparse.CSRNDArray 2-D with shape [out_dim, in_dim] - bias : tvm.tensor.Tensor, optional + bias : te.tensor.Tensor, optional 1-D with shape [out_dim] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ ret = None if isinstance(data, tvm.contrib.sparse.CSRPlaceholderOp) and \ - isinstance(weight, tvm.tensor.Tensor): + isinstance(weight, te.tensor.Tensor): ret = dense_si(data.data, data.indices, data.indptr, weight, bias) - elif isinstance(data, tvm.tensor.Tensor) and \ - isinstance(weight, tvm.contrib.sparse.CSRPlaceholderOp): + elif isinstance(data, te.tensor.Tensor) and \ + isinstance(weight, tvm.contrib.sparse.CSRPlaceholderOp): ret = dense_sw(data, weight.data, weight.indices, weight.indptr, bias) else: raise NotImplementedError("implementation for %s as data and %s as weights, " diff --git a/topi/python/topi/tensor.py b/topi/python/topi/tensor.py index 0231efcca272..00712420ee07 100644 --- a/topi/python/topi/tensor.py +++ b/topi/python/topi/tensor.py @@ -24,12 +24,12 @@ def elemwise_sum(xs): Parameters ---------- - xs : list of tvm.Tensor + xs : list of tvm.te.Tensor Input arguments. Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. 
""" return cpp.elemwise_sum(xs) @@ -49,7 +49,7 @@ def full(shape, dtype, fill_value): Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ return cpp.full(shape, dtype, fill_value) @@ -61,14 +61,14 @@ def full_like(x, fill_value): Parameters ---------- - x : tvm.Tensor + x : tvm.te.Tensor Input argument. fill_value : float Value to be filled Returns ------- - y : tvm.Tensor + y : tvm.te.Tensor The result. """ return cpp.full_like(x, fill_value) diff --git a/topi/python/topi/testing/conv2d_transpose_python.py b/topi/python/topi/testing/conv2d_transpose_python.py index 50c43eb70e3e..c789feca617f 100644 --- a/topi/python/topi/testing/conv2d_transpose_python.py +++ b/topi/python/topi/testing/conv2d_transpose_python.py @@ -59,9 +59,9 @@ def conv2d_transpose_nchw_python(a_np, w_np, stride, padding): bpad_left = filter_w - 1 - fpad_left bpad_right = filter_w - 1 - fpad_right padded_a_np = np.zeros((batch, in_c, dilated_a_np.shape[2]+bpad_top+bpad_bottom, \ - dilated_a_np.shape[3]+bpad_left+bpad_right)) + dilated_a_np.shape[3]+bpad_left+bpad_right)) padded_a_np[:, :, bpad_top:dilated_a_np.shape[2]+bpad_top, \ - bpad_left:dilated_a_np.shape[3]+bpad_left] = dilated_a_np + bpad_left:dilated_a_np.shape[3]+bpad_left] = dilated_a_np # convolution stage out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w diff --git a/topi/python/topi/testing/conv3d_ncdhw_python.py b/topi/python/topi/testing/conv3d_ncdhw_python.py index 825ec622a1ec..063c07d94133 100644 --- a/topi/python/topi/testing/conv3d_ncdhw_python.py +++ b/topi/python/topi/testing/conv3d_ncdhw_python.py @@ -48,7 +48,7 @@ def _conv3d_ncdhw_python(a_np, w_np, stride, padding): if pad_d > 0 or pad_h > 0 or pad_w > 0: apad = np.zeros((in_depth + pad_d, in_height + pad_h, in_width + pad_w)) apad[pad_front:pad_front + in_depth, pad_top:pad_top + in_height,\ - pad_left:pad_left + in_width] = a_np[n, c] + pad_left:pad_left + in_width] = a_np[n, c] else: apad = a_np[n, c] out = scipy.signal.convolve( diff --git a/topi/python/topi/testing/conv3d_ndhwc_python.py b/topi/python/topi/testing/conv3d_ndhwc_python.py index 2810f72b094f..85b991f3ec5f 100644 --- a/topi/python/topi/testing/conv3d_ndhwc_python.py +++ b/topi/python/topi/testing/conv3d_ndhwc_python.py @@ -73,7 +73,7 @@ def conv3d_ndhwc_python(a_np, w_np, stride, padding): if pad_d > 0 or pad_h > 0 or pad_w > 0: apad = np.zeros((in_depth + pad_d, in_height + pad_h, in_width + pad_w)) apad[pad_front:pad_front + in_depth, pad_top:pad_top + in_height,\ - pad_left:pad_left + in_width] = at[n, c] + pad_left:pad_left + in_width] = at[n, c] else: apad = at[n, c] out = scipy.signal.convolve( diff --git a/topi/python/topi/testing/depthwise_conv2d_python.py b/topi/python/topi/testing/depthwise_conv2d_python.py index 566bb93f42a7..5addc7578d10 100644 --- a/topi/python/topi/testing/depthwise_conv2d_python.py +++ b/topi/python/topi/testing/depthwise_conv2d_python.py @@ -57,8 +57,8 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): for i in range(batch): for j in range(out_channel): output_np[i, j, :, :] = signal.convolve2d(input_np[i, j//channel_multiplier, :, :], \ - np.rot90(filter_np[j//channel_multiplier, j%channel_multiplier, :, :], 2), \ - mode='valid')[0:(in_height - filter_height + 1):stride_h, 0:(in_width - filter_height + 1):stride_w] + np.rot90(filter_np[j//channel_multiplier, j%channel_multiplier, :, :], 2), \ + mode='valid')[0:(in_height - filter_height + 1):stride_h, 0:(in_width - 
filter_height + 1):stride_w] if padding == 'SAME': out_channel = in_channel * channel_multiplier out_height = np.int(np.ceil(float(in_height) / float(stride_h))) @@ -75,8 +75,8 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): for i in range(batch): for j in range(out_channel): output_np[i, j, :, :] = signal.convolve2d(input_np[i, j//channel_multiplier, :, :], \ - np.rot90(filter_np[j//channel_multiplier, j%channel_multiplier, :, :], 2), \ - mode='same')[index_h:in_height:stride_h, index_w:in_width:stride_w] + np.rot90(filter_np[j//channel_multiplier, j%channel_multiplier, :, :], 2), \ + mode='same')[index_h:in_height:stride_h, index_w:in_width:stride_w] return output_np @@ -118,8 +118,8 @@ def depthwise_conv2d_python_nhwc(input_np, filter_np, stride, padding): for i in range(batch): for j in range(out_channel): output_np[i, :, :, j] = signal.convolve2d(input_np[i, :, :, j//channel_multiplier], \ - np.rot90(filter_np[:, :, j//channel_multiplier, j%channel_multiplier], 2), \ - mode='valid')[0:(in_height - filter_height + 1):stride_h, 0:(in_width - filter_height + 1):stride_w] + np.rot90(filter_np[:, :, j//channel_multiplier, j%channel_multiplier], 2), \ + mode='valid')[0:(in_height - filter_height + 1):stride_h, 0:(in_width - filter_height + 1):stride_w] if padding == 'SAME': out_channel = in_channel * channel_multiplier out_height = np.int(np.ceil(float(in_height) / float(stride_h))) @@ -136,7 +136,7 @@ def depthwise_conv2d_python_nhwc(input_np, filter_np, stride, padding): for i in range(batch): for j in range(out_channel): output_np[i, :, :, j] = signal.convolve2d(input_np[i, :, :, j//channel_multiplier], \ - np.rot90(filter_np[:, :, j//channel_multiplier, j%channel_multiplier], 2), \ - mode='same')[index_h:in_height:stride_h, index_w:in_width:stride_w] + np.rot90(filter_np[:, :, j//channel_multiplier, j%channel_multiplier], 2), \ + mode='same')[index_h:in_height:stride_h, index_w:in_width:stride_w] return output_np diff --git a/topi/python/topi/testing/pool3d_python.py b/topi/python/topi/testing/pool3d_python.py index 631a995e7c12..2606650b33cf 100644 --- a/topi/python/topi/testing/pool3d_python.py +++ b/topi/python/topi/testing/pool3d_python.py @@ -40,9 +40,9 @@ def pool3d_ncdhw_python(np_data, kernel, assert out_shape[3] == int(math.floor(float(in_shape[3] - k_h + pt + pb) / s_h) + 1) assert out_shape[4] == int(math.floor(float(in_shape[4] - k_w + pl + pr) / s_w) + 1) - fill_value = tvm.const(0.0, dtype).value + fill_value = tvm.tir.const(0.0, dtype).value if not(count_include_pad) and pool_type == 'max': - fill_value = tvm.min_value(dtype).value + fill_value = tvm.te.min_value(dtype).value pad_np = np.full(shape=(in_n, in_c, in_d + pf + pk, diff --git a/topi/python/topi/testing/pool_grad_python.py b/topi/python/topi/testing/pool_grad_python.py index f1e51f0c957e..ee671c2c49d2 100644 --- a/topi/python/topi/testing/pool_grad_python.py +++ b/topi/python/topi/testing/pool_grad_python.py @@ -53,7 +53,7 @@ def pool_grad_nchw(a_np, out_grad_np, # take the first element, as they are the same across batch and channel pad_count = pad_count.ravel()[0] pad_pool_grad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw] += \ - out_grad_np[:, :, i, j].reshape(n, ic, 1, 1) / np.maximum(pad_count, 1) + out_grad_np[:, :, i, j].reshape(n, ic, 1, 1) / np.maximum(pad_count, 1) elif pool_type == 'max': for i in range(oh): for j in range(ow): diff --git a/topi/python/topi/testing/roi_align_python.py b/topi/python/topi/testing/roi_align_python.py index 6ba2061abd75..d3285490d4c8 100644 --- 
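# [Editor's illustrative sketch, not part of the patch] Pulling host-side
# Python values out of TIR constants, as pool3d_ncdhw_python above now does for
# its padding fill value.
import tvm

fill_value = tvm.tir.const(0.0, "float32").value  # plain Python 0.0
lowest = tvm.te.min_value("float32").value        # most negative float32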
a/topi/python/topi/testing/roi_align_python.py +++ b/topi/python/topi/testing/roi_align_python.py @@ -45,8 +45,8 @@ def _bilinear(b, c, y, x): lx = x - x_low return (1 - ly) * (1 - lx) * a_np[b, c, y_low, x_low] + \ (1 - ly) * lx * a_np[b, c, y_low, x_high] + \ - ly * (1 - lx) * a_np[b, c, y_high, x_low] + \ - ly * lx * a_np[b, c, y_high, x_high] + ly * (1 - lx) * a_np[b, c, y_high, x_low] + \ + ly * lx * a_np[b, c, y_high, x_high] for i in range(num_roi): roi = rois_np[i] diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index bdeb22304b07..036191bc6ead 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -18,6 +18,7 @@ """Injective transformation operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te import topi from . import cpp from . import tag @@ -29,7 +30,7 @@ def expand_dims(a, axis, num_newaxis=1): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be expanded. num_newaxis: int, optional @@ -37,7 +38,7 @@ def expand_dims(a, axis, num_newaxis=1): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.expand_dims(a, axis, num_newaxis) @@ -63,21 +64,21 @@ def expand_like(a, shape_like, axis): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be expanded. - shape_like : tvm.Tensor + shape_like : tvm.te.Tensor The tensor to with target shape. axis: list of int axis to be expanded on Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ odim = len(axis) + len(a.shape) if odim != len(shape_like.shape): if len(a.shape) == 1 and len(axis) == len(shape_like.shape): # A special case: `a` is a scalar represented as a 1-dim tensor - return tvm.compute(shape_like.shape, lambda *idxs: a(0)) + return te.compute(shape_like.shape, lambda *idxs: a(0)) raise ValueError("shape inconsistent when expand_like ({}, {}, {})".format( len(axis), len(a.shape), len(shape_like.shape))) @@ -92,7 +93,7 @@ def _compute(*idxs): indices.append(idxs[i]) axis_index += 1 return a(*indices) - return tvm.compute(shape_like.shape, _compute) + return te.compute(shape_like.shape, _compute) def transpose(a, axes=None): @@ -100,7 +101,7 @@ def transpose(a, axes=None): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be expanded. axes: tuple of ints, optional @@ -108,7 +109,7 @@ def transpose(a, axes=None): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.transpose(a, axes) @@ -118,7 +119,7 @@ def flip(a, axis=0): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be expanded. axis : int, optional @@ -126,7 +127,7 @@ def flip(a, axis=0): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.flip(a, axis) @@ -135,7 +136,7 @@ def strided_slice(a, begin, end, strides=None): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be sliced. begin: list of int @@ -151,38 +152,38 @@ def strided_slice(a, begin, end, strides=None): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ if strides is None: strides = [] return cpp.strided_slice(a, begin, end, strides) -@tvm.tag_scope(tag=tag.INJECTIVE+",strided_set") +@tvm.te.tag_scope(tag=tag.INJECTIVE+",strided_set") def strided_set(a, v, begin, end, strides=None): """Set slice of an array. Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be sliced. - v : tvm.Tensor + v : tvm.te.Tensor The values to set - begin: tvm.Tensor + begin: tvm.te.Tensor The indices to begin with in the slicing. 
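# [Editor's illustrative sketch, not part of the patch] The scalar special case
# in expand_like above: a 1-element tensor broadcast to the target shape with
# te.compute, with the compute indices ignored.
from tvm import te

a = te.placeholder((1,), name="a")
like = te.placeholder((2, 3), name="like")
out = te.compute(like.shape, lambda *idxs: a(0))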
- end: tvm.Tensor + end: tvm.te.Tensor Indicies indicating end of the slice. - strides: tvm.Tensor, optional + strides: tvm.te.Tensor, optional Specifies the stride values, it can be negative in that case, the input tensor will be reversed in that particular axis. Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ n = len(a.shape) @@ -201,38 +202,38 @@ def strided_set(a, v, begin, end, strides=None): raise TypeError("strides should be int32") def _max(a, b): - return tvm.expr.Select(a > b, a, b) + return tvm.tir.Select(a > b, a, b) if strides is None: - strides = [tvm.const(1, 'int32')] * n + strides = [tvm.tir.const(1, 'int32')] * n else: - strides = [tvm.if_then_else(strides.shape[0] > i, - strides[i], - tvm.const(1, 'int32')) + strides = [tvm.tir.if_then_else(strides.shape[0] > i, + strides[i], + tvm.tir.const(1, 'int32')) for i in range(n)] - begin = [tvm.if_then_else(begin.shape[0] > i, - begin[i], - tvm.expr.Select(strides[i] > 0, - tvm.const(0, 'int32'), - a.shape[i])) + begin = [tvm.tir.if_then_else(begin.shape[0] > i, + begin[i], + tvm.tir.Select(strides[i] > 0, + tvm.tir.const(0, 'int32'), + a.shape[i])) for i in range(n)] - end = [tvm.if_then_else(end.shape[0] > i, - end[i], - tvm.expr.Select(strides[i] > 0, - a.shape[i] + 1, - -(a.shape[i] + 1))) + end = [tvm.tir.if_then_else(end.shape[0] > i, + end[i], + tvm.tir.Select(strides[i] > 0, + a.shape[i] + 1, + -(a.shape[i] + 1))) for i in range(n)] # Convert negative indexes for i in range(n): - begin[i] = tvm.if_then_else(begin[i] < 0, - begin[i] + a.shape[i], - begin[i]) - end[i] = tvm.if_then_else(end[i] < 0, - end[i] + a.shape[i], - end[i]) + begin[i] = tvm.tir.if_then_else(begin[i] < 0, + begin[i] + a.shape[i], + begin[i]) + end[i] = tvm.tir.if_then_else(end[i] < 0, + end[i] + a.shape[i], + end[i]) def _select(*indices): from_val = [] @@ -241,9 +242,9 @@ def _select(*indices): from_val.append(within_index(begin[i], end[i], strides[i], indices[i])) index_tuple.append( make_idx(begin[i], end[i], strides[i], a.shape[i], indices[i])) - return tvm.if_then_else(tvm.all(*from_val), v(*index_tuple), a(*indices)) + return tvm.tir.if_then_else(tvm.tir.all(*from_val), v(*index_tuple), a(*indices)) - return tvm.compute(a.shape, _select, name="strided_set") + return te.compute(a.shape, _select, name="strided_set") def reshape(a, newshape): @@ -251,14 +252,14 @@ def reshape(a, newshape): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be reshaped newshape : tuple of ints The new shape Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.reshape(a, newshape) @@ -268,7 +269,7 @@ def squeeze(a, axis=None): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor axis : None or int or tuple of ints, optional Selects a subset of the single-dimensional entries in the shape. @@ -276,7 +277,7 @@ def squeeze(a, axis=None): Returns ------- - squeezed : tvm.Tensor + squeezed : tvm.te.Tensor """ return cpp.squeeze(a, axis) @@ -286,7 +287,7 @@ def concatenate(a_tuple, axis=0): Parameters ---------- - a_tuple : tuple of tvm.Tensor + a_tuple : tuple of tvm.te.Tensor The arrays to concatenate axis : int, optional @@ -294,7 +295,7 @@ def concatenate(a_tuple, axis=0): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.concatenate(a_tuple, axis) @@ -304,7 +305,7 @@ def stack(a, axis): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be stacked. 
axis : int, optional @@ -313,7 +314,7 @@ def stack(a, axis): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.stack(a, axis) @@ -323,7 +324,7 @@ def split(ary, indices_or_sections, axis=0): Parameters ---------- - ary : tvm.Tensor + ary : tvm.te.Tensor indices_or_sections : int or 1-D array @@ -331,7 +332,7 @@ def split(ary, indices_or_sections, axis=0): Returns ------- - ret : tuple of tvm.Tensor + ret : tuple of tvm.te.Tensor """ return cpp.split(ary, indices_or_sections, axis) @@ -341,10 +342,10 @@ def take(a, indices, axis=None, mode="clip"): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The source array. - indices : tvm.Tensor + indices : tvm.te.Tensor The indices of the values to extract. axis : int, optional @@ -359,7 +360,7 @@ def take(a, indices, axis=None, mode="clip"): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ if axis is None: return cpp.take(a, indices, mode) @@ -371,15 +372,15 @@ def gather_nd(a, indices): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The source array. - indices : tvm.Tensor + indices : tvm.te.Tensor The indices of the values to extract. Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.gather_nd(a, indices) @@ -444,7 +445,7 @@ def arange(start, stop=None, step=1, dtype="float32"): Returns ------- - result : tvm.Tensor + result : tvm.te.Tensor The resulting tensor. """ if stop is None: @@ -458,7 +459,7 @@ def repeat(a, repeats, axis): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be repeated. repeats: int, required @@ -469,7 +470,7 @@ def repeat(a, repeats, axis): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.repeat(a, repeats, axis) @@ -479,7 +480,7 @@ def tile(a, reps): Parameters ---------- - a : tvm.Tensor + a : tvm.te.Tensor The tensor to be tiled. reps: tuple of ints, required @@ -487,7 +488,7 @@ def tile(a, reps): Returns ------- - ret : tvm.Tensor + ret : tvm.te.Tensor """ return cpp.tile(a, reps) @@ -497,7 +498,7 @@ def layout_transform(array, src_layout, dst_layout): Parameters ---------- - array : tvm.Tensor + array : tvm.te.Tensor The source array. src_layout : str @@ -514,7 +515,7 @@ def shape(array, dtype="int32"): Parameters ---------- - array : tvm.Tensor + array : tvm.te.Tensor The source tensor. dtype : str, optional @@ -522,7 +523,7 @@ def shape(array, dtype="int32"): Returns ------- - result : tvm.Tensor + result : tvm.te.Tensor The resulting tensor. """ return cpp.shape(array, dtype) @@ -543,11 +544,11 @@ def sequence_mask(data, valid_length, mask_value=0, axis=0): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor N-D with shape [MAX_LENGTH, batch_size, ...] or [batch_size, MAX_LENGTH, ...] depending on the value of `axis`. - valid_length : tvm.Tensor + valid_length : tvm.te.Tensor 1-D with shape [batch_size,] mask_value : float, optional @@ -558,7 +559,7 @@ def sequence_mask(data, valid_length, mask_value=0, axis=0): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor N-D with shape [MAX_LENGTH, batch_size, ...] or [batch_size, MAX_LENGTH, ...] depending on the value of `axis`. """ @@ -574,7 +575,7 @@ def ndarray_size(array, dtype="int32"): Parameters ---------- - array : tvm.Tensor + array : tvm.te.Tensor The source tensor. dtype : str, optional @@ -582,7 +583,7 @@ def ndarray_size(array, dtype="int32"): Returns ------- - result : tvm.Tensor + result : tvm.te.Tensor The resulting tensor. 
""" return cpp.ndarray_size(array, dtype) @@ -593,18 +594,18 @@ def where(condition, x, y): Parameters ---------- - condition : tvm.Tensor + condition : tvm.te.Tensor The condition array. - x : tvm.Tensor + x : tvm.te.Tensor First array to be selected. - y : tvm.Tensor + y : tvm.te.Tensor Second array to be selected. Returns ------- - result : tvm.Tensor + result : tvm.te.Tensor A Tensor selected from x or y depending on condition. """ return cpp.where(condition, x, y) @@ -617,13 +618,13 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype): Parameters ---------- - indices : tvm.Tensor + indices : tvm.te.Tensor Locations to set to on_value. - on_value : tvm.Tensor + on_value : tvm.te.Tensor Value to fill at indices. - off_value : tvm.Tensor + off_value : tvm.te.Tensor Value to fill at all other positions besides indices. depth : int diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index c4c3ee6dd37c..681535761f83 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -20,7 +20,8 @@ from numbers import Integral import tvm -from tvm.api import layout, bijective_layout +from tvm import te +from tvm.tir import layout, bijective_layout from . import tag, cpp class InvalidShapeError(ValueError): @@ -56,7 +57,7 @@ def _traverse(op): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): _traverse(tensor.op) callback(op) @@ -77,7 +78,7 @@ def prod(x): The result value """ if not x: - return tvm.const(1, "int32") + return tvm.tir.const(1, "int32") res = x[0] for i in range(1, len(x)): res = res * x[i] @@ -99,9 +100,9 @@ def get_const_int(expr): """ if isinstance(expr, Integral): return expr - if not isinstance(expr, tvm.expr.IntImm): - expr = tvm.ir_pass.Simplify(expr) - if not isinstance(expr, tvm.expr.IntImm): + if not isinstance(expr, tvm.tir.IntImm): + expr = tvm.tir.ir_pass.Simplify(expr) + if not isinstance(expr, tvm.tir.IntImm): raise ValueError("Expect value to be constant int") return int(expr.value) @@ -121,9 +122,9 @@ def get_const_float(expr): """ if isinstance(expr, float): return float(expr) - if not isinstance(expr, tvm.expr.FloatImm): - expr = tvm.ir_pass.Simplify(expr) - if not isinstance(expr, tvm.expr.FloatImm): + if not isinstance(expr, tvm.tir.FloatImm): + expr = tvm.tir.ir_pass.Simplify(expr) + if not isinstance(expr, tvm.tir.FloatImm): raise ValueError("Expect value to be constant float") return float(expr.value) @@ -143,9 +144,9 @@ def equal_const_int(expr, value): """ if isinstance(expr, Integral): return expr == value - if not isinstance(expr, tvm.expr.IntImm): - expr = tvm.ir_pass.Simplify(expr) - if not isinstance(expr, tvm.expr.IntImm): + if not isinstance(expr, tvm.tir.IntImm): + expr = tvm.tir.ir_pass.Simplify(expr) + if not isinstance(expr, tvm.tir.IntImm): return False return expr.value == value @@ -165,11 +166,11 @@ def get_const_tuple(in_tuple): """ ret = [] for elem in in_tuple: - if isinstance(elem, tvm.expr.Var): + if isinstance(elem, tvm.tir.Var): ret.append(elem) - elif not isinstance(elem, (tvm.expr.IntImm, int)): - elem = tvm.ir_pass.Simplify(elem) - if not isinstance(elem, tvm.expr.IntImm): + elif not isinstance(elem, (tvm.tir.IntImm, int)): + elem = tvm.tir.ir_pass.Simplify(elem) + if not isinstance(elem, tvm.tir.IntImm): ret.append(elem) else: ret.append(get_const_int(elem)) @@ -205,7 +206,7 @@ def simplify(expr): out : Expr or int The simplified output """ - return tvm.ir_pass.Simplify(expr) 
if isinstance(expr, tvm.expr.PrimExpr) else expr + return tvm.tir.ir_pass.Simplify(expr) if isinstance(expr, tvm.tir.PrimExpr) else expr def ravel_index(indices, shape): @@ -213,7 +214,7 @@ def ravel_index(indices, shape): Parameters ---------- - indices : tuple of int or tvm.expr.IntImm + indices : tuple of int or tvm.tir.IntImm The input coordinates shape : tuple of int @@ -238,7 +239,7 @@ def unravel_index(idx, shape): Parameters ---------- - idx : int or tvm.expr.IntImm + idx : int or tvm.tir.IntImm The 1D index shape : tuple of int @@ -246,11 +247,11 @@ def unravel_index(idx, shape): Returns ------- - indices : tuple of int or tvm.expr.IntImm + indices : tuple of int or tvm.tir.IntImm Corresponding coordinate of the 1D index """ - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod indices = [] for i in range(len(shape) - 1, -1, -1): indices.append(idxm(idx, shape[i])) @@ -276,18 +277,18 @@ def const_matrix(matrix, name="const_matrix"): """ row, col = matrix.shape dtype = str(matrix.dtype) - idxm = tvm.indexmod + idxm = tvm.tir.indexmod def select_array(i, j): - now = tvm.const(0.0, dtype) + now = tvm.tir.const(0.0, dtype) for ii in range(row): for jj in range(col): - now = tvm.expr.Select(tvm.all(idxm(i, row) == ii, idxm(j, col) == jj), - tvm.const(matrix[ii][jj], dtype), - now) + now = tvm.tir.Select(tvm.tir.all(idxm(i, row) == ii, idxm(j, col) == jj), + tvm.tir.const(matrix[ii][jj], dtype), + now) return now - return tvm.compute(matrix.shape, select_array, name=name) + return te.compute(matrix.shape, select_array, name=name) def get_max_power2_factor(n, max_value=None): @@ -349,7 +350,7 @@ def get_shape(src_shape, src_layout, dst_layout): layout_mapping = bijective_layout(src_layout, dst_layout) dst_indices = layout_mapping.forward_index( - tvm.convert(list(range(len(src_layout))))) + tvm.runtime.convert(list(range(len(src_layout))))) return get_const_tuple(tuple([src_shape[i.value] for i in dst_indices])) @@ -377,12 +378,12 @@ def within_index(b, e, s, i): bool expression that is True is the array position would be selected by the index and False otherwise """ - bc = tvm.expr.Select(s < 0, i <= e, i < b) - ec = tvm.expr.Select(s < 0, i > b, i >= e) - ss = tvm.if_then_else(s < 0, - ((i - e) + (e % tvm.abs(s)) + 1) % tvm.abs(s), - (i - b) % s) - return tvm.expr.Select(tvm.expr.Or(bc, ec), tvm.const(False), ss.equal(0)) + bc = tvm.tir.Select(s < 0, i <= e, i < b) + ec = tvm.tir.Select(s < 0, i > b, i >= e) + ss = te.if_then_else(s < 0, + ((i - e) + (e % te.abs(s)) + 1) % te.abs(s), + (i - b) % s) + return tvm.tir.Select(tvm.tir.Or(bc, ec), tvm.tir.const(False), ss.equal(0)) def make_idx(b, e, s, z, i): @@ -414,16 +415,16 @@ def make_idx(b, e, s, z, i): postion: Expr int expression that corresponds to an array position in the selection. 
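# [Editor's illustrative sketch, not part of the patch] unravel_index above on
# concrete values: indexmod peels the least-significant axis, indexdiv shifts
# to the next one; the helper reverses the collected list at the end.
import tvm

idx = tvm.tir.const(7, "int32")
j = tvm.tir.indexmod(idx, 4)   # 7 %  4 -> 3
i = tvm.tir.indexdiv(idx, 4)   # 7 // 4 -> 1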
""" - bc = tvm.expr.Select(s < 0, i <= e, i < b) - ec = tvm.expr.Select(s < 0, i > b, i >= e) + bc = tvm.tir.Select(s < 0, i <= e, i < b) + ec = tvm.tir.Select(s < 0, i > b, i >= e) # Clamp to array size - b = tvm.expr.Select(z < b, z - 1, b) + b = tvm.tir.Select(z < b, z - 1, b) - ss = tvm.if_then_else(s < 0, - (b - i) // tvm.abs(s), - (i - b) // s) - return tvm.if_then_else(tvm.expr.Or(bc, ec), 88, ss) + ss = tvm.tir.if_then_else(s < 0, + (b - i) // te.abs(s), + (i - b) // s) + return tvm.tir.if_then_else(tvm.tir.Or(bc, ec), 88, ss) def is_empty_shape(shape): diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index c171f8ca5fe3..d95ca75a2d1b 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -17,6 +17,7 @@ # pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args """Non-maximum suppression operator""" import tvm +from tvm import te from tvm import hybrid from ..sort import argsort @@ -28,16 +29,16 @@ def hybrid_rearrange_out(data, one): Parameters ---------- - data : tvm.Tensor or numpy NDArray + data : tvm.te.Tensor or numpy NDArray NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. - one: tvm.const + one: tvm.tir.const Constant one with the same dtype as data. Returns ------- - output : tvm.Tensor or numpy NDArray + output : tvm.te.Tensor or numpy NDArray Transformed NMS output. 3-D tensor with shape [batch_size, num_anchors, 6]. """ @@ -70,28 +71,28 @@ def hybrid_get_valid_counts(data, score_threshold, id_index, score_index, one): Parameters ---------- - data : tvm.Tensor or numpy NDArray + data : tvm.te.Tensor or numpy NDArray Input data. 3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. - score_threshold : tvm.const + score_threshold : tvm.tir.const Lower limit of score for valid bounding boxes. - id_index : tvm.const + id_index : tvm.tir.const index of the class categories, -1 to disable. - score_index: tvm.const + score_index: tvm.tir.const Index of the scores/confidence of boxes. - one: tvm.const + one: tvm.tir.const Constant one with the same dtype as data. Returns ------- - out_tensor : tvm.Tensor or numpy NDArray + out_tensor : tvm.te.Tensor or numpy NDArray Rearranged data tensor. - valid_count : tvm.Tensor or numpy NDArray + valid_count : tvm.te.Tensor or numpy NDArray 1-D tensor for valid number of boxes. """ batch_size = data.shape[0] @@ -123,7 +124,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor Input data. 3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. @@ -138,18 +139,18 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): Returns ------- - out_tensor : tvm.Tensor + out_tensor : tvm.te.Tensor Rearranged data tensor. - valid_count : tvm.Tensor + valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. 
""" - score_threshold_const = tvm.const(score_threshold, data.dtype) - id_index_const = tvm.const(id_index, "int32") - score_index_const = tvm.const(score_index, "int32") + score_threshold_const = tvm.tir.const(score_threshold, data.dtype) + id_index_const = tvm.tir.const(id_index, "int32") + score_index_const = tvm.tir.const(score_index, "int32") return hybrid_get_valid_counts(data, score_threshold_const, id_index_const, score_index_const, - tvm.const(1, data.dtype)) + tvm.tir.const(1, data.dtype)) @hybrid.script @@ -160,51 +161,51 @@ def hybrid_nms(data, sorted_index, valid_count, Parameters ---------- - data: tvm.Tensor or numpy NDArray + data: tvm.te.Tensor or numpy NDArray Bounding boxes with class and score. 3-D tensor with shape [batch_size, num_anchors, 6]. - sorted_index : tvm.Tensor or numpy NDArray + sorted_index : tvm.te.Tensor or numpy NDArray Bounding box indexes sorted by score, with shape [batch_size, num_anchors]. - valid_count : tvm.Tensor or numpy NDArray + valid_count : tvm.te.Tensor or numpy NDArray 1-D tensor for valid number of boxes. - max_output_size : tvm.const + max_output_size : tvm.tir.const Max number of output valid boxes for each instance. By default all valid boxes are returned. - iou_threshold : tvm.const + iou_threshold : tvm.tir.const Overlapping(IoU) threshold to suppress object with smaller score. - force_suppress : tvm.const + force_suppress : tvm.tir.const Whether to suppress all detections regardless of class_id. - top_k : tvm.const + top_k : tvm.tir.const Keep maximum top k detections before nms, -1 for no limit. - coord_start : tvm.const + coord_start : tvm.tir.const Start index of the consecutive 4 coordinates. - id_index : tvm.const + id_index : tvm.tir.const index of the class categories, -1 to disable. - score_index: tvm.const + score_index: tvm.tir.const Index of the scores/confidence of boxes. - zero: tvm.const + zero: tvm.tir.const Constant zero with the same dtype as data. - one: tvm.const + one: tvm.tir.const Constant one with the same dtype as data. Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. - box_indices: tvm.Tensor + box_indices: tvm.te.Tensor 2-D tensor with shape [batch_size, num_anchors]. """ batch_size = data.shape[0] @@ -297,10 +298,10 @@ def non_max_suppression(data, valid_count, max_output_size=-1, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. - valid_count : tvm.Tensor + valid_count : tvm.te.Tensor 1-D tensor for valid number of boxes. max_output_size : optional, int @@ -333,7 +334,7 @@ def non_max_suppression(data, valid_count, max_output_size=-1, Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. 
Example @@ -342,8 +343,8 @@ def non_max_suppression(data, valid_count, max_output_size=-1, # An example to use non_max_suppression dshape = (1, 5, 6) - data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + data = te.placeholder(dshape, name="data") + valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count") iou_threshold = 0.7 force_suppress = True top_k = -1 @@ -363,19 +364,19 @@ def non_max_suppression(data, valid_count, max_output_size=-1, num_anchors = data.shape[1] score_axis = score_index score_shape = (batch_size, num_anchors) - score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) + score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis]) sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) out, box_indices = hybrid_nms(data, sort_tensor, valid_count, - tvm.const(max_output_size, dtype="int32"), - tvm.const(iou_threshold, dtype=data.dtype), - tvm.const(force_suppress, dtype="bool"), - tvm.const(top_k, dtype="int32"), - tvm.const(coord_start, dtype="int32"), - tvm.const(id_index, dtype="int32"), - tvm.const(score_index, dtype="int32"), - zero=tvm.const(0, dtype=data.dtype), - one=tvm.const(1, dtype=data.dtype)) + tvm.tir.const(max_output_size, dtype="int32"), + tvm.tir.const(iou_threshold, dtype=data.dtype), + tvm.tir.const(force_suppress, dtype="bool"), + tvm.tir.const(top_k, dtype="int32"), + tvm.tir.const(coord_start, dtype="int32"), + tvm.tir.const(id_index, dtype="int32"), + tvm.tir.const(score_index, dtype="int32"), + zero=tvm.tir.const(0, dtype=data.dtype), + one=tvm.tir.const(1, dtype=data.dtype)) if not return_indices and invalid_to_bottom: - out = hybrid_rearrange_out(out, one=tvm.const(1, dtype=data.dtype)) + out = hybrid_rearrange_out(out, one=tvm.tir.const(1, dtype=data.dtype)) return box_indices if return_indices else out diff --git a/topi/python/topi/vision/rcnn/proposal.py b/topi/python/topi/vision/rcnn/proposal.py index 5de4998c066c..23bd24d22fb3 100644 --- a/topi/python/topi/vision/rcnn/proposal.py +++ b/topi/python/topi/vision/rcnn/proposal.py @@ -18,6 +18,7 @@ """Proposal operator""" import math import tvm +from tvm import te from ...util import get_const_tuple, get_const_int from ...sort import argsort @@ -43,8 +44,8 @@ def reg_bbox(x1, y1, x2, y2, dx, dy, dw, dh): pred_ctr_x = dx * bbox_w + ctr_x pred_ctr_y = dy * bbox_h + ctr_y - pred_w = tvm.exp(dw) * bbox_w - pred_h = tvm.exp(dh) * bbox_h + pred_w = te.exp(dw) * bbox_w + pred_h = te.exp(dh) * bbox_h pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0) pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0) @@ -67,16 +68,16 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r Parameters ---------- - cls_prob_buf : tvm.schedule.Buffer + cls_prob_buf : tvm.te.schedule.Buffer 4-D with shape [batch, 2 * num_anchors, height, width] - bbox_pred_buf : tvm.schedule.Buffer + bbox_pred_buf : tvm.te.schedule.Buffer 4-D with shape [batch, 4 * num_anchors, height, width] - im_info_buf : tvm.schedule.Buffer + im_info_buf : tvm.te.schedule.Buffer 2-D with shape [batch, 3] - out_buf : tvm.schedule.Buffer + out_buf : tvm.te.schedule.Buffer 3-D with shape [batch, num_bbox, 5] The last dimension is in format of [w_start, h_start, w_end, h_end, score] @@ -103,15 +104,15 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r """ batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape) num_anchors //= 2 - ib = 
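# [Editor's illustrative sketch, not part of the patch] The bbox regression
# above after the rename: te.exp (formerly tvm.exp) turns predicted log-extents
# back into widths and heights. Variable names are illustrative.
import tvm
from tvm import te

dw = tvm.tir.Var("dw", "float32")
bbox_w = tvm.tir.Var("bbox_w", "float32")
pred_w = te.exp(dw) * bbox_w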
tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() p_score = ib.buffer_ptr(cls_prob_buf) p_delta = ib.buffer_ptr(bbox_pred_buf) p_im_info = ib.buffer_ptr(im_info_buf) p_out = ib.buffer_ptr(out_buf) - idxm = tvm.indexmod - idxd = tvm.indexdiv + idxm = tvm.tir.indexmod + idxd = tvm.tir.indexdiv with ib.for_range(0, batch * height * width) as tid: w = idxm(tid, width) @@ -135,10 +136,10 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r regression_func = reg_iou if iou_loss else reg_bbox pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta) - pred_x1 = tvm.max(tvm.min(pred_x1, im_width - 1.0), 0.0) - pred_y1 = tvm.max(tvm.min(pred_y1, im_height - 1.0), 0.0) - pred_x2 = tvm.max(tvm.min(pred_x2, im_width - 1.0), 0.0) - pred_y2 = tvm.max(tvm.min(pred_y2, im_height - 1.0), 0.0) + pred_x1 = tvm.te.max(tvm.te.min(pred_x1, im_width - 1.0), 0.0) + pred_y1 = tvm.te.max(tvm.te.min(pred_y1, im_height - 1.0), 0.0) + pred_x2 = tvm.te.max(tvm.te.min(pred_x2, im_width - 1.0), 0.0) + pred_y2 = tvm.te.max(tvm.te.min(pred_y2, im_height - 1.0), 0.0) real_height = (im_height / feature_stride).astype('int32') real_width = (im_width / feature_stride).astype('int32') @@ -148,15 +149,15 @@ def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, r min_size = p_im_info[b * 3 + 2] * rpn_min_size pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w] - pred_score = tvm.expr.Select(tvm.any(h >= real_height, w >= real_width), - -1.0, pred_score) + pred_score = tvm.tir.Select(tvm.tir.any(h >= real_height, w >= real_width), + -1.0, pred_score) p_out[out_index * 5 + 0] = pred_x1 p_out[out_index * 5 + 1] = pred_y1 p_out[out_index * 5 + 2] = pred_x2 p_out[out_index * 5 + 3] = pred_y2 p_out[out_index * 5 + 4] = pred_score - with ib.if_scope(tvm.any(bbox_w < min_size, bbox_h < min_size)): + with ib.if_scope(tvm.tir.any(bbox_w < min_size, bbox_h < min_size)): p_out[out_index * 5 + 0] -= min_size / 2.0 p_out[out_index * 5 + 1] -= min_size / 2.0 p_out[out_index * 5 + 2] += min_size / 2.0 @@ -171,10 +172,10 @@ def argsort_ir(data_buf, out_index_buf): Parameters ---------- - data_buf : tvm.schedule.Buffer + data_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox] - out_index_buf : tvm.schedule.Buffer + out_index_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox]. Indices of data in sorted order. Returns @@ -183,12 +184,12 @@ def argsort_ir(data_buf, out_index_buf): The result IR statement. 
""" batch, num_bbox = get_const_tuple(data_buf.shape) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() p_data = ib.buffer_ptr(data_buf) index_out = ib.buffer_ptr(out_index_buf) temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") - idxm = tvm.indexmod + idxm = tvm.tir.indexmod with ib.for_range(0, batch, for_type="unroll") as b: start = b * num_bbox for i in range(2): @@ -199,8 +200,8 @@ def argsort_ir(data_buf, out_index_buf): with ib.for_range(0, num_bbox) as k: with ib.for_range(0, (num_bbox + 1) // 2) as tid: offset = start + 2 * tid + idxm(k, 2) - with ib.if_scope(tvm.all(offset + 1 < num_bbox, - p_data[offset] < p_data[offset + 1])): + with ib.if_scope(tvm.tir.all(offset + 1 < num_bbox, + p_data[offset] < p_data[offset + 1])): temp_data[0] = p_data[offset] p_data[offset] = p_data[offset + 1] p_data[offset + 1] = temp_data[0] @@ -215,11 +216,11 @@ def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): Parameters ---------- - sorted_bbox_buf : tvm.schedule.Buffer + sorted_bbox_buf : tvm.te.schedule.Buffer 3-D with shape [batch, num_bbox, 5]. The last dimension is in format of [w_start, h_start, w_end, h_end, score]. - out_buf : tvm.schedule.Buffer + out_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed. nms_threshold : float @@ -233,10 +234,10 @@ def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): def calculate_overlap(out_tensor, box_a_idx, box_b_idx): """Calculate overlap of two boxes. """ - w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0) - h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0) + w = tvm.te.max(0.0, tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) + - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) + 1.0) + h = tvm.te.max(0.0, tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) + - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) + 1.0) i = w * h u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0) * \ (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0) + \ @@ -245,7 +246,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): return i / u batch, num_bbox = get_const_tuple(out_buf.shape) - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() p_data = ib.buffer_ptr(sorted_bbox_buf) p_out = ib.buffer_ptr(out_buf) with ib.for_range(0, batch, for_type="unroll", name="n") as b: @@ -254,7 +255,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): p_out[base_idx + i] = False with ib.for_range(0, num_bbox - 1) as l: with ib.for_range(0, num_bbox) as i: - with ib.if_scope(tvm.all(i < num_bbox, i > l, p_out[base_idx + l] == False)): + with ib.if_scope(tvm.tir.all(i < num_bbox, i > l, p_out[base_idx + l] == False)): iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5) with ib.if_scope(iou > nms_threshold): p_out[base_idx + i] = True @@ -266,14 +267,14 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): Parameters ---------- - sorted_bbox_buf : tvm.schedule.Buffer + sorted_bbox_buf : tvm.te.schedule.Buffer 3-D with shape [batch, num_bbox, 5]. The last dimension is in format of [w_start, h_start, w_end, h_end, score]. 
- remove_mask_buf : tvm.schedule.Buffer + remove_mask_buf : tvm.te.schedule.Buffer 2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed. - out_buf : tvm.schedule.Buffer + out_buf : tvm.te.schedule.Buffer 2-D with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of [batch_index, w_start, h_start, w_end, h_end]. @@ -284,7 +285,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): """ batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape) rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() i = ib.allocate('int32', (batch,), 'i', scope='local') p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf) p_remove = ib.buffer_ptr(remove_mask_buf) @@ -302,14 +303,14 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): nkeep[b] += 1 with ib.for_range(0, batch) as b: with ib.if_scope(nkeep[b] > 0): - with ib.for_range(0, tvm.ceil( - tvm.const(rpn_post_nms_top_n, 'float32') / nkeep[b]).astype('int32')): + with ib.for_range(0, te.ceil( + tvm.tir.const(rpn_post_nms_top_n, 'float32') / nkeep[b]).astype('int32')): with ib.for_range(0, num_bbox) as j: offset_j = (b * num_bbox + j) * 5 offset_i = (b * rpn_post_nms_top_n + i[b]) * 5 - with ib.if_scope(tvm.all(i[b] < rpn_post_nms_top_n, - p_remove[(b*num_bbox+j)] == False)): - p_out[offset_i] = tvm.expr.Cast('float32', b) + with ib.if_scope(tvm.tir.all(i[b] < rpn_post_nms_top_n, + p_remove[(b*num_bbox+j)] == False)): + p_out[offset_i] = tvm.tir.Cast('float32', b) with ib.for_range(0, 4, for_type='unroll') as k: p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k] i[b] = i[b] + 1 @@ -324,13 +325,13 @@ def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, thres Parameters ---------- - cls_prob : tvm.Tensor + cls_prob : tvm.te.Tensor 4-D with shape [batch, 2 * num_anchors, height, width] - bbox_pred : tvm.Tensor + bbox_pred : tvm.te.Tensor 4-D with shape [batch, 4 * num_anchors, height, width] - im_info : tvm.Tensor + im_info : tvm.te.Tensor 2-D with shape [batch, 3] scales : list/tuple of float @@ -360,7 +361,7 @@ def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, thres Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of [batch_index, w_start, h_start, w_end, h_end]. 
""" @@ -370,20 +371,20 @@ def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, thres num_bbox = height * width * num_anchors rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox - bbox = tvm.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info], lambda ins, outs: - predict_bbox_ir(ins[0], ins[1], ins[2], outs[0], scales, ratios, - feature_stride, rpn_min_size, iou_loss), - dtype=bbox_pred.dtype) - score = tvm.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score') + bbox = te.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info], lambda ins, outs: + predict_bbox_ir(ins[0], ins[1], ins[2], outs[0], scales, ratios, + feature_stride, rpn_min_size, iou_loss), + dtype=bbox_pred.dtype) + score = te.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score') valid_count_shape = (1,) - valid_count = tvm.compute(valid_count_shape, lambda i: num_bbox) + valid_count = te.compute(valid_count_shape, lambda i: num_bbox) sorted_index = argsort(score, valid_count=valid_count, axis=1, is_ascend=False) - sorted_bbox = tvm.compute((batch, rpn_pre_nms_top_n, 5), - lambda b, i, j: bbox[b, sorted_index[b, i], j], tag='sorted_bbox') - nms_remove_mask = tvm.extern((batch, rpn_pre_nms_top_n), [sorted_bbox], - lambda ins, outs: nms_ir(ins[0], outs[0], threshold), - dtype='bool') - nms_out = tvm.extern((batch * rpn_post_nms_top_n, 5), [sorted_bbox, nms_remove_mask], - lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]), - dtype=sorted_bbox.dtype) + sorted_bbox = te.compute((batch, rpn_pre_nms_top_n, 5), + lambda b, i, j: bbox[b, sorted_index[b, i], j], tag='sorted_bbox') + nms_remove_mask = te.extern((batch, rpn_pre_nms_top_n), [sorted_bbox], + lambda ins, outs: nms_ir(ins[0], outs[0], threshold), + dtype='bool') + nms_out = te.extern((batch * rpn_post_nms_top_n, 5), [sorted_bbox, nms_remove_mask], + lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]), + dtype=sorted_bbox.dtype) return nms_out diff --git a/topi/python/topi/vision/rcnn/roi_align.py b/topi/python/topi/vision/rcnn/roi_align.py index a0bc5e291597..9aa1ef9c75a2 100644 --- a/topi/python/topi/vision/rcnn/roi_align.py +++ b/topi/python/topi/vision/rcnn/roi_align.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """Roi align operator""" import tvm +from tvm import te from ...util import get_const_tuple from ...cpp.util import bilinear_sample_nchw @@ -26,10 +27,10 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, channel, height, width] - rois : tvm.Tensor + rois : tvm.te.Tensor 2-D with shape [num_roi, 5]. 
The last dimension should be in format of [batch_index, w_start, h_start, w_end, h_end] @@ -45,7 +46,7 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [num_roi, channel, pooled_size, pooled_size] """ dtype = rois.dtype @@ -58,11 +59,11 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): pooled_size_h, pooled_size_w = pooled_size def _bilinear(i, c, y, x): - outside = tvm.any(y < -1.0, x < -1.0, y > height, x > width) - y = tvm.max(y, 0.0) - x = tvm.max(x, 0.0) + outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width) + y = tvm.te.max(y, 0.0) + x = tvm.te.max(x, 0.0) val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1) - return tvm.if_then_else(outside, 0.0, val) + return tvm.tir.if_then_else(outside, 0.0, val) def _sample(i, c, ph, pw): roi = rois[i] @@ -74,27 +75,27 @@ def _sample(i, c, ph, pw): roi_end_w *= spatial_scale # force malformed ROIs to be 1x1 - roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype)) - roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype)) + roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype)) + roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype)) bin_h = roi_h / pooled_size_h bin_w = roi_w / pooled_size_w if sample_ratio > 0: - roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32') + roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, 'int32') else: - roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32') - roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32') + roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype('int32') + roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype('int32') count = roi_bin_grid_h * roi_bin_grid_w - rh = tvm.reduce_axis((0, roi_bin_grid_h)) - rw = tvm.reduce_axis((0, roi_bin_grid_w)) + rh = te.reduce_axis((0, roi_bin_grid_h)) + rw = te.reduce_axis((0, roi_bin_grid_w)) roi_start_h += ph * bin_h roi_start_w += pw * bin_w - return tvm.sum(_bilinear(batch_index, c, - roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, - roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count, - axis=[rh, rw]) + return te.sum(_bilinear(batch_index, c, + roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, + roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count, + axis=[rh, rw]) - return tvm.compute((num_roi, channel, pooled_size_h, pooled_size_w), _sample, - tag='pool,roi_align_nchw') + return te.compute((num_roi, channel, pooled_size_h, pooled_size_w), _sample, + tag='pool,roi_align_nchw') diff --git a/topi/python/topi/vision/rcnn/roi_pool.py b/topi/python/topi/vision/rcnn/roi_pool.py index f346f580b3ba..a206f34831a1 100644 --- a/topi/python/topi/vision/rcnn/roi_pool.py +++ b/topi/python/topi/vision/rcnn/roi_pool.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """ROI pool operator""" import tvm +from tvm import te from ...util import get_const_tuple def roi_pool_nchw(data, rois, pooled_size, spatial_scale): @@ -24,10 +25,10 @@ def roi_pool_nchw(data, rois, pooled_size, spatial_scale): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, channel, height, width] - rois : tvm.Tensor + rois : tvm.te.Tensor 2-D with shape [num_roi, 5]. 
The last dimension should be in format of [batch_index, w_start, h_start, w_end, h_end] @@ -40,7 +41,7 @@ def roi_pool_nchw(data, rois, pooled_size, spatial_scale): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [num_roi, channel, pooled_size, pooled_size] """ dtype = rois.dtype @@ -57,36 +58,36 @@ def _pool(i, c, ph, pw): batch_index = roi[0].astype('int32') roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] - roi_start_h = tvm.round(roi_start_h * spatial_scale).astype('int32') - roi_start_w = tvm.round(roi_start_w * spatial_scale).astype('int32') - roi_end_h = tvm.round(roi_end_h * spatial_scale).astype('int32') - roi_end_w = tvm.round(roi_end_w * spatial_scale).astype('int32') + roi_start_h = te.round(roi_start_h * spatial_scale).astype('int32') + roi_start_w = te.round(roi_start_w * spatial_scale).astype('int32') + roi_end_h = te.round(roi_end_h * spatial_scale).astype('int32') + roi_end_w = te.round(roi_end_w * spatial_scale).astype('int32') # force malformed ROIs to be 1x1 - roi_h = tvm.max(roi_end_h - roi_start_h + 1, tvm.const(1, 'int32')) - roi_w = tvm.max(roi_end_w - roi_start_w + 1, tvm.const(1, 'int32')) + roi_h = tvm.te.max(roi_end_h - roi_start_h + 1, tvm.tir.const(1, 'int32')) + roi_w = tvm.te.max(roi_end_w - roi_start_w + 1, tvm.tir.const(1, 'int32')) bin_h = roi_h.astype(dtype) / pooled_size_h bin_w = roi_w.astype(dtype) / pooled_size_w # use epsilon to prevent floating point precision loss in floor/ceil - epsilon = tvm.const(0.00001, dtype) - hstart = tvm.floor(ph * bin_h + epsilon).astype('int32') - wstart = tvm.floor(pw * bin_w + epsilon).astype('int32') - hend = tvm.ceil((ph + 1) * bin_h - epsilon).astype('int32') - wend = tvm.ceil((pw + 1) * bin_w - epsilon).astype('int32') - hstart = tvm.min(tvm.max(hstart + roi_start_h, 0), height) - wstart = tvm.min(tvm.max(wstart + roi_start_w, 0), width) - hend = tvm.min(tvm.max(hend + roi_start_h, 0), height) - wend = tvm.min(tvm.max(wend + roi_start_w, 0), width) + epsilon = tvm.tir.const(0.00001, dtype) + hstart = te.floor(ph * bin_h + epsilon).astype('int32') + wstart = te.floor(pw * bin_w + epsilon).astype('int32') + hend = te.ceil((ph + 1) * bin_h - epsilon).astype('int32') + wend = te.ceil((pw + 1) * bin_w - epsilon).astype('int32') + hstart = tvm.te.min(tvm.te.max(hstart + roi_start_h, 0), height) + wstart = tvm.te.min(tvm.te.max(wstart + roi_start_w, 0), width) + hend = tvm.te.min(tvm.te.max(hend + roi_start_h, 0), height) + wend = tvm.te.min(tvm.te.max(wend + roi_start_w, 0), width) - non_empty = tvm.all(hstart < hend, wstart < wend) - min_value = lambda dtype: tvm.if_then_else(non_empty, tvm.min_value(dtype), - tvm.const(0.0, dtype)) + non_empty = tvm.tir.all(hstart < hend, wstart < wend) + min_value = lambda dtype: tvm.tir.if_then_else( + non_empty, tvm.te.min_value(dtype), tvm.tir.const(0.0, dtype)) # pylint: disable=unnecessary-lambda - _max = tvm.comm_reducer(lambda x, y: tvm.max(x, y), min_value, name='max') - rh = tvm.reduce_axis((0, hend - hstart), 'rh') - rw = tvm.reduce_axis((0, wend - wstart), 'rw') + _max = te.comm_reducer(lambda x, y: tvm.te.max(x, y), min_value, name='max') + rh = te.reduce_axis((0, hend - hstart), 'rh') + rw = te.reduce_axis((0, wend - wstart), 'rw') return _max(data[batch_index, c, hstart+rh, wstart+rw], axis=[rh, rw]) - return tvm.compute((num_roi, channel, pooled_size_h, pooled_size_w), _pool, tag="pool,roi_pool") + return te.compute((num_roi, channel, pooled_size_h, pooled_size_w), _pool, tag="pool,roi_pool") diff --git 
a/topi/python/topi/vision/reorg.py b/topi/python/topi/vision/reorg.py index 3ba5e8495a22..ec790fafbddd 100644 --- a/topi/python/topi/vision/reorg.py +++ b/topi/python/topi/vision/reorg.py @@ -27,7 +27,7 @@ def reorg(data, stride): Parameters ---------- - Input : tvm.Tensor + Input : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] stride : int @@ -35,7 +35,7 @@ def reorg(data, stride): Returns ------- - Output : tvm.Tensor + Output : tvm.te.Tensor 4-D with shape [batch, out_channel, out_height, out_width] """ return cpp.vision.reorg(data, stride) diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index 4309af4303f1..8f287b949f2c 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -16,11 +16,10 @@ # under the License. # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable """SSD multibox operators""" -from __future__ import absolute_import as _abs import tvm from tvm import hybrid -from tvm.intrin import exp, sqrt +from tvm.tir import exp, sqrt import topi @@ -32,7 +31,7 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): Parameters ---------- - data : tvm.Tensor or numpy NDArray + data : tvm.te.Tensor or numpy NDArray 4-D tensor with shape [batch, channel, height, width]] sizes : tvm ConsExpr @@ -49,7 +48,7 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): Returns ------- - output : tvm.Tensor or numpy NDArray + output : tvm.te.Tensor or numpy NDArray 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ in_height = data.shape[2] @@ -80,7 +79,7 @@ def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): * sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 count = i * in_width * (num_sizes + num_ratios - 1) \ - + j * (num_sizes + num_ratios - 1) + k + + j * (num_sizes + num_ratios - 1) + k output[0, count, 0] = center_w - w output[0, count, 1] = center_h - h output[0, count, 2] = center_w + w @@ -94,7 +93,7 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, c_in, h_in, w_in]] sizes : tuple of float @@ -114,11 +113,11 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - out = hybrid_multibox_prior(data, tvm.convert(sizes), tvm.convert(ratios), - tvm.convert(steps), tvm.convert(offsets)) + out = hybrid_multibox_prior(data, tvm.runtime.convert(sizes), tvm.runtime.convert(ratios), + tvm.runtime.convert(steps), tvm.runtime.convert(offsets)) if clip: out = topi.clip(out, 0, 1) return out @@ -166,19 +165,19 @@ def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, Parameters ---------- - cls_prob : tvm.Tensor or numpy NDArray + cls_prob : tvm.te.Tensor or numpy NDArray 3-D tensor of class probabilities. - loc_pred : tvm.Tensor or numpy NDArray + loc_pred : tvm.te.Tensor or numpy NDArray 2-D tensor of location regression predictions. - anchor : tvm.Tensor or numpy NDArray + anchor : tvm.te.Tensor or numpy NDArray 3-D tensor of prior anchor boxes. - clip : tvm.const + clip : tvm.tir.const Whether to clip out-of-boundary boxes. - threshold : tvm.const + threshold : tvm.tir.const Threshold to be a positive prediction. 
variances : tvm.nd.NDArray @@ -186,10 +185,10 @@ def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, Returns ------- - out_loc : tvm.Tensor or numpy NDArray + out_loc : tvm.te.Tensor or numpy NDArray 3-D tensor of transformed location. - valid_count : tvm.Tensor or numpy NDArray + valid_count : tvm.te.Tensor or numpy NDArray 1_d tensor of valid counts for boxes. """ batch_size = cls_prob.shape[0] @@ -238,13 +237,13 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 Parameters ---------- - cls_prob : tvm.Tensor + cls_prob : tvm.te.Tensor Class probabilities. - loc_pred : tvm.Tensor + loc_pred : tvm.te.Tensor Location regression predictions. - anchor : tvm.Tensor + anchor : tvm.te.Tensor Prior anchor boxes. clip : boolean @@ -258,12 +257,12 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 Returns ------- - ret : tuple of tvm.Tensor + ret : tuple of tvm.te.Tensor """ return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, - tvm.const(clip, "bool"), - tvm.const(threshold, "float32"), - tvm.convert(variances)) + tvm.tir.const(clip, "bool"), + tvm.tir.const(threshold, "float32"), + tvm.runtime.convert(variances)) def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, force_suppress=False, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=-1): @@ -271,13 +270,13 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm Parameters ---------- - cls_prob : tvm.Tensor + cls_prob : tvm.te.Tensor Class probabilities. - loc_pred : tvm.Tensor + loc_pred : tvm.te.Tensor Location regression predictions. - anchor : tvm.Tensor + anchor : tvm.te.Tensor Prior anchor boxes. clip : boolean @@ -300,7 +299,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm Returns ------- - out : tvm.Tensor + out : tvm.te.Tensor 3-D tensor with shape (batch_size, num_anchors, 6) """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py index a7cb9e98f11f..539a918f1f87 100644 --- a/topi/python/topi/x86/batch_matmul.py +++ b/topi/python/topi/x86/batch_matmul.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name,too-many-locals,unused-variable """x86 batch_matmul operators""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cblas @@ -34,13 +33,13 @@ def batch_matmul(cfg, x, y): ---------- cfg : ConfigSpace Autotvm tuning space config file - x : tvm.Tensor + x : tvm.te.Tensor 3-D with shape [batch, M, K] - y : tvm.Tensor + y : tvm.te.Tensor 3-D with shape [batch, N, K] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 3-D with shape [batch, M, N] """ assert len(x.shape) == 3 and len( @@ -54,10 +53,10 @@ def batch_matmul(cfg, x, y): if cfg.is_fallback: _default_batch_matmul_config(cfg, M, N, K) - k = tvm.reduce_axis((0, K), name='k') - C = tvm.compute( + k = te.reduce_axis((0, K), name='k') + C = te.compute( (B, M, N), - lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), + lambda b, i, j: te.sum(x[b, i, k] * y[b, j, k], axis=k), tag='batch_matmul') return C @@ -79,7 +78,7 @@ def schedule_batch_matmul(cfg, outs): sch: Schedule The computation schedule for the op. 
""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if "batch_matmul" in op.tag: @@ -140,13 +139,13 @@ def batch_matmul_cblas(cfg, x, y): ---------- cfg : ConfigSpace Autotvm tuning space config file - x : tvm.Tensor + x : tvm.te.Tensor 3-D with shape [batch, M, K] - y : tvm.Tensor + y : tvm.te.Tensor 3-D with shape [batch, N, K] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 3-D with shape [batch, M, N] """ assert len(x.shape) == 3 and len( diff --git a/topi/python/topi/x86/binarize_pack.py b/topi/python/topi/x86/binarize_pack.py index bab91a940edc..b4a01a5d2220 100644 --- a/topi/python/topi/x86/binarize_pack.py +++ b/topi/python/topi/x86/binarize_pack.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name """Schedule for binarization and bit-packing.""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te def schedule_binarize_pack(outs): @@ -34,8 +33,8 @@ def schedule_binarize_pack(outs): s: Schedule The computation schedule for binarize_pack. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(Out): s[Out].parallel(Out.op.axis[0]) diff --git a/topi/python/topi/x86/binary_dense.py b/topi/python/topi/x86/binary_dense.py index ccf74e7bd230..d90694e819cb 100644 --- a/topi/python/topi/x86/binary_dense.py +++ b/topi/python/topi/x86/binary_dense.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for binary dense operator.""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from .. import tag @@ -35,8 +34,8 @@ def schedule_binary_dense(outs): s: Schedule The computation schedule for binary_dense. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def _schedule(A, B, C): @@ -56,7 +55,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule binary_dense elif OP.tag == 'binary_dense': diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 2ec565375654..37fe352827b0 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name,unused-variable,invalid-name """Bitserial conv2d schedule on x86""" import tvm +from tvm import te from tvm import autotvm from .. 
import tag from ..util import get_const_int, get_const_tuple @@ -94,40 +95,40 @@ def bitserial_conv2d_nchw(cfg, data, kernel, stride, padding, in_bits, weight_bi else: data_pad = data_q - data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ - data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') + data_vec = te.compute(dvshape, lambda n, h, w, ci, vh, vw, b: \ + data_pad[b][n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') if len(kernel.shape) == 4: - kernel_vec = tvm.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ - kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') + kernel_vec = te.compute(kvshape, lambda co, ci, dh, dw, b, vc: \ + kernel_q[b][co*VC+vc][ci][dh][dw], name='kernel_vec') - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - b1 = tvm.reduce_axis((0, IB), name='ib') - b2 = tvm.reduce_axis((0, KB), name='kb') + ci = te.reduce_axis((0, CI), name='ci') + dh = te.reduce_axis((0, KH), name='dh') + dw = te.reduce_axis((0, KW), name='dw') + b1 = te.reduce_axis((0, IB), name='ib') + b2 = te.reduce_axis((0, KB), name='kb') def _conv(n, co, h, w, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if unipolar: - return tvm.sum((tvm.popcount( + return te.sum((tvm.tir.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) & kernel_vec[co, ci, dh, dw, b2, vc].astype(out_dtype)) - - tvm.popcount( - data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) - & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, - axis=[ci, dh, dw, b1, b2]) + tvm.tir.popcount( + data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1].astype(out_dtype) + & ~kernel_vec[co, ci, dh, dw, b2, vc]).astype(out_dtype)) << b1b2, + axis=[ci, dh, dw, b1, b2]) - return tvm.sum((tvm.popcount( + return te.sum((tvm.tir.popcount( data_vec[n, h, w, ci, vh*HSTR+dh, vw*WSTR+dw, b1] & kernel_vec[co, ci, dh, dw, b2, vc])).astype(out_dtype) << b1b2, - axis=[ci, dh, dw, b1, b2]) + axis=[ci, dh, dw, b1, b2]) - conv = tvm.compute(ovshape, _conv, name='conv_out') - idxd = tvm.indexdiv - idxm = tvm.indexmod + conv = te.compute(ovshape, _conv, name='conv_out') + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod - return tvm.compute( + return te.compute( oshape, lambda n, co, h, w: conv[n, idxd(co, VC), idxd(h, VH), idxd(w, VW), @@ -202,38 +203,38 @@ def bitserial_conv2d_nhwc(cfg, data, kernel, stride, padding, in_bits, weight_bi else: data_pad = data_q - data_vec = tvm.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ - data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') + data_vec = te.compute(dvshape, lambda n, h, w, vh, vw, ci, b: \ + data_pad[n][h*VH*HSTR+vh][w*VW*WSTR+vw][ci][b], name='data_vec') - kernel_vec = tvm.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ - kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') + kernel_vec = te.compute(kvshape, lambda co, dh, dw, ci, vc, b: \ + kernel_q[dh][dw][ci][co*VC+vc][b], name='kernel_vec') - ci = tvm.reduce_axis((0, CI), name='ci') - dh = tvm.reduce_axis((0, KH), name='dh') - dw = tvm.reduce_axis((0, KW), name='dw') - b1 = tvm.reduce_axis((0, IB), name='ib') - b2 = tvm.reduce_axis((0, KB), name='kb') + ci = te.reduce_axis((0, CI), name='ci') + dh = te.reduce_axis((0, KH), name='dh') + dw = te.reduce_axis((0, KW), name='dw') + b1 = te.reduce_axis((0, IB), name='ib') + b2 = te.reduce_axis((0, KB), name='kb') def _conv(n, h, w, co, vh, vw, vc): b1b2 = (b1+b2).astype(out_dtype) if unipolar: - return tvm.sum( - 
((tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & - kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - - tvm.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1]& - ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), + return te.sum( + ((tvm.tir.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & + kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) - + tvm.tir.popcount(data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1]& + ~kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype)) << b1b2), axis=[dh, dw, ci, b1, b2]) - return tvm.sum(tvm.popcount( + return te.sum(tvm.tir.popcount( data_vec[n, h, w, vh*HSTR+dh, vw*WSTR+dw, ci, b1] & kernel_vec[co, dh, dw, ci, vc, b2]).astype(out_dtype) << b1b2, - axis=[dh, dw, ci, b1, b2]) + axis=[dh, dw, ci, b1, b2]) - conv = tvm.compute(ovshape, _conv, name='conv') + conv = te.compute(ovshape, _conv, name='conv') - idxd = tvm.indexdiv - idxm = tvm.indexmod - return tvm.compute( + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod + return te.compute( oshape, lambda n, h, w, co: conv[n, idxd(h, VH), idxd(w, VW), idxd(co, VC), @@ -250,7 +251,7 @@ def schedule_bitserial_conv2d_nhwc(cfg, outs): def _schedule_bitserial_conv2d(cfg, outs): """CPU schedule for bitserial convolutions NCHW and NHWC""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(op): @@ -262,7 +263,7 @@ def traverse(op): s[op].compute_inline() for tensor in op.input_tensors: if tensor.op.input_tensors and (tensor.op not in scheduled_ops): - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) elif 'spatial_bitserial_conv_nchw' in op.tag or 'spatial_bitserial_conv_nhwc' in op.tag: @@ -273,7 +274,7 @@ def traverse(op): data_q = data_vec.op.input_tensors[0] data = data_q.op.input_tensors[0] data_pad = None - if isinstance(data_q.op, tvm.tensor.ComputeOp) and "pad" in data_q.op.tag: + if isinstance(data_q.op, tvm.te.ComputeOp) and "pad" in data_q.op.tag: data_pad = data_q data_q = data data = data_q.op.input_tensors[0] @@ -320,7 +321,7 @@ def _schedule_bitserial_conv2d_nchw(cfg, s, data_q, data_pad, data_vec, VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] - ##### Schedule Data padding, and bitpacking + ##### Schedule Data padding, and bitpacking if data_pad is not None: s[data_pad].compute_inline() diff --git a/topi/python/topi/x86/bitserial_dense.py b/topi/python/topi/x86/bitserial_dense.py index d464cae951b3..cbc6ac83ffd9 100644 --- a/topi/python/topi/x86/bitserial_dense.py +++ b/topi/python/topi/x86/bitserial_dense.py @@ -18,6 +18,7 @@ """Schedule for bitserial dense operator.""" from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from topi.util import get_const_int, get_const_tuple from .. 
import tag @@ -30,14 +31,14 @@ def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint3 Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 2-D with shape [batch, in_dim] - weight : tvm.Tensor + weight : tvm.te.Tensor 2-D with shape [out_dim, in_dim] or 3-D with shape [out_dim, weight_bits, in_dim] Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 2-D with shape [batch, out_dim] """ data_packed = bitpack(data, data_bits, pack_axis=1, bit_axis=1, pack_type=pack_dtype) @@ -68,26 +69,26 @@ def bitserial_dense(cfg, data, weight, data_bits, weight_bits, pack_dtype='uint3 wvshape = (X//VX, WB, VX, K) oshape = (Y, X) - k = tvm.reduce_axis((0, K), name='k') - db = tvm.reduce_axis((0, DB), name='db') - wb = tvm.reduce_axis((0, WB), name='wb') + k = te.reduce_axis((0, K), name='k') + db = te.reduce_axis((0, DB), name='db') + wb = te.reduce_axis((0, WB), name='wb') # Tile data and weights - weight_vec = tvm.compute(wvshape, lambda xo, wb, vx, k: - weight_packed[xo*VX+vx][wb][k], name='weight_vec') + weight_vec = te.compute(wvshape, lambda xo, wb, vx, k: + weight_packed[xo*VX+vx][wb][k], name='weight_vec') - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod - matmul_unipolar = tvm.compute(oshape, lambda i, j: tvm.sum( - (tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - - tvm.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - ).astype(out_dtype) + matmul_unipolar = te.compute(oshape, lambda i, j: te.sum( + (tvm.tir.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) - + tvm.tir.popcount(~weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k]) + ).astype(out_dtype) << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense_unipolar') - matmul = tvm.compute(oshape, lambda i, j: tvm.sum( - tvm.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k] - ).astype(out_dtype) + matmul = te.compute(oshape, lambda i, j: te.sum( + tvm.tir.popcount(weight_vec[idxdiv(j, VX), wb, idxmod(j, VX), k] & data_packed[i, db, k] + ).astype(out_dtype) << (db+wb).astype(out_dtype), axis=[wb, db, k]), tag='bitserial_dense') # binary ops @@ -112,8 +113,8 @@ def schedule_bitserial_dense(cfg, outs): s: Schedule The computation schedule for bitserial_dense. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _schedule(cfg, s, data_vec, weight_vec, output): s[data_vec].parallel(s[data_vec].op.axis[0]) @@ -149,7 +150,7 @@ def traverse(op): if op not in s.outputs: s[op].compute_inline() for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp): + if isinstance(tensor.op, tvm.te.ComputeOp): traverse(tensor.op) elif op.tag == 'bitserial_dense' or 'bitserial_dense_unipolar': diff --git a/topi/python/topi/x86/conv1d.py b/topi/python/topi/x86/conv1d.py index 70c2a6881dbf..1e30c9f26b7e 100644 --- a/topi/python/topi/x86/conv1d.py +++ b/topi/python/topi/x86/conv1d.py @@ -16,14 +16,13 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name """Conv1D schedule on for Intel CPU""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from .. 
import tag def schedule_conv1d_ncw(outs): """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) output_op = outs[0].op scheduled_ops = [] @@ -40,18 +39,18 @@ def traverse(op): s[op].parallel(fused) s[op].vectorize(w) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) if 'conv1d_ncw' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, te.tensor.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() data = op.input_tensors[0] data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] @@ -77,7 +76,7 @@ def traverse(op): def schedule_conv1d_nwc(outs): """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) output_op = outs[0].op scheduled_ops = [] @@ -94,18 +93,18 @@ def traverse(op): s[op].parallel(fused) s[op].vectorize(c) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) if 'conv1d_nwc' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, te.tensor.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() data = op.input_tensors[0] data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index 2403b01b7453..81d848a4762f 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -21,6 +21,7 @@ import logging import tvm +from tvm import te from tvm import autotvm from .. 
import nn from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload @@ -39,11 +40,11 @@ def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depth """ static_data_shape = [] for dim in get_const_tuple(data.shape): - if isinstance(dim, tvm.expr.Var): + if isinstance(dim, tvm.tir.Var): static_data_shape.append(1) else: static_data_shape.append(dim) - data = tvm.placeholder(static_data_shape, dtype=data.dtype) + data = te.placeholder(static_data_shape, dtype=data.dtype) if is_depthwise: wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype) from .depthwise_conv2d import _fallback_schedule @@ -61,7 +62,7 @@ def _conv2d_infer_layout(workload, cfg): _, data, kernel, strides, padding, dilation, layout, _, dtype = workload batch_size, in_channel, in_height, in_width = data[1] out_channel, _, k_height, k_width = kernel[1] - idxdiv = tvm.indexdiv + idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 @@ -75,20 +76,20 @@ def _conv2d_infer_layout(workload, cfg): def schedule_conv2d_nhwc(outs): """Create schedule for conv2d_nhwc""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) output_op = outs[0].op def _callback(op): if 'conv2d_nhwc' in op.tag: conv = op.output(0) kernel = op.input_tensors[1] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() data = op.input_tensors[0] data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] @@ -132,11 +133,11 @@ def _pack_data(cfg, data, kernel): ic_chunk = ic // ic_bn oc_chunk = oc // oc_bn - data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), - lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], - name="data_vec") + data = te.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - kernel = tvm.compute( + kernel = te.compute( (oc_chunk, ic_chunk, kh, kw, ic_bn, oc_bn), lambda occ, icc, k_h, k_w, icb, ocb: kernel[occ * oc_bn + ocb, icc * ic_bn + icb, k_h, k_w], @@ -176,9 +177,9 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo # If no config was set, we can fallback to default config. if cfg.is_fallback: - _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), - tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), - dtype=kernel.dtype), + _get_default_config(cfg, te.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + te.placeholder((num_filter, in_channel, kernel_height, kernel_width), + dtype=kernel.dtype), strides, padding, out_dtype) # Pack data if raw 4-D data is provided. 
@@ -198,8 +199,8 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo @autotvm.register_topi_schedule("conv2d_NCHWc.x86") def schedule_conv2d_NCHWc(cfg, outs): """Create schedule for tensors""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): if 'conv2d_NCHWc' in op.tag: diff --git a/topi/python/topi/x86/conv2d_alter_op.py b/topi/python/topi/x86/conv2d_alter_op.py index 377d81539b7c..5ee691b07362 100644 --- a/topi/python/topi/x86/conv2d_alter_op.py +++ b/topi/python/topi/x86/conv2d_alter_op.py @@ -20,6 +20,7 @@ import logging import tvm +from tvm import te from tvm import relay from tvm import autotvm from .conv2d import _get_default_config @@ -79,10 +80,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'] = 'NCHW%dc' % oc_bn # Store altered operator's config - new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data_dtype) - new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, - kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel//oc_bn, in_channel//ic_bn, + kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"], new_attrs["out_layout"], out_dtype], topi_tmpl) @@ -118,15 +119,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'] = 'NCHW%dc' % oc_bn # Store altered operator's config. - new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data_dtype) - new_kernel = tvm.placeholder((out_channel // oc_bn, - in_channel // ic_bn, - kh, - kw, - ic_bn // n_elems, - oc_bn, - n_elems), dtype=kernel_dtype) + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel // oc_bn, + in_channel // ic_bn, + kh, + kw, + ic_bn // n_elems, + oc_bn, + n_elems), dtype=kernel_dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], new_attrs['out_layout'], out_dtype], topi_tmpl) @@ -152,9 +153,9 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): new_attrs['out_layout'] = 'NCHW%dc' % oc_bn # Store altered operator's config. 
- new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), - dtype=data_dtype) - new_kernel = tvm.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) + new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn), + dtype=data_dtype) + new_kernel = te.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype) new_workload = autotvm.task.args_to_workload( [new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'], new_attrs['out_layout'], out_dtype], topi_tmpl) diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py index 083fff48d774..432f8b287513 100644 --- a/topi/python/topi/x86/conv2d_avx_1x1.py +++ b/topi/python/topi/x86/conv2d_avx_1x1.py @@ -18,6 +18,7 @@ """1x1 Conv2D schedule on for Intel CPU""" from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity @@ -65,7 +66,7 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) # schedule pad - if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + if isinstance(s[data_vec].op, tvm.te.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) @@ -78,7 +79,7 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): # this part will be folded during Relay fold_constant pass. s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") - elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + elif isinstance(kernel_vec.op, tvm.te.ComputeOp) and \ kernel_vec.name == 'kernel_vec': # data and kernel are not pre-computed, schedule layout transform here. 
# this should only be used by x86 conv2d_nchw, which is for @@ -190,23 +191,23 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o # packing the Filter to let memory access be consecutive for AVX512 intrinsic # Done in pre-compute stage - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod packw_shape = (kernel_h, kernel_w, idxd(num_filter, 16), 16 * idxd(channel, 4), 4) - PackW = tvm.compute(packw_shape, - lambda a, b, c, d, e: - Filter[a, b, - c*16 + idxm(d, 16), - idxd(d, 16) * 4 + e], - name="packed_filter") - - rc = tvm.reduce_axis((0, in_channel), name='rc') - ry = tvm.reduce_axis((0, kernel_h), name='ry') - rx = tvm.reduce_axis((0, kernel_w), name='rx') - Output = tvm.compute( + PackW = te.compute(packw_shape, + lambda a, b, c, d, e: + Filter[a, b, + c*16 + idxm(d, 16), + idxd(d, 16) * 4 + e], + name="packed_filter") + + rc = te.reduce_axis((0, in_channel), name='rc') + ry = te.reduce_axis((0, kernel_h), name='ry') + rx = te.reduce_axis((0, kernel_w), name='rx') + Output = te.compute( (batch, out_height, out_width, out_channel), - lambda nn, yy, xx, ff: tvm.sum( + lambda nn, yy, xx, ff: te.sum( PaddedInput[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * PackW[ry, rx, idxd(ff, 16), @@ -238,7 +239,7 @@ def _schedule_conv_nhwc_pack_int8(s, cfg, data, conv_out, last): ic_factor, oc_factor = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] # schedule data A = data - if isinstance(s[A].op, tvm.tensor.ComputeOp): + if isinstance(s[A].op, tvm.te.ComputeOp): batch, ih, iw, ic = s[A].op.axis d_ic_chunk, d_ic_block = s[A].split(ic, factor=4) s[A].vectorize(d_ic_block) diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py index 085d0aeb67c3..ebed14cb924a 100644 --- a/topi/python/topi/x86/conv2d_avx_common.py +++ b/topi/python/topi/x86/conv2d_avx_common.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name """Conv2D schedule on for Intel CPU""" -from __future__ import absolute_import as _abs import tvm from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity @@ -89,7 +88,7 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): _, _, _, _, ic_bn = get_const_tuple(data_vec.shape) # schedule pad - if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + if isinstance(s[data_vec].op, tvm.te.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) @@ -102,7 +101,7 @@ def _schedule_conv_NCHWc(s, cfg, data_vec, kernel_vec, conv_out, last): # this part will be folded during Relay fold_constant pass. s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region") s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region") - elif isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and \ + elif isinstance(kernel_vec.op, tvm.te.ComputeOp) and \ kernel_vec.name == 'kernel_vec': # data and kernel are not pre-computed, schedule layout transform here. 
# this should only be used by x86 conv2d_nchw, which is for diff --git a/topi/python/topi/x86/conv2d_int8.py b/topi/python/topi/x86/conv2d_int8.py index 64fe92bbaaa4..4b111435f704 100644 --- a/topi/python/topi/x86/conv2d_int8.py +++ b/topi/python/topi/x86/conv2d_int8.py @@ -19,6 +19,7 @@ """Conv2D int8 schedule on x86""" import tvm +from tvm import te from tvm import autotvm from ..nn.conv2d import _get_workload as _get_conv2d_workload from .. import tag @@ -96,11 +97,11 @@ def _pack_data(cfg, data, kernel): ic_chunk = ic // ic_bn oc_chunk = oc // oc_bn - data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), - lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], - name="data_vec") + data = te.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - kernel = tvm.compute( + kernel = te.compute( (oc_chunk, ic_chunk, kh, kw, ic_bn//n_elems, oc_bn, n_elems), lambda occ, icc, k_h, k_w, icbc, ocb, icbb: kernel[occ * oc_bn + ocb, @@ -145,9 +146,9 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, # If no config was set, we can fallback to default config. if cfg.is_fallback: _get_default_config_int8( - cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype), - tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width), - dtype=kernel.dtype), + cfg, te.placeholder((n, in_channel, ih, iw), dtype=data.dtype), + te.placeholder((num_filter, in_channel, kernel_height, kernel_width), + dtype=kernel.dtype), strides, padding, out_dtype) # Pack data if raw 4-D data is provided. @@ -168,7 +169,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, @autotvm.register_topi_schedule("conv2d_NCHWc_int8.x86") def schedule_conv2d_NCHWc_int8(cfg, outs): """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): """Traverse operators from computation graph""" @@ -192,7 +193,7 @@ def _callback(op): @autotvm.register_topi_schedule("conv2d_nhwc_pack_int8.x86") def schedule_conv2d_nhwc_pack_int8(cfg, outs): """Create schedule for tensors""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) output_op = outs[0].op scheduled_ops = [] @@ -209,7 +210,7 @@ def traverse(op): s[op].parallel(fused) s[op].vectorize(c) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) if 'conv2d_nhwc_pack_int8' in op.tag: @@ -217,9 +218,9 @@ def traverse(op): kernel = conv_out.op.input_tensors[1] data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] \ - if isinstance(data_vec.op, tvm.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ + if isinstance(data_vec.op, te.tensor.ComputeOp) and "pad" not in data_vec.op.tag \ else data_vec - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, te.tensor.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] diff --git a/topi/python/topi/x86/conv2d_transpose.py b/topi/python/topi/x86/conv2d_transpose.py index 71f47d6c037b..f90edb5e2911 100644 --- a/topi/python/topi/x86/conv2d_transpose.py +++ b/topi/python/topi/x86/conv2d_transpose.py @@ -16,7 +16,7 @@ # under the License. 
# pylint: disable=invalid-name,unused-variable,unused-argument,no-member """Conv2D Transpose schedule on x86""" -import tvm +from tvm import te from ..util import traverse_inline from .. import nn from .conv2d import conv2d_nchw, schedule_conv2d_nchw @@ -30,7 +30,7 @@ def conv2d_transpose_nchw(data, kernel, strides, padding, out_dtype): def schedule_conv2d_transpose_nchw(outs): """Create schedule for tensors""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs s = schedule_conv2d_nchw(outs) def _callback(op): if 'unpack_nchwc' in op.tag: diff --git a/topi/python/topi/x86/conv3d.py b/topi/python/topi/x86/conv3d.py index 1e156509c0a8..989ec4cf4ffc 100644 --- a/topi/python/topi/x86/conv3d.py +++ b/topi/python/topi/x86/conv3d.py @@ -19,6 +19,7 @@ """Conv3D operators""" from collections import namedtuple import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity from ..util import traverse_inline @@ -39,12 +40,12 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): Parameters ---------- - input : tvm.Tensor + input : tvm.te.Tensor 5-D input data with shapes: [batch, in_channel, in_depth, in_height, in_width] for NCDHW layout [batch, in_depth, in_height, in_width, in_channel] for NDHWC layout - filter : tvm.Tensor + filter : tvm.te.Tensor 5-D filter with shape [kernel_depth, kernel_height, kernel_width, in_channels, out_channels] strides : int or a list/tuple of three ints @@ -58,7 +59,7 @@ def conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 5-D with shape [batch, out_depth, out_height, out_width, out_channel] for NDHWC layout 5-D with shape [batch, out_channel, out_depth, out_height, out_width] for NCDHW layout """ @@ -86,7 +87,7 @@ def schedule_conv3d_ndhwc(cfg, outs): s: Schedule The computation schedule for conv3d. 
""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _traverse(op): if 'conv3d_ndhwc' in op.tag: @@ -94,12 +95,12 @@ def _traverse(op): conv_out = op.input_tensors[0] kernel_vec = conv_out.op.input_tensors[1] kernel = kernel_vec.op.input_tensors[0] - if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + if isinstance(kernel.op, tvm.te.ComputeOp) and "dilate" in kernel.op.tag: s[kernel].compute_inline() data_vec = conv_out.op.input_tensors[0] data = data_vec.op.input_tensors[0] data_pad = None - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: data_pad = data data = data_pad.op.input_tensors[0] @@ -154,47 +155,47 @@ def _conv3d_ndhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): # fetch schedule ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] shape = (batch_size, in_channel // ic_bn, pad_depth, pad_height, ic_bn, pad_width) - data_vec = tvm.compute(shape, - lambda n, C, d, h, c, w: data_pad[n, d, h, w, C * ic_bn + c], - name='data_vec') + data_vec = te.compute(shape, + lambda n, C, d, h, c, w: data_pad[n, d, h, w, C * ic_bn + c], + name='data_vec') # pack kernel shape = (num_filter//oc_bn, in_channel//ic_bn, kernel_depth, kernel_height, kernel_width, ic_bn, oc_bn) - kernel_vec = tvm.compute(shape, - lambda CO, CI, d, h, w, ci, co: - kernel[d, h, w, CI * ic_bn + ci, CO * oc_bn + co], - name='kernel_vec') + kernel_vec = te.compute(shape, + lambda CO, CI, d, h, w, ci, co: + kernel[d, h, w, CI * ic_bn + ci, CO * oc_bn + co], + name='kernel_vec') # convolution oshape = (batch_size, num_filter//oc_bn, out_depth, out_height, out_width, oc_bn) unpack_shape = (batch_size, out_depth, out_height, out_width, num_filter) - ic = tvm.reduce_axis((0, in_channel), name='ic') - kh = tvm.reduce_axis((0, kernel_height), name='kh') - kw = tvm.reduce_axis((0, kernel_width), name='kw') - kd = tvm.reduce_axis((0, kernel_depth), name='kd') - idxmod = tvm.indexmod - idxdiv = tvm.indexdiv - - conv = tvm.compute(oshape, lambda n, oc_chunk, od, oh, ow, oc_block: - tvm.sum(data_vec[n, - idxdiv(ic, ic_bn), - od*DSTR+kd*dilation_d, - oh*HSTR+kh*dilation_h, + ic = te.reduce_axis((0, in_channel), name='ic') + kh = te.reduce_axis((0, kernel_height), name='kh') + kw = te.reduce_axis((0, kernel_width), name='kw') + kd = te.reduce_axis((0, kernel_depth), name='kd') + idxmod = tvm.tir.indexmod + idxdiv = tvm.tir.indexdiv + + conv = te.compute(oshape, lambda n, oc_chunk, od, oh, ow, oc_block: + te.sum(data_vec[n, + idxdiv(ic, ic_bn), + od*DSTR+kd*dilation_d, + oh*HSTR+kh*dilation_h, + idxmod(ic, ic_bn), + ow*WSTR+kw*dilation_w].astype(out_dtype) * + kernel_vec[oc_chunk, idxdiv(ic, ic_bn), kd, kh, kw, idxmod(ic, ic_bn), - ow*WSTR+kw*dilation_w].astype(out_dtype) * - kernel_vec[oc_chunk, idxdiv(ic, ic_bn), kd, kh, kw, - idxmod(ic, ic_bn), - oc_block].astype(out_dtype), - axis=[kd, kh, kw, ic]), name='conv') - conv_unpacked = tvm.compute(unpack_shape, - lambda n, d, h, w, c: conv[n, idxdiv(c, oc_bn), - d, h, w, - idxmod(c, oc_bn)] - .astype(out_dtype), - name='output_unpack', - tag='conv3d_ndhwc') + oc_block].astype(out_dtype), + axis=[kd, kh, kw, ic]), name='conv') + conv_unpacked = te.compute(unpack_shape, + lambda n, d, h, w, c: conv[n, idxdiv(c, oc_bn), + d, h, w, + idxmod(c, oc_bn)] + .astype(out_dtype), + name='output_unpack', + tag='conv3d_ndhwc') return conv_unpacked @@ -231,11 +232,11 @@ def _get_default_config(cfg, data, kernel, 
strides, padding, out_dtype, layout): static_data_shape = [] for dim in get_const_tuple(data.shape): - if isinstance(dim, tvm.expr.Var): + if isinstance(dim, tvm.tir.Var): static_data_shape.append(1) else: static_data_shape.append(dim) - data = tvm.placeholder(static_data_shape, dtype=data.dtype) + data = te.placeholder(static_data_shape, dtype=data.dtype) wkl = _get_conv3d_workload(data, kernel, strides, padding, out_dtype, layout) _fallback_schedule(cfg, wkl) diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py index ea89cf4779b0..3e99d0612f96 100644 --- a/topi/python/topi/x86/dense.py +++ b/topi/python/topi/x86/dense.py @@ -18,6 +18,7 @@ """x86 dense operators""" from __future__ import absolute_import as _abs import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity from tvm.contrib import cblas @@ -79,11 +80,11 @@ def _schedule_dense_nopack_template(cfg, s, C): def _default_dense_pack_config(cfg, M, N, K): # Generate default schedule for dynamic shape. - if isinstance(M, tvm.expr.Var): + if isinstance(M, tvm.tir.Var): M = 16 - if isinstance(N, tvm.expr.Var): + if isinstance(N, tvm.tir.Var): N = 16 - if isinstance(K, tvm.expr.Var): + if isinstance(K, tvm.tir.Var): K = 16 vec_width = get_fp32_len() @@ -116,11 +117,11 @@ def _default_dense_pack_config(cfg, M, N, K): def _default_dense_nopack_config(cfg, M, N, K): # Generate default schedule for dynamic shape. - if isinstance(M, tvm.expr.Var): + if isinstance(M, tvm.tir.Var): M = 16 - if isinstance(N, tvm.expr.Var): + if isinstance(N, tvm.tir.Var): N = 16 - if isinstance(K, tvm.expr.Var): + if isinstance(K, tvm.tir.Var): K = 16 vec_width = get_fp32_len() @@ -141,33 +142,33 @@ def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): M, K = get_const_tuple(data.shape) N, _ = get_const_tuple(weight.shape) # create tuning space - cfg.define_split("tile_y", 32 if isinstance(M, tvm.expr.Var) else M, num_outputs=2) - cfg.define_split("tile_x", 32 if isinstance(N, tvm.expr.Var) else N, num_outputs=2) - cfg.define_split("tile_k", 32 if isinstance(K, tvm.expr.Var) else K, num_outputs=2) + cfg.define_split("tile_y", 32 if isinstance(M, tvm.tir.Var) else M, num_outputs=2) + cfg.define_split("tile_x", 32 if isinstance(N, tvm.tir.Var) else N, num_outputs=2) + cfg.define_split("tile_k", 32 if isinstance(K, tvm.tir.Var) else K, num_outputs=2) if cfg.is_fallback: _default_dense_nopack_config(cfg, M, N, K) vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, K // vec), "k") - CC = tvm.compute((M, N, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x].astype(out_dtype) * - weight[y, k * vec + x].astype(out_dtype), axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((M, N), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") + k = te.reduce_axis((0, K // vec), "k") + CC = te.compute((M, N, vec), + lambda z, y, x: te.sum( + data[z, k * vec + x].astype(out_dtype) * + weight[y, k * vec + x].astype(out_dtype), axis=k)) + + kk = te.reduce_axis((0, vec), "kk") + C = te.compute((M, N), + lambda y, x: te.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") if bias is not None: - C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) + C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) return C @autotvm.register_topi_schedule("dense_nopack.x86") def schedule_dense_nopack(cfg, outs): """Create the schedule for dense_nopack""" - s = tvm.create_schedule([x.op for x in outs]) + s = 
te.create_schedule([x.op for x in outs]) def _callback(op): if 'dense_nopack' in op.tag: @@ -191,27 +192,27 @@ def dense_pack(cfg, data, weight, bias=None, out_dtype=None): packw_bn = cfg["tile_x"].size[-1] packw_shape = (N // packw_bn, K, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod - k = tvm.reduce_axis((0, K), name="k") - C = tvm.compute((M, N), - lambda y, x: tvm.sum( - data[y, k].astype(out_dtype) * - packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), - axis=k), - tag="dense_pack") + packw = te.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + k = te.reduce_axis((0, K), name="k") + C = te.compute((M, N), + lambda y, x: te.sum( + data[y, k].astype(out_dtype) * + packw[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), + axis=k), + tag="dense_pack") if bias is not None: - C = tvm.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) + C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) return C @autotvm.register_topi_schedule("dense_pack.x86") def schedule_dense_pack(cfg, outs): """Create the schedule for dense_pack""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): if "dense_pack" in op.tag: @@ -227,8 +228,8 @@ def dense_cblas(cfg, data, weight, bias=None, out_dtype=None): cfg.add_flop(M * K * N * 2) C = cblas.matmul(data, weight, False, True) if bias is not None: - C = tvm.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), - tag=tag.BROADCAST) + C = te.compute(C.shape, lambda i, j: C[i, j] + bias[j].astype(out_dtype), + tag=tag.BROADCAST) return C @autotvm.register_topi_schedule("dense_cblas.x86") diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py index 70b30fea8c51..fda964eb6639 100644 --- a/topi/python/topi/x86/depthwise_conv2d.py +++ b/topi/python/topi/x86/depthwise_conv2d.py @@ -18,6 +18,7 @@ # pylint: disable=no-value-for-parameter """Depthwise Conv2D schedule on x86""" import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import SplitEntity from ..nn.pad import pad @@ -87,11 +88,11 @@ def _pack_data(cfg, data, kernel): ic_chunk = ic // ic_bn oc_chunk = oc // oc_bn - data = tvm.compute((n, ic_chunk, ih, iw, ic_bn), - lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], - name="data_vec") + data = te.compute((n, ic_chunk, ih, iw, ic_bn), + lambda bs, c, h, w, vc: data[bs, c*ic_bn + vc, h, w], + name="data_vec") - kernel = tvm.compute( + kernel = te.compute( (oc_chunk, 1, kh, kw, 1, oc_bn), lambda occ, icc, k_h, k_w, icb, ocb: kernel[(occ * oc_bn + ocb) // cm, @@ -135,9 +136,9 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, # get workload and related schedule config wkl = _get_workload( - tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), - tvm.placeholder((out_channel, channel_multiplier, filter_height, filter_width), - dtype=kernel.dtype), + te.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype), + te.placeholder((out_channel, channel_multiplier, filter_height, filter_width), + dtype=kernel.dtype), strides, padding, out_dtype) if cfg.is_fallback: _fallback_schedule(cfg, wkl) @@ -160,14 +161,14 @@ def depthwise_conv2d_NCHWc(cfg, 
data, kernel, strides, padding, dilation, data_pad = data # depthconv stage - idxdiv = tvm.indexdiv - idxmod = tvm.indexmod + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod - kh = tvm.reduce_axis((0, filter_height), name='kh') - kw = tvm.reduce_axis((0, filter_width), name='kw') - Output = tvm.compute( + kh = te.reduce_axis((0, filter_height), name='kh') + kw = te.reduce_axis((0, filter_width), name='kw') + Output = te.compute( (batch, out_channel_chunk, out_height, out_width, out_channel_block), - lambda b, oco, oh, ow, oci: tvm.sum( + lambda b, oco, oh, ow, oci: te.sum( (data_pad[ b, idxdiv(idxdiv(oco * out_channel_block + oci, channel_multiplier), in_channel_block), @@ -182,8 +183,8 @@ def depthwise_conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, @autotvm.register_topi_schedule("depthwise_conv2d_NCHWc.x86") def schedule_depthwise_conv2d_NCHWc(cfg, outs): """CPU schedule for depthwise conv2d in NCHW[x]c layout""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) def _callback(op): """Traverse operators from computation graph""" @@ -199,7 +200,7 @@ def _callback(op): def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data_vec, kernel_vec, conv_out, output): tile_ow, oc_bn = cfg["tile_ow"].size[-1], cfg["tile_oc"].size[-1] # schedule pad - if isinstance(s[data_vec].op, tvm.tensor.ComputeOp) \ + if isinstance(s[data_vec].op, tvm.te.ComputeOp) \ and "pad" in data_vec.op.tag: batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih) diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py index 375827bb271c..7c37ac7bc9b5 100644 --- a/topi/python/topi/x86/injective.py +++ b/topi/python/topi/x86/injective.py @@ -16,8 +16,7 @@ # under the License. # pylint: disable=invalid-name """x86 declaration and schedules.""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te from ..util import is_empty_shape def schedule_injective_from_existing(sch, out): @@ -65,10 +64,10 @@ def schedule_injective(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs x = outs[0] - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) if not is_empty_shape(x.shape): schedule_injective_from_existing(s, x) @@ -104,10 +103,10 @@ def vectorize(sch, tensor, vectorize_limit): _, inner_i = sch[tensor].split(inner_axis, split_factor) sch[tensor].vectorize(inner_i) - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs x = outs[0] - s = tvm.create_schedule([x.op for x in outs]) - tvm.schedule.AutoInlineInjective(s) + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) if len(s[x].op.axis) >= 5: fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2]) vectorize(s, x, 64) diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index 3d57b6bbf203..8f884b8b1a2e 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -16,8 +16,7 @@ # under the License. 
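# A minimal sketch (not from the patch; names and the identity compute are
# illustrative assumptions) of the entry-point pattern the hunks above converge
# on after the rename: normalize `outs` to a list, then build the schedule from
# the te module instead of the tvm root namespace.
import tvm
from tvm import te

def schedule_example(outs):
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])
    # real schedules dispatch per-op logic from here, e.g. via traverse_inline
    return s

A = te.placeholder((16,), name="A")
B = te.compute((16,), lambda i: A[i] + 1.0, name="B")
f = tvm.build(schedule_example(B), [A, B], "llvm")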
# pylint: disable=invalid-name,too-many-locals,unused-variable """x86 nn operators""" -from __future__ import absolute_import as _abs -import tvm +from tvm import te def schedule_softmax(outs): """Schedule for softmax @@ -33,9 +32,9 @@ def schedule_softmax(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs softmax = outs[0] - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) op_tag = softmax.op.tag if op_tag == 'softmax_output': diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index a8251dd13ae4..f7664d906799 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -16,7 +16,7 @@ # under the License. # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" -import tvm +from tvm import te from .. import tag def _parallel_sch(sch, oshape, do_vectorize=False): @@ -75,12 +75,12 @@ def schedule_pool(outs, layout): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def _schedule(PaddedInput, Pool): - if isinstance(PaddedInput.op, tvm.tensor.ComputeOp): + if isinstance(PaddedInput.op, te.tensor.ComputeOp): s[PaddedInput].compute_inline() do_vectorize = layout[-1] not in "HWhw" _parallel_sch(s[Pool], outs[0].shape, do_vectorize) @@ -92,7 +92,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule pool elif OP.tag.startswith('pool'): @@ -129,8 +129,8 @@ def schedule_adaptive_pool(outs): sch: Schedule The computation schedule for the op. """ - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - s = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse(OP): @@ -140,7 +140,7 @@ def traverse(OP): if OP not in s.outputs: s[OP].compute_inline() for tensor in OP.input_tensors: - if isinstance(tensor.op, tvm.tensor.ComputeOp) and tensor.op not in scheduled_ops: + if isinstance(tensor.op, te.tensor.ComputeOp) and tensor.op not in scheduled_ops: traverse(tensor.op) # schedule pool elif OP.tag.startswith('adaptive_pool'): diff --git a/topi/python/topi/x86/reduction.py b/topi/python/topi/x86/reduction.py index b9dd4d4f1b3c..0dfc3f23c2f2 100644 --- a/topi/python/topi/x86/reduction.py +++ b/topi/python/topi/x86/reduction.py @@ -16,8 +16,8 @@ # under the License. # pylint: disable=invalid-name """x86 declaration and schedules.""" -from __future__ import absolute_import as _abs import tvm +from tvm import te from .injective import schedule_injective_from_existing from .. import tag from ..util import get_const_tuple @@ -72,13 +72,13 @@ def schedule_reduce(outs): sch: Schedule The computation schedule for the op. 
""" - outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - sch = tvm.create_schedule([x.op for x in outs]) + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + sch = te.create_schedule([x.op for x in outs]) scheduled_ops = [] def traverse_before_reduce(operator): """Internal traverse function""" - if isinstance(operator, tvm.tensor.PlaceholderOp): + if isinstance(operator, tvm.te.PlaceholderOp): return if tag.is_injective(operator.tag): sch[operator].compute_inline() @@ -108,7 +108,7 @@ def traverse_after_reduce(operator): for tensor in input_tensors: if tensor.op not in scheduled_ops: traverse_before_reduce(tensor.op) - elif isinstance(operator, tvm.tensor.PlaceholderOp): + elif isinstance(operator, tvm.te.PlaceholderOp): pass else: raise RuntimeError("Unsupported operator: %s (tag: %s)" % (operator, operator.tag)) diff --git a/topi/python/topi/x86/roi_align.py b/topi/python/topi/x86/roi_align.py index 203c3dd1802b..205d70947ab2 100644 --- a/topi/python/topi/x86/roi_align.py +++ b/topi/python/topi/x86/roi_align.py @@ -30,32 +30,32 @@ def roi_align_nchw_ir(data, rois, w_pc, pos_pc, pooled_size, spatial_scale, samp Parameters ---------- - data : tvm.Tensor or numpy NDArray + data : tvm.te.Tensor or numpy NDArray 4-D with shape [batch, channel, height, width] - rois : tvm.Tensor or numpy NDArray + rois : tvm.te.Tensor or numpy NDArray 2-D with shape [num_roi, 5]. The last dimension should be in format of [batch_index, w_start, h_start, w_end, h_end] - w_pc : tvm.Tensor or numpy NDArray + w_pc : tvm.te.Tensor or numpy NDArray 3-D weight pre-calculation buffer - pos_pc : tvm.Tensor or numpy NDArray + pos_pc : tvm.te.Tensor or numpy NDArray 3-D position pre-calculation buffer pooled_size : tvm ConsExpr [out_height, out_width] - spatial_scale : tvm.const + spatial_scale : tvm.tir.const Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal of total stride in convolutional layers, which should be in range (0.0, 1.0] - sample_ratio : tvm.const + sample_ratio : tvm.tir.const Sampling ratio of ROI align, using adaptive size by default. 
Returns ------- - output : tvm.Tensor or numpy NDArray + output : tvm.te.Tensor or numpy NDArray 4-D with shape [num_roi, channel, pooled_size, pooled_size] """ channels = data.shape[1] @@ -161,21 +161,21 @@ def roi_align_nchw_ir(data, rois, w_pc, pos_pc, pooled_size, spatial_scale, samp for iy in range(roi_bin_grid_h): for ix in range(roi_bin_grid_w): output_val += w_pc[n, pre_calc_index, 0] \ - * data[roi_batch_index, c, - pos_pc[n, pre_calc_index, 2], - pos_pc[n, pre_calc_index, 0]] \ - + w_pc[n, pre_calc_index, 1] \ - * data[roi_batch_index, c, - pos_pc[n, pre_calc_index, 2], - pos_pc[n, pre_calc_index, 1]] \ - + w_pc[n, pre_calc_index, 2] \ - * data[roi_batch_index, c, - pos_pc[n, pre_calc_index, 3], - pos_pc[n, pre_calc_index, 0]] \ - + w_pc[n, pre_calc_index, 3] \ - * data[roi_batch_index, c, - pos_pc[n, pre_calc_index, 3], - pos_pc[n, pre_calc_index, 1]] + * data[roi_batch_index, c, + pos_pc[n, pre_calc_index, 2], + pos_pc[n, pre_calc_index, 0]] \ + + w_pc[n, pre_calc_index, 1] \ + * data[roi_batch_index, c, + pos_pc[n, pre_calc_index, 2], + pos_pc[n, pre_calc_index, 1]] \ + + w_pc[n, pre_calc_index, 2] \ + * data[roi_batch_index, c, + pos_pc[n, pre_calc_index, 3], + pos_pc[n, pre_calc_index, 0]] \ + + w_pc[n, pre_calc_index, 3] \ + * data[roi_batch_index, c, + pos_pc[n, pre_calc_index, 3], + pos_pc[n, pre_calc_index, 1]] pre_calc_index += 1 output_val /= count @@ -189,10 +189,10 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): Parameters ---------- - data : tvm.Tensor + data : tvm.te.Tensor 4-D with shape [batch, channel, height, width] - rois : tvm.Tensor + rois : tvm.te.Tensor 2-D with shape [num_roi, 5]. The last dimension should be in format of [batch_index, w_start, h_start, w_end, h_end] @@ -208,7 +208,7 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): Returns ------- - output : tvm.Tensor + output : tvm.te.Tensor 4-D with shape [num_roi, channel, pooled_size, pooled_size] """ if not isinstance(pooled_size, (tuple, list)): @@ -226,8 +226,8 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): w_pc_buffer = full(max_pc_shape, data.dtype, 0) pos_pc_buffer = full(max_pc_shape, "int32", 0) - pooled_size = tvm.convert(pooled_size) - spatial_scale = tvm.const(spatial_scale, "float32") - sample_ratio = tvm.const(sample_ratio, "int32") + pooled_size = tvm.runtime.convert(pooled_size) + spatial_scale = tvm.tir.const(spatial_scale, "float32") + sample_ratio = tvm.tir.const(sample_ratio, "int32") return roi_align_nchw_ir(data, rois, w_pc_buffer, pos_pc_buffer, pooled_size, spatial_scale, sample_ratio) diff --git a/topi/python/topi/x86/sparse.py b/topi/python/topi/x86/sparse.py index 898d0e5ea2c6..54a5af9ca9f0 100644 --- a/topi/python/topi/x86/sparse.py +++ b/topi/python/topi/x86/sparse.py @@ -16,7 +16,7 @@ # under the License. 
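# A NumPy-only sketch (not from the patch) of the bilinear sampling that the
# four w_pc weights accumulated above implement: each pooled sample blends the
# four nearest feature-map pixels, whose integer coordinates pos_pc pre-computes
# in the same (low, low), (low, high), (high, low), (high, high) order.
import numpy as np

def bilinear_sample(feat, y, x):
    # feat: 2-D feature-map slice; (y, x): fractional sample position
    y0, x0 = int(np.floor(y)), int(np.floor(x))
    y1, x1 = min(y0 + 1, feat.shape[0] - 1), min(x0 + 1, feat.shape[1] - 1)
    ly, lx = y - y0, x - x0
    weights = ((1 - ly) * (1 - lx), (1 - ly) * lx, ly * (1 - lx), ly * lx)
    corners = (feat[y0, x0], feat[y0, x1], feat[y1, x0], feat[y1, x1])
    return sum(w * c for w, c in zip(weights, corners))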
"""sparse_dense schedule on x86""" -import tvm +from tvm import te from ..util import traverse_inline, get_const_int from .util import get_fp32_len @@ -24,7 +24,7 @@ def schedule_sparse_dense(outs): """Create schedule for sparse dense""" - s = tvm.create_schedule([x.op for x in outs]) + s = te.create_schedule([x.op for x in outs]) def _callback(op): simd_width = get_fp32_len() diff --git a/topi/python/topi/x86/tensor_intrin.py b/topi/python/topi/x86/tensor_intrin.py index dc9e1456d2cd..955b6b4ad280 100644 --- a/topi/python/topi/x86/tensor_intrin.py +++ b/topi/python/topi/x86/tensor_intrin.py @@ -17,6 +17,7 @@ """Core kernel of dot product of 4 Int8 operations""" #pylint: disable=invalid-name import tvm +from tvm import te import tvm.target.codegen @@ -25,7 +26,7 @@ def dot_16x1x16_uint8_int8_int32(): mcpu = tvm.target.Target.current().mcpu assert mcpu in ("skylake-avx512", "cascadelake"), \ - "An old Intel machine that does not have fast Int8 support." + "An old Intel machine that does not have fast Int8 support." if mcpu == "skylake-avx512": return dot_16x1x16_uint8_int8_int32_skylake() # cascadelake @@ -63,43 +64,43 @@ def dot_16x1x16_uint8_int8_int32_skylake(): int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 - data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data') - kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, num_int8_elements), name='k') - C = tvm.compute((int32_lanes,), - lambda i: tvm.sum(data[k].astype('int32') * - kernel[i, k].astype('int32'), - axis=k), - name="C") - - a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", - offset_factor=1, - strides=[1]) - b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", - offset_factor=1, - strides=[tvm.var('ldw'), 1]) + data = te.placeholder((num_int8_elements,), dtype='uint8', name='data') + kernel = te.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') + k = te.reduce_axis((0, num_int8_elements), name='k') + C = te.compute((int32_lanes,), + lambda i: te.sum(data[k].astype('int32') * + kernel[i, k].astype('int32'), + axis=k), + name="C") + + a_buffer = tvm.tir.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1, + strides=[te.var('ldw'), 1]) def _intrin_func(ins, outs): def _instr(index): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + ib.emit(outs[0].vstore(0, tvm.tir.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) + re_int32 = tvm.tir.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') - vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) + vec_a = tvm.tir.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], "int8x64") - vec_one = tvm.const(1, "int16x32") - pair_reduction = tvm.call_llvm_intrin('int16x32', - 'llvm.x86.avx512.pmaddubs.w.512', - tvm.const(0, 'uint32'), - vec_a, vec_b) - quad_reduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.pmaddw.d.512', - tvm.const(0, 'uint32'), - pair_reduction, vec_one) + vec_one = tvm.tir.const(1, "int16x32") + pair_reduction = tvm.tir.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + 
tvm.tir.const(0, 'uint32'), + vec_a, vec_b) + quad_reduction = tvm.tir.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.tir.const(0, 'uint32'), + pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: @@ -109,8 +110,8 @@ def _instr(index): # body, reset, update return _instr(0), _instr(1), _instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) + with tvm.target.build_config(offset_factor=1, partition_const_loop=True): + return te.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) def dot_16x1x16_uint8_int8_int16(): @@ -146,41 +147,41 @@ def dot_16x1x16_uint8_int8_int16(): int16_lanes = 4*32 # 4*32 int32 lanes in 4 AVX512 vector registers num_int8_elements = 2 # 2 int8 elements in int16 - data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data') - kernel = tvm.placeholder((int16_lanes, num_int8_elements), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, num_int8_elements), name='k') - C = tvm.compute((int16_lanes, ), - lambda i: tvm.sum(data[k].astype('int16') * - kernel[i, k].astype('int16'), - axis=k), - name="C") - - a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", - offset_factor=1, - strides=[1]) - b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", - offset_factor=1) - # strides=[tvm.var('ldw'), 1, 1]) + data = te.placeholder((num_int8_elements,), dtype='uint8', name='data') + kernel = te.placeholder((int16_lanes, num_int8_elements), dtype='int8', name='kernel') + k = te.reduce_axis((0, num_int8_elements), name='k') + C = te.compute((int16_lanes, ), + lambda i: te.sum(data[k].astype('int16') * + kernel[i, k].astype('int16'), + axis=k), + name="C") + + a_buffer = tvm.tir.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1) + # strides=[te.var('ldw'), 1, 1]) def _intrin_func(ins, outs): def _instr(index): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() if index == 1: for i in range(4): - ib.emit(outs[0].vstore([i*32], tvm.const(0, 'int16x32'))) + ib.emit(outs[0].vstore([i*32], tvm.tir.const(0, 'int16x32'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x2") - re_int16 = tvm.call_pure_intrin('int16', 'reinterpret', a_int8) + re_int16 = tvm.tir.call_pure_intrin('int16', 'reinterpret', a_int8) vec_ai16 = re_int16.astype('int16x32') - vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai16) + vec_a = tvm.tir.call_pure_intrin('int8x64', 'reinterpret', vec_ai16) for i in range(4): vec_b = ins[1].vload([i*32, 0], "int8x64") - pair_reduction = tvm.call_llvm_intrin('int16x32', - 'llvm.x86.avx512.pmaddubs.w.512', - tvm.const(0, 'uint32'), - vec_a, vec_b) + pair_reduction = tvm.tir.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + tvm.tir.const(0, 'uint32'), + vec_a, vec_b) if index == 0: ib.emit(outs[0].vstore([i*32], pair_reduction)) else: @@ -191,8 +192,8 @@ def _instr(index): # body, reset, update return _instr(0), _instr(1), _instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) + with tvm.target.build_config(offset_factor=1, partition_const_loop=True): + return te.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) def 
dot_16x1x16_uint8_int8_int32_cascadelake(): @@ -226,31 +227,31 @@ def dot_16x1x16_uint8_int8_int32_cascadelake(): int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 - data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data') - kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') - k = tvm.reduce_axis((0, num_int8_elements), name='k') - C = tvm.compute((int32_lanes,), - lambda i: tvm.sum(data[k].astype('int32') * - kernel[i, k].astype('int32'), - axis=k), - name="C") - - a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer", - offset_factor=1, - strides=[1]) - b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", - offset_factor=1, - strides=[tvm.var('ldw'), 1]) + data = te.placeholder((num_int8_elements,), dtype='uint8', name='data') + kernel = te.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel') + k = te.reduce_axis((0, num_int8_elements), name='k') + C = te.compute((int32_lanes,), + lambda i: te.sum(data[k].astype('int32') * + kernel[i, k].astype('int32'), + axis=k), + name="C") + + a_buffer = tvm.tir.decl_buffer(data.shape, dtype='uint8', name="a_buffer", + offset_factor=1, + strides=[1]) + b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype='int8', name="b_buffer", + offset_factor=1, + strides=[te.var('ldw'), 1]) def _intrin_func(ins, outs): def _instr(index): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() if index == 1: - ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16'))) + ib.emit(outs[0].vstore(0, tvm.tir.const(0, 'int32x16'))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") - re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8) + re_int32 = tvm.tir.call_pure_intrin('int32', 'reinterpret', a_int8) vec_ai32 = re_int32.astype('int32x16') vec_b = ins[1].vload([0, 0], "int8x64") @@ -258,24 +259,24 @@ def _instr(index): llvm_id = tvm.target.codegen.llvm_lookup_intrinsic_id(vnni_inst_name) if llvm_id != 0: # VNNI is available for current LLVM version - vec_bi32 = tvm.call_pure_intrin('int32x16', 'reinterpret', vec_b) - vec_zero = tvm.const(0, "int32x16") - quad_reduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.vpdpbusd.512', - tvm.const(0, 'uint32'), - vec_zero, - vec_ai32, vec_bi32) + vec_bi32 = tvm.tir.call_pure_intrin('int32x16', 'reinterpret', vec_b) + vec_zero = tvm.tir.const(0, "int32x16") + quad_reduction = tvm.tir.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.vpdpbusd.512', + tvm.tir.const(0, 'uint32'), + vec_zero, + vec_ai32, vec_bi32) else: # Fall back to the normal AVX512 - vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) - vec_one = tvm.const(1, "int16x32") - pair_reduction = tvm.call_llvm_intrin('int16x32', - 'llvm.x86.avx512.pmaddubs.w.512', - tvm.const(0, 'uint32'), - vec_a, vec_b) - quad_reduction = tvm.call_llvm_intrin('int32x16', - 'llvm.x86.avx512.pmaddw.d.512', - tvm.const(0, 'uint32'), - pair_reduction, vec_one) + vec_a = tvm.tir.call_pure_intrin('int8x64', 'reinterpret', vec_ai32) + vec_one = tvm.tir.const(1, "int16x32") + pair_reduction = tvm.tir.call_llvm_intrin('int16x32', + 'llvm.x86.avx512.pmaddubs.w.512', + tvm.tir.const(0, 'uint32'), + vec_a, vec_b) + quad_reduction = tvm.tir.call_llvm_intrin('int32x16', + 'llvm.x86.avx512.pmaddw.d.512', + tvm.tir.const(0, 'uint32'), + pair_reduction, vec_one) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) @@ -286,5 +287,5 @@ def _instr(index): # body, reset, update return _instr(0), _instr(1), 
_instr(2) - with tvm.build_config(offset_factor=1, partition_const_loop=True): - return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) + with tvm.target.build_config(offset_factor=1, partition_const_loop=True): + return te.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer}) diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py index 04931f577b51..f2a35d277733 100644 --- a/topi/python/topi/x86/util.py +++ b/topi/python/topi/x86/util.py @@ -15,9 +15,9 @@ # specific language governing permissions and limitations # under the License. """Common x86 related utilities""" -from __future__ import absolute_import as _abs import tvm + def get_fp32_len(): mcpu = tvm.target.Target.current().mcpu fp32_vec_len = 8 diff --git a/topi/recipe/broadcast/test_broadcast_map.py b/topi/recipe/broadcast/test_broadcast_map.py index 4f8a4dece2c5..2f2bb9e900fe 100644 --- a/topi/recipe/broadcast/test_broadcast_map.py +++ b/topi/recipe/broadcast/test_broadcast_map.py @@ -16,6 +16,7 @@ # under the License. import os import tvm +from tvm import te from tvm.contrib import nvcc import numpy as np @@ -52,7 +53,7 @@ def test_broadcast_to(in_shape, out_shape): TASK = "bcast_to_i" + "_".join([str(ele) for ele in in_shape])\ + "o" + "_".join([str(ele) for ele in out_shape]) # Build the logic and compile the function - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = topi.broadcast_to(A, out_shape) s = topi.cuda.schedule_broadcast(B) fcuda = tvm.build(s, [A, B], "cuda", name="broadcast_to") @@ -72,8 +73,8 @@ def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"): TASK = "bcast_binary_" + typ + "_lhs" +\ "_".join([str(ele) for ele in lhs_shape]) +\ "rhs" + "_".join([str(ele) for ele in rhs_shape]) - A = tvm.placeholder(shape=lhs_shape, name="A") - B = tvm.placeholder(shape=rhs_shape, name="B") + A = te.placeholder(shape=lhs_shape, name="A") + B = te.placeholder(shape=rhs_shape, name="B") if typ == "add": C = topi.broadcast_add(A, B) elif typ == "sub": diff --git a/topi/recipe/conv/depthwise_conv2d_test.py b/topi/recipe/conv/depthwise_conv2d_test.py index 90c61037f9b3..a2b527356662 100644 --- a/topi/recipe/conv/depthwise_conv2d_test.py +++ b/topi/recipe/conv/depthwise_conv2d_test.py @@ -16,6 +16,7 @@ # under the License. 
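# A hedged NumPy model (illustration only, not from the patch) of the semantics
# the skylake and cascadelake dot_16x1x16 intrinsics above tensorize: sixteen
# int32 output lanes, each reducing four uint8 x int8 products. VNNI fuses the
# reduction into one vpdpbusd; plain AVX512 performs it as the
# pmaddubs.w / pmaddw.d pair shown above.
import numpy as np

data = np.random.randint(0, 256, size=(4,), dtype=np.uint8)
kernel = np.random.randint(-128, 128, size=(16, 4), dtype=np.int8)
out = (data.astype(np.int32) * kernel.astype(np.int32)).sum(axis=1)  # shape (16,), int32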
import os import tvm +from tvm import te import numpy as np from scipy import signal from tvm.contrib import nvcc @@ -63,11 +64,11 @@ def test_depthwise_conv2d_nchw(): padding = 'SAME' # or 'VALID' # Placeholder - Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input') - Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') + Input = te.placeholder((batch, in_channel, in_height, in_width), name='Input') + Filter = te.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') Stride = [stride_h, stride_w] - Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') - Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') + Scale = te.placeholder((in_channel * channel_multiplier,), name='Scale') + Shift = te.placeholder((in_channel * channel_multiplier,), name='Shift') # Declare DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, Stride, padding) ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) @@ -128,7 +129,7 @@ def check_device(device): print("success") for device in ['cuda', 'opencl', 'rocm']: - with tvm.build_config(auto_unroll_max_step=128, + with tvm.target.build_config(auto_unroll_max_step=128, unroll_explicit=device == 'rocm', detect_global_barrier=False, restricted_func=True): @@ -152,11 +153,11 @@ def test_depthwise_conv2d_nhwc(): padding = 'SAME' # or 'VALID' # Placeholder - Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input') - Filter = tvm.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter') + Input = te.placeholder((batch, in_height, in_width, in_channel), name='Input') + Filter = te.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter') Stride = [stride_h, stride_w] - Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') - Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') + Scale = te.placeholder((in_channel * channel_multiplier,), name='Scale') + Shift = te.placeholder((in_channel * channel_multiplier,), name='Shift') # Declare DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter, Stride, padding) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) @@ -217,7 +218,7 @@ def check_device(device): print("success") for device in ['cuda', 'opencl', 'rocm']: - with tvm.build_config(auto_unroll_max_step=128, + with tvm.target.build_config(auto_unroll_max_step=128, detect_global_barrier=False, restricted_func=True): check_device(device) diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py index 3f7decabfd0e..69bda79555a9 100644 --- a/topi/recipe/conv/test_conv2d_hwcn_map.py +++ b/topi/recipe/conv/test_conv2d_hwcn_map.py @@ -19,6 +19,7 @@ import numpy as np import scipy.signal import tvm +from tvm import te from tvm.contrib import nvcc import topi from topi.util import get_const_tuple @@ -55,8 +56,8 @@ def test_conv2d_hwcn_map(): stride = 2 padding = 'SAME' - A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A') - W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') + A = te.placeholder((in_height, in_width, in_channel, batch), name='A') + W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W') B = topi.nn.conv2d_hwcn(A, W, stride, padding) C = topi.nn.relu(B) s1 = topi.cuda.schedule_conv2d_hwcn([B]) @@ -76,7 +77,7 @@ def check_device(device): w 
= tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) - with tvm.build_config(auto_unroll_max_step=128, + with tvm.target.build_config(auto_unroll_max_step=128, unroll_explicit=device == 'rocm'): func1 = tvm.build(s1, [A, W, B], device) func1(a, w, b) diff --git a/topi/recipe/conv/test_conv_int8_arm.py b/topi/recipe/conv/test_conv_int8_arm.py index ff0d37d9a66d..336e2f2f405b 100644 --- a/topi/recipe/conv/test_conv_int8_arm.py +++ b/topi/recipe/conv/test_conv_int8_arm.py @@ -20,6 +20,7 @@ import logging import numpy as np import tvm +from tvm import te import topi logging.basicConfig(stream=sys.stdout, level=logging.INFO) @@ -92,8 +93,8 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f hstride, wstride, out_dtype) # Create TVM placeholders - data = tvm.placeholder(data_shape, name='data', dtype=data_dtype) - kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) + data = te.placeholder(data_shape, name='data', dtype=data_dtype) + kernel = te.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) # Create the numpy arrays to be used for executing conv models if data_dtype == 'float32': @@ -119,7 +120,7 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f padding=hpad, dilation=(1, 1), layout='NCHWc', out_layout='NCHWc', out_dtype=out_dtype) out = topi.nn.relu(conv) - sch = tvm.create_schedule(out.op) + sch = te.create_schedule(out.op) func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out') func(data_array, kernel_array, c_orig) LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True)) diff --git a/topi/recipe/conv/test_conv_int8_intel.py b/topi/recipe/conv/test_conv_int8_intel.py index f39f4cd7b830..767262d81d83 100644 --- a/topi/recipe/conv/test_conv_int8_intel.py +++ b/topi/recipe/conv/test_conv_int8_intel.py @@ -20,6 +20,7 @@ import logging import numpy as np import tvm +from tvm import te import topi logging.basicConfig(stream=sys.stdout, level=logging.INFO) @@ -93,8 +94,8 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f hstride, wstride, out_dtype) # Create TVM placeholders - data = tvm.placeholder(data_shape, name='data', dtype=data_dtype) - kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) + data = te.placeholder(data_shape, name='data', dtype=data_dtype) + kernel = te.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype) # Create the numpy arrays to be used for executing conv models if data_dtype == 'float32': @@ -115,7 +116,7 @@ def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_f padding=hpad, dilation=(1, 1), layout='NCHWc', out_layout='NCHWc', out_dtype=out_dtype) out = topi.nn.relu(conv) - sch = tvm.create_schedule(out.op) + sch = te.create_schedule(out.op) func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out') func(data_array, kernel_array, c_orig) LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True)) diff --git a/topi/recipe/gemm/android_gemm_square.py b/topi/recipe/gemm/android_gemm_square.py index 46129cbc1f30..7692f9cf4497 100644 --- a/topi/recipe/gemm/android_gemm_square.py +++ b/topi/recipe/gemm/android_gemm_square.py @@ -16,6 +16,7 @@ # under the License. 
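# A hedged end-to-end sketch mirroring how the updated recipes above drive TVM:
# te declarations feed tvm.build, and tvm.nd arrays carry the data. The shape
# and the "llvm" target are illustrative assumptions, not values from the patch.
import numpy as np
import tvm
from tvm import te

n = 1024
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] * 2.0, name="B")
s = te.create_schedule(B.op)
func = tvm.build(s, [A, B], "llvm")
ctx = tvm.cpu(0)
a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
func(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() * 2.0)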
"""Example code to do square matrix multiplication on Android Phone.""" import tvm +from tvm import te import os from tvm import rpc from tvm.contrib import util, ndk @@ -52,28 +53,28 @@ def test_gemm_gpu(N, times, bn, num_block, num_thread): assert(bn <= N) assert(num_thread * num_thread * 16 <= N) assert(num_block * num_block * 2 <= N) - A = tvm.placeholder((N, N), name='A') - B = tvm.placeholder((N, N), name='Btmp') - k = tvm.reduce_axis((0, N), name='k') + A = te.placeholder((N, N), name='A') + B = te.placeholder((N, N), name='Btmp') + k = te.reduce_axis((0, N), name='k') - packedB = tvm.compute((N, N / bn, bn), + packedB = te.compute((N, N / bn, bn), lambda x, y, z: B[x, y * bn + z], name = 'B') - C = tvm.compute( + C = te.compute( (N, N), - lambda ii, jj: tvm.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), + lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj / bn, jj % bn], axis=k), name='C') - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) CC = s.cache_write(C, "local") - block_x = tvm.thread_axis("blockIdx.x") - block_y = tvm.thread_axis("blockIdx.y") - thread_x = tvm.thread_axis("threadIdx.x") - thread_y = tvm.thread_axis("threadIdx.y") + block_x = te.thread_axis("blockIdx.x") + block_y = te.thread_axis("blockIdx.y") + thread_x = te.thread_axis("threadIdx.x") + thread_y = te.thread_axis("threadIdx.y") - thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") - thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") + thread_xz = te.thread_axis((0, 2), "vthread", name="vx") + thread_yz = te.thread_axis((0, 2), "vthread", name="vy") pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread) pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread) diff --git a/topi/recipe/gemm/cuda_gemm_square.py b/topi/recipe/gemm/cuda_gemm_square.py index 899379e9e488..196bf72e23a3 100644 --- a/topi/recipe/gemm/cuda_gemm_square.py +++ b/topi/recipe/gemm/cuda_gemm_square.py @@ -16,6 +16,7 @@ # under the License. 
"""Example code to do square matrix multiplication.""" import tvm +from tvm import te import os from tvm.contrib import nvcc from tvm.contrib import spirv @@ -46,19 +47,19 @@ def tvm_callback_cuda_postproc(code): def test_gemm(): # graph nn = 2048 - n = tvm.var('n') - n = tvm.convert(nn) + n = te.var('n') + n = tvm.runtime.convert(nn) m, l = n, n - A = tvm.placeholder((l, n), name='A') - B = tvm.placeholder((l, m), name='B') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute( + A = te.placeholder((l, n), name='A') + B = te.placeholder((l, m), name='B') + k = te.reduce_axis((0, l), name='k') + C = te.compute( (m, n), - lambda ii, jj: tvm.sum(A[k, jj] * B[k, ii], axis=k), + lambda ii, jj: te.sum(A[k, jj] * B[k, ii], axis=k), name='C') # schedule - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) @@ -68,12 +69,12 @@ def test_gemm(): scale = 8 num_thread = 8 block_factor = scale * num_thread - block_x = tvm.thread_axis("blockIdx.x") - thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") - block_y = tvm.thread_axis("blockIdx.y") - thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") - thread_xz = tvm.thread_axis((0, 2), "vthread", name="vx") - thread_yz = tvm.thread_axis((0, 2), "vthread", name="vy") + block_x = te.thread_axis("blockIdx.x") + thread_x = te.thread_axis((0, num_thread), "threadIdx.x") + block_y = te.thread_axis("blockIdx.y") + thread_y = te.thread_axis((0, num_thread), "threadIdx.y") + thread_xz = te.thread_axis((0, 2), "vthread", name="vx") + thread_yz = te.thread_axis((0, 2), "vthread", name="vy") by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) @@ -145,7 +146,7 @@ def check_device(device): print("average time cost of %d runs = %g ms, %g GFLOPS." 
% (num_runs, t * 1e3, GFLOPS)) for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]: - with tvm.build_config(auto_unroll_max_step=128, + with tvm.target.build_config(auto_unroll_max_step=128, unroll_explicit=(device != "cuda")): check_device(device) diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py index cf3621479d41..9d668ebf6fa9 100644 --- a/topi/recipe/gemm/gemm_int8.py +++ b/topi/recipe/gemm/gemm_int8.py @@ -19,6 +19,7 @@ import sys import numpy as np import tvm +from tvm import te from tvm import autotvm from topi.cuda.tensor_intrin import dp4a @@ -29,15 +30,15 @@ @autotvm.template def gemm_int8(n, m, l): - A = tvm.placeholder((n, l), name='A', dtype='int8') - B = tvm.placeholder((m, l), name='B', dtype='int8') + A = te.placeholder((n, l), name='A', dtype='int8') + B = te.placeholder((m, l), name='B', dtype='int8') - k = tvm.reduce_axis((0, l), name='k') - C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype( + k = te.reduce_axis((0, l), name='k') + C = te.compute((n, m), lambda i, j: te.sum(A[i, k].astype('int32') * B[j, k].astype( 'int32'), axis=k), name='C') cfg = autotvm.get_config() - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) y, x = C.op.axis AA = s.cache_read(A, 'shared', [C]) @@ -56,10 +57,10 @@ def gemm_int8(n, m, l): s[CC].tensorize(ki, intrin_dp4a) - block_x = tvm.thread_axis('blockIdx.x') - block_y = tvm.thread_axis('blockIdx.y') - thread_x = tvm.thread_axis('threadIdx.x') - thread_y = tvm.thread_axis('threadIdx.y') + block_x = te.thread_axis('blockIdx.x') + block_y = te.thread_axis('blockIdx.y') + thread_x = te.thread_axis('threadIdx.x') + thread_y = te.thread_axis('threadIdx.y') def block_size_filter(entity): return entity.size[0] * 2 >= entity.size[1] * 2 and \ @@ -71,8 +72,8 @@ def block_size_filter(entity): s[C].bind(by, block_y) s[C].bind(bx, block_x) - s[C].bind(tyz, tvm.thread_axis('vthread')) - s[C].bind(txz, tvm.thread_axis('vthread')) + s[C].bind(tyz, te.thread_axis('vthread')) + s[C].bind(txz, te.thread_axis('vthread')) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi) diff --git a/topi/recipe/reduce/test_reduce_map.py b/topi/recipe/reduce/test_reduce_map.py index 1adc41374f9c..31f9bae7426c 100644 --- a/topi/recipe/reduce/test_reduce_map.py +++ b/topi/recipe/reduce/test_reduce_map.py @@ -16,6 +16,7 @@ # under the License. import os import tvm +from tvm import te from tvm.contrib import nvcc import numpy as np @@ -50,7 +51,7 @@ def tvm_callback_cuda_postproc(code): def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0): global TASK # Build the logic and compile the function - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") if type == "sum": TASK = "sum_map_id%d" %test_id B = topi.sum(A, axis=axis, keepdims=keepdims) @@ -63,7 +64,7 @@ def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0): else: raise NotImplementedError s = topi.cuda.schedule_reduce(B) - with tvm.build_config(auto_unroll_max_step=16, + with tvm.target.build_config(auto_unroll_max_step=16, auto_unroll_min_depth=0): fcuda = tvm.build(s, [A, B], "cuda", name="sum") diff --git a/topi/recipe/rnn/lstm.py b/topi/recipe/rnn/lstm.py index 0d7635d08288..4076eb6a4614 100644 --- a/topi/recipe/rnn/lstm.py +++ b/topi/recipe/rnn/lstm.py @@ -16,6 +16,7 @@ # under the License. 
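# A short sketch (not from the patch) of the renamed configuration context the
# recipes above switched to: the build options now live under tvm.target. The
# option values here are illustrative assumptions.
import tvm
from tvm import te

n = 128
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
s = te.create_schedule(B.op)
with tvm.target.build_config(auto_unroll_max_step=128, unroll_explicit=True):
    f = tvm.build(s, [A, B], "llvm")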
"""LSTM Example, still work in progress..""" import tvm +from tvm import te import os from tvm.contrib import nvcc import numpy as np @@ -58,52 +59,52 @@ def lstm(): num_thread_x = 16 * 3 // 2 num_sm = 24 n_num_step = 128 - num_step = tvm.var('num_step') + num_step = te.var('num_step') num_hidden = 1152 // 2 batch_size = 1 # Global transition matrix # Input hidden channel can be pre-caculated by a gemm - Xi2h = tvm.placeholder((num_step, batch_size, 4, num_hidden), name="Xi2h") + Xi2h = te.placeholder((num_step, batch_size, 4, num_hidden), name="Xi2h") # Only handle hidden transition, saves space. - Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h") + Wh2h = te.placeholder((4, num_hidden, num_hidden), name="Wh2h") # h: output hidden state, c: cell state. - s_state_h = tvm.placeholder((num_step, batch_size, num_hidden)) - s_state_c = tvm.placeholder((num_step, batch_size, num_hidden)) - s_init_c = tvm.compute((1, batch_size, num_hidden), + s_state_h = te.placeholder((num_step, batch_size, num_hidden)) + s_state_c = te.placeholder((num_step, batch_size, num_hidden)) + s_init_c = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") - s_init_h = tvm.compute((1, batch_size, num_hidden), + s_init_h = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") # LSTM transition - k = tvm.reduce_axis((0, num_hidden), name="ki2h") - s_h2h = tvm.compute( + k = te.reduce_axis((0, num_hidden), name="ki2h") + s_h2h = te.compute( (num_step, batch_size, 4, num_hidden), - lambda t, i, x, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), + lambda t, i, x, j: te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), name="s_h2h") # Gate rules - gates = tvm.compute(Xi2h.shape, lambda *i: + gates = te.compute(Xi2h.shape, lambda *i: Xi2h(*i) + s_h2h(*i), name="gates") gshape = (num_step, batch_size, num_hidden) - in_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 0, j]), name="in_gate") - in_transform = tvm.compute(gshape, lambda t, i, j: tvm.tanh(gates[t, i, 1, j]), name="in_transform") - forget_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 2, j]), name="forget_gate") - out_gate = tvm.compute(gshape, lambda t, i, j: tvm.sigmoid(gates[t, i, 3, j]), name="out_gate") - next_c = tvm.compute(gshape, + in_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, i, 0, j]), name="in_gate") + in_transform = te.compute(gshape, lambda t, i, j: te.tanh(gates[t, i, 1, j]), name="in_transform") + forget_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, i, 2, j]), name="forget_gate") + out_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, i, 3, j]), name="out_gate") + next_c = te.compute(gshape, lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j], name="next_c") - next_h = tvm.compute(gshape, - lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]), name="next_h") - update_c = tvm.compute(gshape, lambda *i: next_c(*i), name="update_c") - update_h = tvm.compute(gshape, lambda *i: next_h(*i), name="update_h") + next_h = te.compute(gshape, + lambda t, i, j: out_gate[t, i, j] * te.tanh(next_c[t, i, j]), name="next_h") + update_c = te.compute(gshape, lambda *i: next_c(*i), name="update_c") + update_h = te.compute(gshape, lambda *i: next_h(*i), name="update_h") # schedule - scan_h, scan_c = tvm.scan( + scan_h, scan_c = tvm.te.scan( [s_init_h, s_init_c], [update_h, update_c], [s_state_h, s_state_c], inputs=[Xi2h], name="lstm_scan") # schedule - s = 
tvm.create_schedule(scan_h.op) + s = te.create_schedule(scan_h.op) # Inline gate computations s[gates].compute_inline() s[in_gate].compute_inline() @@ -111,9 +112,9 @@ def lstm(): s[forget_gate].compute_inline() s[out_gate].compute_inline() - block_x = tvm.thread_axis((0, num_sm), "blockIdx.x") - thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x") - thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y") + block_x = te.thread_axis((0, num_sm), "blockIdx.x") + thread_x = te.thread_axis((0, num_thread_x), "threadIdx.x") + thread_y = te.thread_axis((0, num_thread_y), "threadIdx.y") s_state_h_S = s.cache_read(s_state_h, "shared", [s_h2h]) s_state_c_S = s.cache_read(s_state_c, "shared", [next_c]) @@ -187,7 +188,7 @@ def check_device(target): print("Time cost=%g" % eval_result.mean) # set unroll_explicit for more readable code. - with tvm.build_config( + with tvm.target.build_config( detect_global_barrier=DETECT_GLOBAL_BARRIER, auto_unroll_max_step=128, unroll_explicit=False): diff --git a/topi/recipe/rnn/matexp.py b/topi/recipe/rnn/matexp.py index 7466008d81e3..9991895ec8dc 100644 --- a/topi/recipe/rnn/matexp.py +++ b/topi/recipe/rnn/matexp.py @@ -24,6 +24,7 @@ ``` """ import tvm +from tvm import te import time import os import argparse @@ -62,25 +63,25 @@ def rnn_matexp(): n_batch_size = 4 detect_global_barrier = DETECT_GLOBAL_BARRIER - num_step = tvm.var("num_step") - num_hidden = tvm.convert(n_num_hidden) - batch_size = tvm.convert(n_batch_size) + num_step = te.var("num_step") + num_hidden = tvm.runtime.convert(n_num_hidden) + batch_size = tvm.runtime.convert(n_batch_size) num_thread_y = 8 num_thread_x = 16 * 3 num_sm = 24 - Whh = tvm.placeholder((num_hidden, num_hidden), name="Whh") - s_init = tvm.compute((1, batch_size, num_hidden), + Whh = te.placeholder((num_hidden, num_hidden), name="Whh") + s_init = te.compute((1, batch_size, num_hidden), lambda _, i, j: 1.0, name="init") - s_state = tvm.placeholder((num_step, batch_size, num_hidden)) - kh = tvm.reduce_axis((0, num_hidden), name="kh") - s_update = tvm.compute( + s_state = te.placeholder((num_step, batch_size, num_hidden)) + kh = te.reduce_axis((0, num_hidden), name="kh") + s_update = te.compute( (num_step, batch_size, num_hidden), - lambda t, i, j: tvm.sum(s_state[t-1, i, kh] * Whh[kh, j], axis=kh), + lambda t, i, j: te.sum(s_state[t-1, i, kh] * Whh[kh, j], axis=kh), name="update") - s_scan = tvm.scan(s_init, s_update, s_state) + s_scan = tvm.te.scan(s_init, s_update, s_state) # schedule - s = tvm.create_schedule(s_scan.op) + s = te.create_schedule(s_scan.op) CL = s_update SS = s.cache_read(s_state, "shared", [CL]) SL = s.cache_read(SS, "local", [CL]) @@ -88,9 +89,9 @@ def rnn_matexp(): ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y) CLF = s.rfactor(CL, ko) - block_x = tvm.thread_axis((0, num_sm), "blockIdx.x") - thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x") - thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y") + block_x = te.thread_axis((0, num_sm), "blockIdx.x") + thread_x = te.thread_axis((0, num_thread_x), "threadIdx.x") + thread_y = te.thread_axis((0, num_thread_y), "threadIdx.y") if PERSIST_KERNEL: s[s_scan.op].env_threads([block_x, thread_y, thread_x]) @@ -126,7 +127,7 @@ def rnn_matexp(): s[SS].bind(tx, thread_x) def check_device(target): - with tvm.build_config( + with tvm.target.build_config( detect_global_barrier=detect_global_barrier, auto_unroll_max_step=128, unroll_explicit=False): diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py index 
e03708c67f26..eeaf6325cec2 100644 --- a/topi/tests/python/common.py +++ b/topi/tests/python/common.py @@ -17,6 +17,7 @@ """Common utility for topi test""" import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity import topi diff --git a/topi/tests/python/test_fifo_buffer.py b/topi/tests/python/test_fifo_buffer.py index 34c389aad6c9..676c1f975c93 100644 --- a/topi/tests/python/test_fifo_buffer.py +++ b/topi/tests/python/test_fifo_buffer.py @@ -17,6 +17,7 @@ """Test code for FIFO buffer""" import tvm +from tvm import te import topi import topi.testing import numpy as np @@ -25,8 +26,8 @@ from common import get_all_backend def verify_fifo_buffer(buffer_shape, data_shape, axis, dtype='float32'): - buffer = tvm.placeholder(buffer_shape, name='buffer', dtype=dtype) - data = tvm.placeholder(data_shape, name='data', dtype=dtype) + buffer = te.placeholder(buffer_shape, name='buffer', dtype=dtype) + data = te.placeholder(data_shape, name='data', dtype=dtype) # Use memoize, pickle the test data for next time use @memoize('topi.tests.test_fifo_buffer') @@ -98,12 +99,12 @@ def verify_conv1d_integration(): dtype = 'float32' - inc_input = tvm.placeholder(inc_input_shape, name='inc_input', dtype=dtype) - input_window = tvm.placeholder(input_window_shape, name='input_window', dtype=dtype) - context = tvm.placeholder(context_shape, name='context', dtype=dtype) - kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=dtype) - inc_output = tvm.placeholder(inc_input_shape, name='inc_output', dtype=dtype) - output_window = tvm.placeholder(output_window_shape, name='output_window', dtype=dtype) + inc_input = te.placeholder(inc_input_shape, name='inc_input', dtype=dtype) + input_window = te.placeholder(input_window_shape, name='input_window', dtype=dtype) + context = te.placeholder(context_shape, name='context', dtype=dtype) + kernel = te.placeholder(kernel_shape, name='kernel', dtype=dtype) + inc_output = te.placeholder(inc_input_shape, name='inc_output', dtype=dtype) + output_window = te.placeholder(output_window_shape, name='output_window', dtype=dtype) # Use memoize, pickle the test data for next time use @memoize('topi.tests.test_fifo_buffer_conv1d_integration') diff --git a/topi/tests/python/test_topi_basic.py b/topi/tests/python/test_topi_basic.py index 53b29df4f36d..83f0469dc00f 100644 --- a/topi/tests/python/test_topi_basic.py +++ b/topi/tests/python/test_topi_basic.py @@ -15,20 +15,21 @@ # specific language governing permissions and limitations # under the License. 
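# A hedged sketch (the canonical cumulative-sum shape, not from the patch) of
# the tvm.te.scan recurrence that the LSTM and matexp recipes above rely on:
# an init tensor for step 0, an update that references state at t - 1, and a
# state placeholder tying the two together.
import tvm
from tvm import te

m, n = te.var("m"), te.var("n")
X = te.placeholder((m, n), name="X")
s_init = te.compute((1, n), lambda _, i: X[0, i])
s_state = te.placeholder((m, n))
s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
res = tvm.te.scan(s_init, s_update, s_state, inputs=[X])
sch = te.create_schedule(res.op)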
import tvm +from tvm import te import topi from topi import util def test_util(): - x = tvm.const(100, "int32") + x = tvm.tir.const(100, "int32") assert util.get_const_int(x) == 100 assert util.get_const_tuple((x, x)) == (100, 100) def test_ewise(): - m = tvm.var('m') - l = tvm.var('l') - A = tvm.placeholder((m, l), name='A') + m = te.var('m') + l = te.var('l') + A = te.placeholder((m, l), name='A') def test_apply(func, name): B = func(A) diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py index 1b38e9037fb9..b8c854746847 100644 --- a/topi/tests/python/test_topi_batch_matmul.py +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -17,6 +17,7 @@ """Test code for batch_matmul operator""" import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -31,8 +32,8 @@ } def verify_batch_matmul(batch, M, N, K): - x = tvm.placeholder((batch, M, K), name='x') - y = tvm.placeholder((batch, N, K), name='y') + x = te.placeholder((batch, M, K), name='x') + y = te.placeholder((batch, N, K), name='y') dtype = x.dtype # use memoize to pickle the test data for next time use diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py index 274743d274ae..44811d189189 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d.py +++ b/topi/tests/python/test_topi_bitserial_conv2d.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -33,8 +34,8 @@ def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, out_dtype = 'int32' with tvm.target.create('llvm'): - A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_dtype, name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_dtype, name='W') + A = te.placeholder((batch, in_channel, in_height, in_width), dtype=input_dtype, name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_dtype, name='W') B = topi.x86.bitserial_conv2d_nchw(A, W, stride, padding, activation_bits, weight_bits, input_dtype, out_dtype, unipolar) s = topi.x86.schedule_bitserial_conv2d_nchw([B]) @@ -71,8 +72,8 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, out_dtype='int32' with tvm.target.create('llvm'): - A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_dtype, name='A') - W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_dtype, name='W') + A = te.placeholder((batch, in_height, in_width, in_channel), dtype=input_dtype, name='A') + W = te.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_dtype, name='W') B = topi.x86.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, input_dtype, out_dtype, unipolar) s = topi.x86.schedule_bitserial_conv2d_nhwc([B]) diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py index 1f87785b4f48..99ba0dba8328 100644 --- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py +++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py @@ -18,6 +18,7 @@ import re import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -37,8 +38,8 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, device = 'llvm -device=arm_cpu 
-model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon' with tvm.target.create(device): - A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') - W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') + A = te.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A') + W = te.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W') B = topi.arm_cpu.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits, weight_bits, 'uint8', out_dtype, unipolar) s = topi.arm_cpu.schedule_bitserial_conv2d_nhwc([B]) diff --git a/topi/tests/python/test_topi_bitserial_dense.py b/topi/tests/python/test_topi_bitserial_dense.py index 505ce794312f..fbb20a663f3b 100644 --- a/topi/tests/python/test_topi_bitserial_dense.py +++ b/topi/tests/python/test_topi_bitserial_dense.py @@ -18,6 +18,7 @@ import os import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -54,8 +55,8 @@ def get_ref_data(a_shape, b_shape, input_dtype): print ("Skipped running code, not an arm device") continue input_dtype = 'uint8' if "arm_cpu" in target else "uint32" - A = tvm.placeholder((batch, in_dim), dtype=input_dtype, name='A') - B = tvm.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') + A = te.placeholder((batch, in_dim), dtype=input_dtype, name='A') + B = te.placeholder((out_dim, in_dim), dtype=input_dtype, name='B') fcompute, fschedule = topi.testing.dispatch(target, _bitserial_dense_implement) C = fcompute(A, B, activation_bits, weight_bits, input_dtype, out_dtype, unipolar) diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py index ce6a28643b58..275f34fd916e 100644 --- a/topi/tests/python/test_topi_bnn.py +++ b/topi/tests/python/test_topi_bnn.py @@ -17,19 +17,20 @@ """Test code for binary neural network operators.""" import numpy as np import tvm +from tvm import te import topi from topi.util import get_const_tuple from tvm.contrib.pickle_memoize import memoize def verify_binary_dense(batch, in_dim, out_dim): - A = tvm.placeholder((batch, in_dim), name='A') - B = tvm.placeholder((out_dim, in_dim), name='B') + A = te.placeholder((batch, in_dim), name='A') + B = te.placeholder((out_dim, in_dim), name='B') bnn_A = topi.nn.binarize_pack(A) bnn_B = topi.nn.binarize_pack(B) # binary dense - bnn_A1 = tvm.placeholder(bnn_A.shape, dtype=bnn_A.dtype) - bnn_B1 = tvm.placeholder(bnn_B.shape, dtype=bnn_B.dtype) + bnn_A1 = te.placeholder(bnn_A.shape, dtype=bnn_A.dtype) + bnn_B1 = te.placeholder(bnn_B.shape, dtype=bnn_B.dtype) bnn_C = topi.nn.binary_dense(bnn_A1, bnn_B1) # schedule with tvm.target.create('llvm'): diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index 2bea9b09bbf4..2fe00c7d4ec9 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -17,6 +17,7 @@ """Test code for broadcasting operators.""" import numpy as np import tvm +from tvm import te import topi import topi.testing from common import get_all_backend @@ -24,7 +25,7 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast): # Build the logic and compile the function - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = fbcast(A, out_shape) def check_device(device): @@ -54,13 +55,13 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape, rhs_min=-100, rhs_max=100, dtype="float32"): # Build the logic and 
compile the function - A = (tvm.var("A", dtype=dtype) if lhs_shape is None - else tvm.placeholder(shape=lhs_shape, name="A", dtype=dtype)) - B = (tvm.var("B", dtype=dtype) if rhs_shape is None - else tvm.placeholder(shape=rhs_shape, name="B", dtype=dtype)) + A = (te.var("A", dtype=dtype) if lhs_shape is None + else te.placeholder(shape=lhs_shape, name="A", dtype=dtype)) + B = (te.var("B", dtype=dtype) if rhs_shape is None + else te.placeholder(shape=rhs_shape, name="B", dtype=dtype)) C = ftopi(A, B) - if isinstance(A, tvm.expr.PrimExpr) and isinstance(B, tvm.expr.PrimExpr): - assert(isinstance(C, tvm.expr.PrimExpr)) + if isinstance(A, tvm.tir.PrimExpr) and isinstance(B, tvm.tir.PrimExpr): + assert(isinstance(C, tvm.tir.PrimExpr)) return def gen_operand(shape, low, high, ctx): @@ -240,10 +241,10 @@ def test_apply( dtype="bool", ): # Build the logic and compile the function - A = tvm.placeholder(shape=indata.shape, name="A", dtype=dtype) + A = te.placeholder(shape=indata.shape, name="A", dtype=dtype) B = func(A) - if isinstance(A, tvm.expr.PrimExpr): - assert (isinstance(B, tvm.expr.PrimExpr)) + if isinstance(A, tvm.tir.PrimExpr): + assert (isinstance(B, tvm.tir.PrimExpr)) return def check_device(device): @@ -280,11 +281,11 @@ def test_apply( dtype="int32", ): # Build the logic and compile the function - A = tvm.placeholder(shape=shape, name="A", dtype=dtype) + A = te.placeholder(shape=shape, name="A", dtype=dtype) B = func(A) - if isinstance(A, tvm.expr.PrimExpr): - assert (isinstance(B, tvm.expr.PrimExpr)) + if isinstance(A, tvm.tir.PrimExpr): + assert (isinstance(B, tvm.tir.PrimExpr)) return def check_device(device): @@ -322,11 +323,11 @@ def test_apply( dtype="bool", ): # Build the logic and compile the function - A = (tvm.var("A", dtype=dtype)) - B = (tvm.var("B", dtype=dtype)) + A = (te.var("A", dtype=dtype)) + B = (te.var("B", dtype=dtype)) C = func(A, B) - if isinstance(A, tvm.expr.PrimExpr) and isinstance(B, tvm.expr.PrimExpr): - assert (isinstance(C, tvm.expr.PrimExpr)) + if isinstance(A, tvm.tir.PrimExpr) and isinstance(B, tvm.tir.PrimExpr): + assert (isinstance(C, tvm.tir.PrimExpr)) return def check_device(device): diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index 74034ce30b0e..38617ee11443 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -17,6 +17,7 @@ """Test code for clip operator""" import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -25,9 +26,9 @@ from common import get_all_backend def verify_clip(N, a_min, a_max, dtype): - A = tvm.placeholder((N, N), dtype=dtype, name='A') + A = te.placeholder((N, N), dtype=dtype, name='A') B = topi.clip(A, a_min, a_max) - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) # use memoize to pickle the test data for next time use @memoize("topi.tests.test_topi_clip") diff --git a/topi/tests/python/test_topi_conv1d.py b/topi/tests/python/test_topi_conv1d.py index 6e55a574de4a..972a3f195a4f 100644 --- a/topi/tests/python/test_topi_conv1d.py +++ b/topi/tests/python/test_topi_conv1d.py @@ -18,6 +18,7 @@ import numpy as np import itertools import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -54,8 +55,8 @@ def verify_conv1d(batch, kernel_shape = [kernel_size, in_channels, filters] dtype = 'float32' - A = tvm.placeholder(in_shape, name='A', dtype=dtype) - W = tvm.placeholder(kernel_shape, name='W', dtype=dtype) + A = 
te.placeholder(in_shape, name='A', dtype=dtype) + W = te.placeholder(kernel_shape, name='W', dtype=dtype) def get_ref_data(layout): a_np = np.random.uniform(size=in_shape).astype(dtype) diff --git a/topi/tests/python/test_topi_conv1d_transpose_ncw.py b/topi/tests/python/test_topi_conv1d_transpose_ncw.py index 64af254adc7d..4d015bf53321 100644 --- a/topi/tests/python/test_topi_conv1d_transpose_ncw.py +++ b/topi/tests/python/test_topi_conv1d_transpose_ncw.py @@ -18,6 +18,7 @@ import numpy as np import itertools import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -31,8 +32,8 @@ def verify_conv1d_transpose_ncw(batch, in_channel, in_size, num_filter, kernel, stride, padding): in_width = in_size - A = tvm.placeholder((batch, in_channel, in_width), name='A') - W = tvm.placeholder((in_channel, num_filter, kernel), name='W') + A = te.placeholder((batch, in_channel, in_width), name='A') + W = te.placeholder((in_channel, num_filter, kernel), name='W') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py index 8a74b4f06cd2..a072d2abdafc 100644 --- a/topi/tests/python/test_topi_conv2d_NCHWc.py +++ b/topi/tests/python/test_topi_conv2d_NCHWc.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi import topi.testing @@ -71,9 +72,9 @@ def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride, ic_block = bn break - A = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='A') - W = tvm.placeholder((num_filter//oc_block, in_channel//ic_block, kernel, kernel, ic_block, oc_block), name='W') - bias = tvm.placeholder((num_filter//oc_block, 1, 1, oc_block), name='bias') + A = te.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='A') + W = te.placeholder((num_filter//oc_block, in_channel//ic_block, kernel, kernel, ic_block, oc_block), name='W') + bias = te.placeholder((num_filter//oc_block, 1, 1, oc_block), name='bias') @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc") def get_ref_data(): diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py index 086523e46013..41192bd45deb 100644 --- a/topi/tests/python/test_topi_conv2d_hwcn.py +++ b/topi/tests/python/test_topi_conv2d_hwcn.py @@ -18,6 +18,7 @@ import os import numpy as np import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -33,9 +34,9 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size - A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A') - W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') - B = tvm.placeholder((1, num_filter, 1), name='bias') + A = te.placeholder((in_height, in_width, in_channel, batch), name='A') + W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W') + B = te.placeholder((1, num_filter, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py index c36bfa331faf..d784e5cd3f86 100644 --- a/topi/tests/python/test_topi_conv2d_int8.py +++ b/topi/tests/python/test_topi_conv2d_int8.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te 
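Alongside the `te` renames, the scalar and expression layer moves under `tvm.tir`: `tvm.const` becomes `tvm.tir.const` and `tvm.expr.PrimExpr` becomes `tvm.tir.PrimExpr`, as the basic and broadcast tests above exercise. A short sketch of that side of the split (same assumed post-refactor layout):

import tvm
from tvm import te

c = tvm.tir.const(100, 'int32')            # was tvm.const(100, "int32")
assert isinstance(c, tvm.tir.PrimExpr)     # base class was tvm.expr.PrimExpr

# When both operands of a broadcast op are scalar expressions, the result is
# itself a PrimExpr, which is exactly what the rewritten asserts above check:
x = te.var('x', dtype='float32')
y = x + tvm.tir.const(1.0, 'float32')
assert isinstance(y, tvm.tir.PrimExpr)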
from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity import topi @@ -38,9 +39,9 @@ def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, str in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') - bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', + A = te.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8') + bias = te.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', dtype='int8') a_shape = get_const_tuple(A.shape) diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py index a0258ec93bf2..d42c8c7c24c0 100644 --- a/topi/tests/python/test_topi_conv2d_nchw.py +++ b/topi/tests/python/test_topi_conv2d_nchw.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi import topi.testing @@ -36,9 +37,9 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - bias = tvm.placeholder((num_filter, 1, 1), name='bias') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py index 2a5915ef0a53..814fd45e0636 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc.py +++ b/topi/tests/python/test_topi_conv2d_nhwc.py @@ -18,6 +18,7 @@ import os import numpy as np import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -37,8 +38,8 @@ def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size - A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A') - W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') + A = te.placeholder((batch, in_height, in_width, in_channel), name='A') + W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py b/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py index 8267aad382e8..a5d532c4e016 100644 --- a/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py +++ b/topi/tests/python/test_topi_conv2d_nhwc_pack_int8.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity import topi @@ -30,8 +31,8 @@ def verify_conv2d_1x1_nhwc_pack_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1): in_height = in_width = in_size - A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A', dtype='uint8') - W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W', dtype='int8') + A = te.placeholder((batch, in_height, in_width, in_channel), name='A', 
dtype='uint8') + W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W', dtype='int8') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index e8aabc61a4fa..e8e1fce97eb1 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -17,6 +17,7 @@ """Test code for transposed convolution.""" import numpy as np import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -38,8 +39,8 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride_height, stride_width = stride pad_top, pad_left, pad_bottom, pad_right = padding - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((in_channel, num_filter, kernel_height, kernel_width), name='W') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') + W = te.placeholder((in_channel, num_filter, kernel_height, kernel_width), name='W') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py index 2d12336e771a..cfbc30063d09 100644 --- a/topi/tests/python/test_topi_conv2d_winograd.py +++ b/topi/tests/python/test_topi_conv2d_winograd.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity import topi @@ -42,9 +43,9 @@ def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, p in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - bias = tvm.placeholder((num_filter, 1, 1), name='bias') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv3d_ncdhw.py b/topi/tests/python/test_topi_conv3d_ncdhw.py index 6c60c27ed426..33e791716e34 100644 --- a/topi/tests/python/test_topi_conv3d_ncdhw.py +++ b/topi/tests/python/test_topi_conv3d_ncdhw.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi import topi.testing @@ -40,9 +41,9 @@ def verify_conv3d_ncdhw(batch, in_channel, in_size, num_filter, kernel, stride, in_depth = in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel, kernel), name='W') - bias = tvm.placeholder((num_filter, 1, 1, 1), name='bias') + A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel, kernel, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_conv3d_ndhwc.py b/topi/tests/python/test_topi_conv3d_ndhwc.py index 7e2f02cea20a..8526bb1fc90a 100644 --- a/topi/tests/python/test_topi_conv3d_ndhwc.py +++ b/topi/tests/python/test_topi_conv3d_ndhwc.py @@ -18,6 +18,7 @@ import os import numpy 
as np import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -41,8 +42,8 @@ def verify_conv3d_ndhwc(batch, in_channel, in_size, num_filter, kernel, stride, else: kernel_depth = kernel_height = kernel_width = kernel - A = tvm.placeholder((batch, in_depth, in_height, in_width, in_channel), name='A') - W = tvm.placeholder((kernel_depth, kernel_height, kernel_width, in_channel, num_filter), name='W') + A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name='A') + W = te.placeholder((kernel_depth, kernel_height, kernel_width, in_channel, num_filter), name='W') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) diff --git a/topi/tests/python/test_topi_deformable_conv2d.py b/topi/tests/python/test_topi_deformable_conv2d.py index 1b1a0327a3d5..a88525407e27 100644 --- a/topi/tests/python/test_topi_deformable_conv2d.py +++ b/topi/tests/python/test_topi_deformable_conv2d.py @@ -16,6 +16,7 @@ # under the License. import numpy as np import tvm +from tvm import te from tvm import autotvm import topi import topi.testing @@ -34,11 +35,11 @@ def verify_deformable_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, deformable_groups, groups)) - A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A') + A = te.placeholder((batch, in_channel, in_size, in_size), name='A') out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1 - Offset = tvm.placeholder((batch, deformable_groups * kernel * kernel * 2, out_size, out_size), name='offset') - W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W') - bias = tvm.placeholder((num_filter, 1, 1), name='bias') + Offset = te.placeholder((batch, deformable_groups * kernel * kernel * 2, out_size, out_size), name='offset') + W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) offset_shape = get_const_tuple(Offset.shape) diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py index d729e4330e52..7498c004c8dd 100644 --- a/topi/tests/python/test_topi_dense.py +++ b/topi/tests/python/test_topi_dense.py @@ -17,6 +17,7 @@ """Test code for dense operator""" import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -38,9 +39,9 @@ } def verify_dense(batch, in_dim, out_dim, use_bias=True): - A = tvm.placeholder((batch, in_dim), name='A') - B = tvm.placeholder((out_dim, in_dim), name='B') - C = tvm.placeholder((out_dim,), name='C') + A = te.placeholder((batch, in_dim), name='A') + B = te.placeholder((out_dim, in_dim), name='B') + C = te.placeholder((out_dim,), name='C') dtype = A.dtype # use memoize to pickle the test data for next time use @@ -83,9 +84,9 @@ def check_device(device): def verify_dense_int8(batch, in_dim, out_dim, use_bias=True): dtype = 'int8' out_dtype = 'int32' - A = tvm.placeholder((batch, in_dim), name='A', dtype=dtype) - B = tvm.placeholder((out_dim, in_dim), name='B', dtype=dtype) - C = tvm.placeholder((out_dim,), name='C', dtype=out_dtype) + A = te.placeholder((batch, in_dim), name='A', dtype=dtype) + B = te.placeholder((out_dim, in_dim), name='B', dtype=dtype) + C = te.placeholder((out_dim,), name='C', dtype=out_dtype) # use memoize to pickle the test data for next 
time use @memoize("topi.tests.test_topi_dense_int8") diff --git a/topi/tests/python/test_topi_depth_to_space.py b/topi/tests/python/test_topi_depth_to_space.py index 693bfb624042..b21eb9773c32 100644 --- a/topi/tests/python/test_topi_depth_to_space.py +++ b/topi/tests/python/test_topi_depth_to_space.py @@ -17,6 +17,7 @@ """Test code for depth to space""" import numpy as np import tvm +from tvm import te import topi import topi.testing @@ -37,7 +38,7 @@ def verify_depth_to_space(block_size, batch, in_channel, in_height, in_width, la else: raise NotImplementedError('Layout not supported {}'.format(layout)) - A = tvm.placeholder(in_shape, name='A', dtype='float32') + A = te.placeholder(in_shape, name='A', dtype='float32') dtype = A.dtype a_np = np.random.uniform(size=in_shape).astype(dtype) diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py index 7efe5a21578c..693348918d3e 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d.py +++ b/topi/tests/python/test_topi_depthwise_conv2d.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te from tvm import autotvm import topi import topi.testing @@ -58,10 +59,10 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu padding_args = padding # placeholder - Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input') - Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') - Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') - Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') + Input = te.placeholder((batch, in_channel, in_height, in_width), name='Input') + Filter = te.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') + Scale = te.placeholder((in_channel * channel_multiplier,), name='Scale') + Shift = te.placeholder((in_channel * channel_multiplier,), name='Shift') dtype = 'float32' @@ -161,10 +162,10 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu padding_args = padding # placeholder - Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input') - Filter = tvm.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter') - Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') - Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') + Input = te.placeholder((batch, in_height, in_width, in_channel), name='Input') + Filter = te.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter') + Scale = te.placeholder((in_channel * channel_multiplier,), name='Scale') + Shift = te.placeholder((in_channel * channel_multiplier,), name='Shift') dtype = 'float32' @@ -289,8 +290,8 @@ def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_m break # placeholder - Input = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='Input') - Filter = tvm.placeholder((out_channel//oc_block, 1, filter_height, filter_width, 1, oc_block), name='Filter') + Input = te.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='Input') + Filter = te.placeholder((out_channel//oc_block, 1, filter_height, filter_width, 1, oc_block), name='Filter') in_layout = "NCHW%dc" % ic_block out_layout = "NCHW%dc" % oc_block dtype = 'float32' diff 
--git a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py index ad44429612ce..aac0cd523b0b 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py +++ b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import topi import numpy as np from tvm.contrib.pickle_memoize import memoize @@ -40,8 +41,8 @@ def verify_depthwise_conv2d_back_input(batch, in_channel, in_h, channel_multipli oshape = [batch, out_h, out_w, out_channel] # placeholder - Out_grad = tvm.placeholder(oshape, name='Out_grad') - Filter = tvm.placeholder((filter_h, filter_w, filter_channel, channel_multiplier)) + Out_grad = te.placeholder(oshape, name='Out_grad') + Filter = te.placeholder((filter_h, filter_w, filter_channel, channel_multiplier)) # declare In_grad = topi.nn.depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, stride=[stride_h, stride_w], padding=[padding_h, padding_w]) diff --git a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py index 2e09e675c502..4602d098bf91 100644 --- a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py +++ b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import topi import topi.testing import numpy as np @@ -40,8 +41,8 @@ def verify_depthwise_conv2d_back_weight(batch, in_channel, in_h, channel_multipl fshape = [filter_h, filter_w, in_channel, channel_multiplier] # placeholder - Out_grad = tvm.placeholder(oshape, name='Out_grad') - Input = tvm.placeholder((batch, in_h, in_w, in_channel), name='In_grad') + Out_grad = te.placeholder(oshape, name='Out_grad') + Input = te.placeholder((batch, in_h, in_w, in_channel), name='In_grad') # declare Weight_grad = topi.nn.depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, stride=[stride_h, stride_w], padding=[padding_h, padding_w]) diff --git a/topi/tests/python/test_topi_dilate.py b/topi/tests/python/test_topi_dilate.py index 24988212e52a..1e69383238c7 100644 --- a/topi/tests/python/test_topi_dilate.py +++ b/topi/tests/python/test_topi_dilate.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
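One idiom these hunks scroll past unchanged: reference data is cached with `tvm.contrib.pickle_memoize.memoize`, so only the placeholder declarations needed touching. A minimal sketch of that caching pattern (the cache key and function below are illustrative, not from the patch):

import numpy as np
from tvm.contrib.pickle_memoize import memoize

@memoize('topi.tests.example_relu')          # pickles results, keyed per arguments
def get_ref_data(shape):
    a_np = np.random.uniform(size=shape).astype('float32')
    b_np = np.maximum(a_np, 0.0)             # reference computed once, reused on reruns
    return a_np, b_np

a_np, b_np = get_ref_data((4, 4))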
import tvm +from tvm import te import topi import topi.testing import numpy as np @@ -25,9 +26,9 @@ def test_dilate(): ctx = tvm.cpu(0) def _test_dilate(input_size, strides): - Input = tvm.placeholder((input_size)) + Input = te.placeholder((input_size)) Output = topi.nn.dilate(Input, strides) - schedule = tvm.create_schedule(Output.op) + schedule = te.create_schedule(Output.op) input_np = np.random.uniform(size=input_size).astype(Input.dtype) output_np = topi.testing.dilate_python(input_np, strides) input_tvm = tvm.nd.array(input_np, ctx=ctx) diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py index 3904db7d2b23..6909bbee8bb0 100644 --- a/topi/tests/python/test_topi_group_conv2d.py +++ b/topi/tests/python/test_topi_group_conv2d.py @@ -18,6 +18,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.autotvm.task.space import FallbackConfigEntity import topi @@ -41,9 +42,9 @@ def verify_group_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, str in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - W = tvm.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W') - bias = tvm.placeholder((num_filter, 1, 1), name='bias') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') + W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W') + bias = te.placeholder((num_filter, 1, 1), name='bias') a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @@ -112,9 +113,9 @@ def verify_group_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kerne in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') - W = tvm.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W', dtype='int8') - bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', + A = te.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8') + W = te.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W', dtype='int8') + bias = te.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias', dtype='int8') a_shape = get_const_tuple(A.shape) diff --git a/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py b/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py index 08f136e5ae23..0fd4205eef64 100644 --- a/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py +++ b/topi/tests/python/test_topi_group_conv2d_NCHWc_int8.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi import topi.testing @@ -61,8 +62,8 @@ def verify_group_conv2d_NCHWc_int8(batch, in_channel, groups, in_size, num_filte ic_block = 8 autotvm.DispatchContext.current.silent = True - A = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='A', dtype='uint8') - W = tvm.placeholder((num_filter//oc_block, in_channel//ic_block//groups, kernel, kernel, ic_block//4, oc_block, 4), name='W', dtype='int8') + A = te.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='A', dtype='uint8') + W = te.placeholder((num_filter//oc_block, in_channel//ic_block//groups, kernel, kernel, ic_block//4, oc_block, 4), name='W', dtype='int8') @memoize("topi.tests.test_topi_conv2d_NCHWc_int8.verify_conv2d_NCHWc_int8") def get_ref_data(): diff --git a/topi/tests/python/test_topi_image.py 
b/topi/tests/python/test_topi_image.py index 4297638b3dfe..4eea75d68d28 100644 --- a/topi/tests/python/test_topi_image.py +++ b/topi/tests/python/test_topi_image.py @@ -17,6 +17,7 @@ """Test code for bilinear scale """ import numpy as np import tvm +from tvm import te import topi import topi.testing @@ -25,12 +26,12 @@ def verify_resize(batch, in_channel, in_height, in_width, out_height, out_width, layout='NCHW', coord_trans="align_corners", method="bilinear"): if layout == 'NCHW': - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='float32') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='float32') dtype = A.dtype out_shape = (batch, in_channel, out_height, out_width) a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype) elif layout == 'NHWC': - A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A', dtype='float32') + A = te.placeholder((batch, in_height, in_width, in_channel), name='A', dtype='float32') dtype = A.dtype out_shape = (batch, out_height, out_width, in_channel) a_np = np.random.uniform(size=(batch, in_height, in_width, in_channel)).astype(dtype) @@ -84,12 +85,12 @@ def test_resize(): def verify_resize3d(batch, in_channel, in_depth, in_height, in_width, out_depth, out_height, out_width, layout='NCDHW', coordinate_transformation_mode="half_pixel", method="trilinear"): if layout == 'NCDHW': - A = tvm.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A', dtype='float32') + A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A', dtype='float32') dtype = A.dtype out_shape = (batch, in_channel, out_depth, out_height, out_width) a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height, in_width)).astype(dtype) elif layout == 'NDHWC': - A = tvm.placeholder((batch, in_depth, in_height, in_width, in_channel), name='A', dtype='float32') + A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name='A', dtype='float32') dtype = A.dtype out_shape = (batch, out_depth, out_height, out_width, in_channel) a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width, in_channel)).astype(dtype) @@ -146,10 +147,10 @@ def test_crop_and_resize(): def verify_crop_and_resize(image_shape, np_boxes, np_box_indices, np_crop_size, layout='NHWC', method="bilinear", extrapolation_value=0.0): - images = tvm.placeholder(image_shape, name='images', dtype='float32') + images = te.placeholder(image_shape, name='images', dtype='float32') np_images = np.random.uniform(size=image_shape).astype("float32") - boxes = tvm.placeholder(np_boxes.shape, name="boxes", dtype="float32") - box_ind = tvm.placeholder(np_box_indices.shape, name="box_ind", dtype="int32") + boxes = te.placeholder(np_boxes.shape, name="boxes", dtype="float32") + box_ind = te.placeholder(np_box_indices.shape, name="box_ind", dtype="int32") batch = len(np_box_indices) target_height, target_width = np_crop_size[0], np_crop_size[1] diff --git a/topi/tests/python/test_topi_lrn.py b/topi/tests/python/test_topi_lrn.py index 4cb3c7581800..7e003a7a52b2 100644 --- a/topi/tests/python/test_topi_lrn.py +++ b/topi/tests/python/test_topi_lrn.py @@ -17,6 +17,7 @@ """Test code for local response normalization""" import numpy as np import tvm +from tvm import te import topi from topi.util import get_const_tuple import topi.testing @@ -32,7 +33,7 @@ } def verify_lrn(shape, size, axis, bias, alpha, beta): - A = tvm.placeholder(shape, name='A') + A = te.placeholder(shape, 
name='A') B = topi.nn.lrn(A, size, axis, alpha, beta, bias) dtype = A.dtype diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index debc3efe0d27..30a0f44aad57 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -17,6 +17,7 @@ import numpy as np import scipy import tvm +from tvm import te import topi import topi.testing from topi import util @@ -24,7 +25,7 @@ def test_util(): - x = tvm.const(100, "int32") + x = tvm.tir.const(100, "int32") assert util.get_const_int(x) == 100 assert util.get_const_tuple((x, x)) == (100, 100) @@ -37,13 +38,13 @@ def test_apply( low, high, shape=(20, 3), - dtype=tvm.float32, + dtype="float32", check_round=False, skip_name_check=False, ): - m = tvm.var("m") - l = tvm.var("l") - A = tvm.placeholder((m, l), dtype=dtype, name="A") + m = te.var("m") + l = te.var("l") + A = te.placeholder((m, l), dtype=dtype, name="A") B = func(A) assert tuple(B.shape) == tuple(A.shape) @@ -76,13 +77,13 @@ def test_isnan( low, high, shape=(20, 3), - dtype=tvm.float32, + dtype="float32", check_round=False, skip_name_check=False, ): - m = tvm.var("m") - l = tvm.var("l") - A = tvm.placeholder((m, l), dtype=dtype, name="A") + m = te.var("m") + l = te.var("l") + A = te.placeholder((m, l), dtype=dtype, name="A") B = topi.isnan(A) assert tuple(B.shape) == tuple(A.shape) @@ -134,7 +135,7 @@ def check_device(device): def test_cast(): def verify(from_dtype, to_dtype, low=-100, high=100): shape = (5, 4) - A = tvm.placeholder(shape, dtype=from_dtype, name="A") + A = te.placeholder(shape, dtype=from_dtype, name="A") B = topi.cast(A, to_dtype) if from_dtype == "bool": @@ -177,11 +178,11 @@ def test_apply( low, high, step, - dtype=tvm.float32 + dtype="float32" ): a_np = np.arange(low, high, step).astype(dtype) b_np = f_numpy(a_np) - A = tvm.placeholder(a_np.shape, dtype=dtype, name="A") + A = te.placeholder(a_np.shape, dtype=dtype, name="A") B = func(A) assert tuple(B.shape) == tuple(A.shape) diff --git a/topi/tests/python/test_topi_matmul.py b/topi/tests/python/test_topi_matmul.py index c712970945fc..0c0a365688b3 100644 --- a/topi/tests/python/test_topi_matmul.py +++ b/topi/tests/python/test_topi_matmul.py @@ -16,6 +16,7 @@ # under the License. 
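test_topi_math.py also carries the one non-mechanical change in this file set: default dtypes switch from the removed module attribute `tvm.float32` to the plain string 'float32', which is how the refactored API spells dtypes everywhere. Sketch (the helper name is hypothetical):

from tvm import te

# was: def test_apply(..., dtype=tvm.float32, ...)
def make_input(shape=(20, 3), dtype='float32'):
    return te.placeholder(shape, dtype=dtype, name='A')

A = make_input()
assert A.dtype == 'float32'                  # dtypes round-trip as plain strings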
import numpy as np import tvm +from tvm import te import topi from topi.util import get_const_tuple @@ -27,12 +28,12 @@ def with_tvm(lam, *args): pls = [] # placeholders vals_nd = [] # initial values for i,arg in enumerate(args): - pls.append(tvm.placeholder(arg.shape, name='pl'+str(i))) + pls.append(te.placeholder(arg.shape, name='pl'+str(i))) vals_nd.append(tvm.nd.array(arg, ctx)) out = lam(*pls) out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx) - s = tvm.create_schedule([out.op]) + s = te.create_schedule([out.op]) m = tvm.build(s, pls + [out], "llvm") m(*(vals_nd+[out_nd])) return out_nd.asnumpy() diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index 084a2c7c7671..64f0841274e2 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -18,6 +18,7 @@ import math import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -48,7 +49,7 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_ sw = sh pt, pl, pb, pr = padding layout = "NCHW" - A = tvm.placeholder((n, ic, ih, iw), name='A') + A = te.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout="NCHW", count_include_pad=count_include_pad) @@ -112,7 +113,7 @@ def verify_pool_grad(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_inc sw = sh pt, pl, pb, pr = padding layout = "NCHW" - A = tvm.placeholder((n, ic, ih, iw), name='A') + A = te.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout="NCHW", count_include_pad=count_include_pad) @@ -126,7 +127,7 @@ def verify_pool_grad(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_inc else: assert bshape[2] == int(math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1) assert bshape[3] == int(math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1) - OutGrad = tvm.placeholder(bshape, name='OutGrad') + OutGrad = te.placeholder(bshape, name='OutGrad') PoolGrad = topi.nn.pool_grad(OutGrad, A, kernel=[kh, kw], stride=[sh, sw], padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout="NCHW", count_include_pad=count_include_pad) @@ -202,7 +203,7 @@ def test_pool_grad(): def verify_global_pool(n, c, h, w, pool_type, layout='NCHW'): assert layout in ["NCHW", "NHWC"] - A = tvm.placeholder((n, c, h, w), name='A') + A = te.placeholder((n, c, h, w), name='A') B = topi.nn.global_pool(A, pool_type=pool_type, layout=layout) B = topi.nn.relu(B) @@ -268,7 +269,7 @@ def end_index(index, odim, idim): l_sl = slice(l_start, l_end) np_out[i, j, k, l] = np_op(np_data[i, j, k_sl, l_sl]) - data = tvm.placeholder(dshape, name="data", dtype=dtype) + data = te.placeholder(dshape, name="data", dtype=dtype) out = topi.nn.adaptive_pool(data, out_size, pool_type, layout) def check_device(device): ctx = tvm.context(device, 0) @@ -302,7 +303,7 @@ def verify_pool3d(n, ic, ih, kh, sh, padding, pool_type, input_shape = (n, ic, id, ih, iw) kernel = [kd, kh, kw] stride = [sd, sh, sw] - A = tvm.placeholder(input_shape, name='A') + A = te.placeholder(input_shape, name='A') B = topi.nn.pool3d(A, kernel=kernel, stride=stride, padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout=layout, count_include_pad=count_include_pad) @@ -355,7 +356,7 @@ def verify_pool1d(n, ic, iw, kw, sw, padding, pool_type, 
input_shape = (n, ic, iw) kernel = [kw] stride = [sw] - A = tvm.placeholder(input_shape, name='A') + A = te.placeholder(input_shape, name='A') B = topi.nn.pool1d(A, kernel=kernel, stride=stride, padding=padding, pool_type=pool_type, ceil_mode=ceil_mode, layout=layout, count_include_pad=count_include_pad) diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py index 751025bf82b8..cc84fe006f64 100644 --- a/topi/tests/python/test_topi_reduce.py +++ b/topi/tests/python/test_topi_reduce.py @@ -18,6 +18,7 @@ import os import numpy as np import tvm +from tvm import te import topi import topi.testing @@ -46,7 +47,7 @@ def _my_npy_argmin(arr, axis, keepdims): def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum", dtype="float32"): # Build the logic and compile the function - A = tvm.placeholder(shape=in_shape, name="A", dtype=dtype) + A = te.placeholder(shape=in_shape, name="A", dtype=dtype) A1 = topi.sqrt(topi.exp(A)) out_dtype = dtype if type == "sum": diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index 8ef354907691..4d4166ff6487 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -18,6 +18,7 @@ import os import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -26,7 +27,7 @@ from common import get_all_backend def verify_relu(m, n, dtype="float32"): - A = tvm.placeholder((m, n), name='A', dtype=dtype) + A = te.placeholder((m, n), name='A', dtype=dtype) B = topi.nn.relu(A) a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype) @@ -55,9 +56,9 @@ def check_device(device): def verify_leaky_relu(m, alpha): - A = tvm.placeholder((m,), name='A') + A = te.placeholder((m,), name='A') B = topi.nn.leaky_relu(A, alpha) - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha @@ -70,8 +71,8 @@ def verify_leaky_relu(m, alpha): def verify_prelu(x, w, axis, weight_reshape): - X = tvm.placeholder((x), name='X') - W = tvm.placeholder((w), name='W') + X = te.placeholder((x), name='X') + W = te.placeholder((w), name='W') x_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(X.shape)).astype(X.dtype) w_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(W.shape)).astype(W.dtype) @@ -79,7 +80,7 @@ def _prelu_numpy(x, W): return (x < 0) * (x *W.reshape(weight_reshape)) + (x>=0) * x B = topi.nn.prelu(X, W, axis) - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) ctx = tvm.cpu(0) x_tvm = tvm.nd.array(x_np, ctx) diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py index c4cd2b5d0eb8..09c2f2f966de 100644 --- a/topi/tests/python/test_topi_reorg.py +++ b/topi/tests/python/test_topi_reorg.py @@ -19,6 +19,7 @@ import topi from topi.util import get_const_tuple import tvm +from tvm import te import topi.testing _reorg_schedule = { @@ -30,7 +31,7 @@ def verify_reorg(batch, in_size, in_channel, stride): '''Verify reorg operator by comparing outputs from tvm and numpy implementation''' in_height = in_width = in_size - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') B = topi.vision.reorg(A, stride) a_shape = get_const_tuple(A.shape) diff --git a/topi/tests/python/test_topi_softmax.py 
b/topi/tests/python/test_topi_softmax.py index 5396b6beef81..485738700300 100644 --- a/topi/tests/python/test_topi_softmax.py +++ b/topi/tests/python/test_topi_softmax.py @@ -18,6 +18,7 @@ import os import numpy as np import tvm +from tvm import te import topi import topi.testing import logging @@ -50,10 +51,10 @@ def check_device(A, B, a_np, b_np, device, name): tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) def verify_softmax(m, n, dtype="float32"): - A = tvm.placeholder((m, n), dtype=dtype, name='A') + A = te.placeholder((m, n), dtype=dtype, name='A') B = topi.nn.softmax(A) # confirm lower works - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) @@ -63,7 +64,7 @@ def verify_softmax(m, n, dtype="float32"): check_device(A, B, a_np, b_np, device, "softmax") def verify_softmax_4d(shape, dtype="float32"): - A = tvm.placeholder(shape, dtype=dtype, name='A') + A = te.placeholder(shape, dtype=dtype, name='A') B = topi.nn.softmax(A, axis=1) _, c, h, w = shape @@ -81,10 +82,10 @@ def test_softmax(): verify_softmax_4d((1, 16, 256, 256)) def verify_log_softmax(m, n, dtype="float32"): - A = tvm.placeholder((m, n), dtype=dtype, name='A') + A = te.placeholder((m, n), dtype=dtype, name='A') B = topi.nn.log_softmax(A) # confirm lower works - s = tvm.create_schedule([B.op]) + s = te.create_schedule([B.op]) tvm.lower(s, [A, B], simple_mode=True) a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype) b_np = topi.testing.log_softmax_python(a_np) diff --git a/topi/tests/python/test_topi_sort.py b/topi/tests/python/test_topi_sort.py index 74e55ec248d9..2728733e2394 100644 --- a/topi/tests/python/test_topi_sort.py +++ b/topi/tests/python/test_topi_sort.py @@ -18,6 +18,7 @@ from __future__ import print_function import numpy as np import tvm +from tvm import te import topi import topi.testing @@ -34,7 +35,7 @@ def verify_argsort(axis, is_ascend): dshape = (20, 100) data_dtype = "float32" - data = tvm.placeholder(dshape, name="data", dtype=data_dtype) + data = te.placeholder(dshape, name="data", dtype=data_dtype) perm = np.arange(dshape[0] * dshape[1], dtype=data_dtype) np.random.shuffle(perm) @@ -74,7 +75,7 @@ def check_device(device): def verify_topk(k, axis, ret_type, is_ascend, dtype): shape = (20, 100) data_dtype = "float32" - data = tvm.placeholder(shape, name="data", dtype=data_dtype) + data = te.placeholder(shape, name="data", dtype=data_dtype) np_data = np.random.uniform(size=shape).astype(data_dtype) if is_ascend: diff --git a/topi/tests/python/test_topi_space_to_depth.py b/topi/tests/python/test_topi_space_to_depth.py index 99a798e733ee..11a009d3fde9 100644 --- a/topi/tests/python/test_topi_space_to_depth.py +++ b/topi/tests/python/test_topi_space_to_depth.py @@ -17,6 +17,7 @@ """Test code for space to depth""" import numpy as np import tvm +from tvm import te import topi import topi.testing @@ -37,7 +38,7 @@ def verify_space_to_depth(block_size, batch, in_channel, in_height, in_width, la else: raise NotImplementedError('Layout not supported {}'.format(layout)) - A = tvm.placeholder(in_shape, name='A', dtype='float32') + A = te.placeholder(in_shape, name='A', dtype='float32') dtype = A.dtype a_np = np.random.uniform(size=in_shape).astype(dtype) diff --git a/topi/tests/python/test_topi_sparse.py b/topi/tests/python/test_topi_sparse.py index fc2885999997..fc2d26b82842 100644 --- a/topi/tests/python/test_topi_sparse.py +++ 
b/topi/tests/python/test_topi_sparse.py @@ -17,6 +17,7 @@ """Test code for sparse operator""" import numpy as np import tvm +from tvm import te import topi import topi.testing from topi.util import get_const_tuple @@ -26,13 +27,13 @@ import scipy.sparse as sp def verify_dynamic_csrmv(batch, in_dim, out_dim, use_bias=True): - nr, nc, n = tvm.var("nr"), tvm.var("nc"), tvm.var("n") + nr, nc, n = te.var("nr"), te.var("nc"), te.var("n") dtype = 'float32' A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name='A') - B = tvm.placeholder((in_dim, 1), name='B') - C = tvm.placeholder((nr,), name='C') + B = te.placeholder((in_dim, 1), name='B') + C = te.placeholder((nr,), name='C') D = topi.sparse.csrmv(A, B, C if use_bias else None) - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) dtype = A.dtype # get the test data @@ -70,13 +71,13 @@ def check_device(device): check_device(device) def verify_dynamic_csrmm(batch, in_dim, out_dim, use_bias=True): - nr, nc, n = tvm.var("nr"), tvm.var("nc"), tvm.var("n") + nr, nc, n = te.var("nr"), te.var("nc"), te.var("n") dtype = 'float32' A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name='A') - B = tvm.placeholder((in_dim, out_dim), name='B') - C = tvm.placeholder((nr,), name='C') + B = te.placeholder((in_dim, out_dim), name='B') + C = te.placeholder((nr,), name='C') D = topi.sparse.csrmm(A, B, C if use_bias else None) - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) dtype = A.dtype # get the test data @@ -112,12 +113,12 @@ def check_device(device): check_device(device) def verify_dense_si(batch, in_dim, out_dim, use_bias=True, dtype='float32'): - nonzeros = tvm.var('nonzeros') + nonzeros = te.var('nonzeros') A = tvmsp.placeholder(shape=(batch, in_dim), nonzeros=nonzeros, dtype=dtype, name='A') - B = tvm.placeholder((out_dim, in_dim), dtype=dtype, name='B') - C = tvm.placeholder((out_dim,), dtype=dtype, name='C') + B = te.placeholder((out_dim, in_dim), dtype=dtype, name='B') + C = te.placeholder((out_dim,), dtype=dtype, name='C') D = topi.sparse.dense(A, B, C if use_bias else None) - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) # get the test data def get_ref_data(): @@ -149,12 +150,12 @@ def check_device(device): check_device('llvm') def verify_dense_sw(batch, in_dim, out_dim, use_bias=True, dtype='float32'): - nonzeros = tvm.var('nonzeros') - A = tvm.placeholder((batch, in_dim), dtype=dtype, name='A') + nonzeros = te.var('nonzeros') + A = te.placeholder((batch, in_dim), dtype=dtype, name='A') B = tvmsp.placeholder(shape=(out_dim, in_dim), nonzeros=nonzeros, dtype=dtype, name='B') - C = tvm.placeholder((out_dim,), dtype=dtype, name='C') + C = te.placeholder((out_dim,), dtype=dtype, name='C') D = topi.sparse.dense(A, B, C if use_bias else None) - s = tvm.create_schedule(D.op) + s = te.create_schedule(D.op) # get the test data def get_ref_data(): @@ -224,12 +225,12 @@ def test_sparse_dense_csr(): W_np = W_sp_np.todense() Y_np = X_np.dot(W_np.T) - W_data = tvm.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype)) - W_indices = tvm.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype)) - W_indptr = tvm.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype)) - X = tvm.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) + W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype)) + W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype)) + W_indptr = 
te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype)) + X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr) - s = tvm.create_schedule(Y.op) + s = te.create_schedule(Y.op) func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y]) Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype)) func(tvm.nd.array(X_np), tvm.nd.array(W_sp_np.data), tvm.nd.array(W_sp_np.indices), tvm.nd.array(W_sp_np.indptr), Y_tvm) @@ -243,12 +244,12 @@ def test_sparse_transpose_csr(): X_sp_T = X_sp.transpose() X_np_T = X_sp_T.todense() - X_data = tvm.placeholder(shape=X_sp.data.shape, dtype=str(X_sp.data.dtype)) - X_indices = tvm.placeholder(shape=X_sp.indices.shape, dtype=str(X_sp.indices.dtype)) - X_indptr = tvm.placeholder(shape=X_sp.indptr.shape, dtype=str(X_sp.indptr.dtype)) + X_data = te.placeholder(shape=X_sp.data.shape, dtype=str(X_sp.data.dtype)) + X_indices = te.placeholder(shape=X_sp.indices.shape, dtype=str(X_sp.indices.dtype)) + X_indptr = te.placeholder(shape=X_sp.indptr.shape, dtype=str(X_sp.indptr.dtype)) X_T_data, X_T_indices, X_T_indptr = topi.nn.sparse_transpose(X_data, X_indices, X_indptr) - s = tvm.create_schedule([X_T_data.op, X_T_indices.op, X_T_indptr.op]) + s = te.create_schedule([X_T_data.op, X_T_indices.op, X_T_indptr.op]) func = tvm.build(s, [X_data, X_indices, X_indptr, X_T_data, X_T_indices, X_T_indptr]) @@ -288,12 +289,12 @@ def test_sparse_dense_bsr(): W_np = W_sp_np.todense() Y_np = X_np.dot(W_np.T) - W_data = tvm.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype)) - W_indices = tvm.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype)) - W_indptr = tvm.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype)) - X = tvm.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) + W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype)) + W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype)) + W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype)) + X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr) - s = tvm.create_schedule(Y.op) + s = te.create_schedule(Y.op) func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y]) Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype)) func(tvm.nd.array(X_np), @@ -317,12 +318,12 @@ def test_sparse_dense_bsr_randomized(): W_np = W_sp_np.todense() Y_np = np.array(X_np.dot(W_np.T)) - W_data = tvm.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype)) - W_indices = tvm.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype)) - W_indptr = tvm.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype)) - X = tvm.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) + W_data = te.placeholder(shape=W_sp_np.data.shape, dtype=str(W_sp_np.data.dtype)) + W_indices = te.placeholder(shape=W_sp_np.indices.shape, dtype=str(W_sp_np.indices.dtype)) + W_indptr = te.placeholder(shape=W_sp_np.indptr.shape, dtype=str(W_sp_np.indptr.dtype)) + X = te.placeholder(shape=X_np.shape, dtype=str(X_np.dtype)) Y = topi.nn.sparse_dense(X, W_data, W_indices, W_indptr) - s = tvm.create_schedule(Y.op) + s = te.create_schedule(Y.op) func = tvm.build(s, [X, W_data, W_indices, W_indptr, Y]) Y_tvm = tvm.nd.array(np.zeros(Y_np.shape, dtype=Y_np.dtype)) func(tvm.nd.array(X_np), diff --git a/topi/tests/python/test_topi_tensor.py 
b/topi/tests/python/test_topi_tensor.py index 05098421c561..68ea7ab6d7d9 100644 --- a/topi/tests/python/test_topi_tensor.py +++ b/topi/tests/python/test_topi_tensor.py @@ -17,6 +17,7 @@ """Test code for tensor operator""" import numpy as np import tvm +from tvm import te import topi import topi.testing from tvm.contrib.pickle_memoize import memoize @@ -28,9 +29,9 @@ def verify_elemwise_sum(num_args, dtype): tvm_placeholders = [] for i in range(num_args): tvm_placeholders.append( - tvm.placeholder(shape, name="data"+str(i), dtype=dtype)) + te.placeholder(shape, name="data"+str(i), dtype=dtype)) esum = topi.elemwise_sum(tvm_placeholders) - s = tvm.create_schedule([esum.op]) + s = te.create_schedule([esum.op]) @memoize("topi.tests.test_topi_elemwise_sum") def get_ref_data(): @@ -57,11 +58,11 @@ def check_device(device): def verify_full(shape, dtype, fill_value): - A = tvm.placeholder(shape, dtype=dtype, name="A") + A = te.placeholder(shape, dtype=dtype, name="A") B = topi.full_like(A, fill_value=fill_value) C = topi.full(shape=shape, dtype=dtype, fill_value=fill_value) - s1 = tvm.create_schedule([B.op]) - s2 = tvm.create_schedule([C.op]) + s1 = te.create_schedule([B.op]) + s2 = te.create_schedule([C.op]) @memoize("topi.tests.test_topi_full") def get_ref_data(): @@ -96,9 +97,9 @@ def check_device(device): return with tvm.target.create(device): ctx = tvm.context(device, 0) - A = tvm.placeholder((n, m), name='A', dtype=dtype) - B = tvm.compute((n, m), lambda i, j: - A[i, j] + tvm.const(1, A.dtype), name='B') + A = te.placeholder((n, m), name='A', dtype=dtype) + B = te.compute((n, m), lambda i, j: + A[i, j] + tvm.tir.const(1, A.dtype), name='B') S = topi.testing.get_elemwise_schedule(device)(B) fun = tvm.build(S, [A, B], device) diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 880e86d205e7..097c87db3a97 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -17,6 +17,7 @@ """Test code for broadcasting operators.""" import numpy as np import tvm +from tvm import te import topi import topi.testing from tvm.contrib.nvcc import have_fp16 @@ -24,7 +25,7 @@ from common import get_all_backend def verify_expand_dims(in_shape, out_shape, axis, num_newaxis): - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = topi.expand_dims(A, axis, num_newaxis) def check_device(device): ctx = tvm.context(device, 0) @@ -47,7 +48,7 @@ def check_device(device): def verify_reinterpret(in_shape, in_dtype, out_dtype, generator): - A = tvm.placeholder(shape=in_shape, name="A", dtype=in_dtype) + A = te.placeholder(shape=in_shape, name="A", dtype=in_dtype) B = topi.reinterpret(A, out_dtype) def check_device(device): ctx = tvm.context(device, 0) @@ -73,7 +74,7 @@ def check_device(device): def verify_transpose(in_shape, axes): - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = topi.transpose(A, axes) def check_device(device): ctx = tvm.context(device, 0) @@ -96,7 +97,7 @@ def check_device(device): def verify_reshape(src_shape, dst_shape): - A = tvm.placeholder(shape=src_shape, name="A") + A = te.placeholder(shape=src_shape, name="A") B = topi.reshape(A, dst_shape) def check_device(device): ctx = tvm.context(device, 0) @@ -119,7 +120,7 @@ def check_device(device): def verify_squeeze(src_shape, axis): - A = tvm.placeholder(shape=src_shape, name="A") + A = te.placeholder(shape=src_shape, name="A") B = topi.squeeze(A, axis=axis) def 
check_device(device): ctx = tvm.context(device, 0) @@ -158,7 +159,7 @@ def get_concat_schedule(target): tensor_l = [] for i, shape in enumerate(shapes): - tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) + tensor_l.append(te.placeholder(shape, name="A" + str(i))) out_tensor = topi.concatenate(a_tuple=tensor_l, axis=axis) def check_device(device): ctx = tvm.context(device, 0) @@ -183,7 +184,7 @@ def check_device(device): def verify_stack(shapes, axis): tensor_l = [] for i, shape in enumerate(shapes): - tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) + tensor_l.append(te.placeholder(shape, name="A" + str(i))) out_tensor = topi.stack(tensor_l, axis) def check_device(device): ctx = tvm.context(device, 0) @@ -207,7 +208,7 @@ def check_device(device): def verify_split(src_shape, indices_or_sections, axis): - A = tvm.placeholder(shape=src_shape, name="A") + A = te.placeholder(shape=src_shape, name="A") tensor_l = topi.split(A, indices_or_sections, axis=axis) def check_device(device): ctx = tvm.context(device, 0) @@ -232,10 +233,10 @@ def check_device(device): def verify_expand_like(in_shape, out_shape, axis): - A = tvm.placeholder(shape=in_shape, name="A") - B = tvm.placeholder(shape=out_shape, name="B") + A = te.placeholder(shape=in_shape, name="A") + B = te.placeholder(shape=out_shape, name="B") C = topi.expand_like(A, B, axis) - s = tvm.create_schedule([C.op]) + s = te.create_schedule([C.op]) def check_device(device): if not tvm.runtime.enabled(device): @@ -266,7 +267,7 @@ def check_device(device): check_device(device) def verify_flip(in_shape, axis): - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = topi.flip(A, axis) + 1 def check_device(device): ctx = tvm.context(device, 0) @@ -292,8 +293,8 @@ def verify_take(src_shape, indices_src, axis=None, mode="clip"): src_dtype = "float32" indices_dtype = "int32" indices_src = np.array(indices_src, dtype=indices_dtype) - A = tvm.placeholder(shape=src_shape, dtype=src_dtype, name="A") - indices = tvm.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices") + A = te.placeholder(shape=src_shape, dtype=src_dtype, name="A") + indices = te.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices") if axis is None: out_tensor = topi.take(a=A, indices=indices, mode=mode) else: @@ -330,7 +331,7 @@ def check_device(device): check_device(device) def verify_strided_slice(in_shape, begin, end, strides=None): - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") strides = [1,1,1] if strides is None else strides B = topi.strided_slice(A, begin, end, strides) + 1 @@ -356,12 +357,12 @@ def check_device(device): check_device(device) def verify_strided_set(in_shape, v_shape, begin, end, strides=None): - A = tvm.placeholder(shape=in_shape, name="A") - V = tvm.placeholder(shape=v_shape, name="V") - b = tvm.placeholder(shape=(len(begin),), name="b", dtype='int32') - e = tvm.placeholder(shape=(len(end),), name="e", dtype='int32') + A = te.placeholder(shape=in_shape, name="A") + V = te.placeholder(shape=v_shape, name="V") + b = te.placeholder(shape=(len(begin),), name="b", dtype='int32') + e = te.placeholder(shape=(len(end),), name="e", dtype='int32') if strides is not None: - st = tvm.placeholder(shape=(len(strides),), name="st", dtype='int32') + st = te.placeholder(shape=(len(strides),), name="st", dtype='int32') B = topi.strided_set(A, V, b, e, st) + 1 else: B = topi.strided_set(A, V, b, e) + 1 @@ -404,8 +405,8 @@ def 
check_device(device): def verify_gather_nd(src_shape, indices_src, indices_dtype): src_dtype = "float32" indices_src = np.array(indices_src, dtype=indices_dtype) - A = tvm.placeholder(shape=src_shape, dtype=src_dtype, name="A") - indices = tvm.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices") + A = te.placeholder(shape=src_shape, dtype=src_dtype, name="A") + indices = te.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices") out_tensor = topi.gather_nd(a=A, indices=indices) def check_device(device): @@ -464,7 +465,7 @@ def check_device(device): check_device(device) def verify_repeat(in_shape, repeats, axis): - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = topi.repeat(A, repeats, axis) def check_device(device): ctx = tvm.context(device, 0) @@ -486,7 +487,7 @@ def check_device(device): check_device(device) def verify_tile(in_shape, reps): - A = tvm.placeholder(shape=in_shape, name="A") + A = te.placeholder(shape=in_shape, name="A") B = topi.tile(A, reps) def check_device(device): ctx = tvm.context(device, 0) @@ -508,10 +509,10 @@ def check_device(device): check_device(device) def verify_where(in_shape): - Cond = tvm.placeholder(shape=in_shape, name="cond") + Cond = te.placeholder(shape=in_shape, name="cond") dtype = Cond.dtype - A = tvm.placeholder(shape=in_shape, name="A") - B = tvm.placeholder(shape=in_shape, name="B") + A = te.placeholder(shape=in_shape, name="A") + B = te.placeholder(shape=in_shape, name="B") C = topi.where(Cond, A, B) def check_device(device): ctx = tvm.context(device, 0) @@ -537,9 +538,9 @@ def check_device(device): check_device(device) def verify_one_hot(indices_shape, depth, on_value, off_value, axis, dtype): - indices = tvm.placeholder(shape=indices_shape, name="indices", dtype="int32") - on_value_const = tvm.const(on_value, dtype) - off_value_const = tvm.const(off_value, dtype) + indices = te.placeholder(shape=indices_shape, name="indices", dtype="int32") + on_value_const = tvm.tir.const(on_value, dtype) + off_value_const = tvm.tir.const(off_value, dtype) one_hot_result = topi.transform.one_hot(indices, on_value_const, off_value_const, depth, axis, dtype) def check_device(device): ctx = tvm.context(device, 0) @@ -624,9 +625,9 @@ def test_squeeze(): verify_squeeze((1, 1, 1, 1), None) # a special case to trigger inline let expression - A = tvm.placeholder((2,), 'float32', 'A') + A = te.placeholder((2,), 'float32', 'A') E = topi.squeeze(A) - C = tvm.compute((1,), lambda i: E[(2 * A[0] - 1).astype('int32')]) + C = te.compute((1,), lambda i: E[(2 * A[0] - 1).astype('int32')]) for device in ['cuda', 'opencl']: ctx = tvm.context(device, 0) if ctx.exist: @@ -737,7 +738,7 @@ def test_tile(): def test_layout_transform(): in_shape = (1, 32, 8, 8) - A = tvm.placeholder(shape=in_shape, dtype="float32", name="A") + A = te.placeholder(shape=in_shape, dtype="float32", name="A") B = topi.layout_transform(A, "NCHW", "NCHW16c") input = np.random.uniform(size=in_shape).astype(A.dtype) @@ -766,7 +767,7 @@ def check_device(device): def test_shape(): in_shape = (8, 7, 13) dtype = "int32" - A = tvm.placeholder(shape=in_shape, dtype="float32", name="A") + A = te.placeholder(shape=in_shape, dtype="float32", name="A") B = topi.shape(A, dtype) input = np.random.uniform(size=in_shape).astype(A.dtype) @@ -796,8 +797,8 @@ def test_sequence_mask(): for mask_value in [0.0, 1.0]: max_length = in_shape[axis] batch_size = in_shape[1 - axis] - A = tvm.placeholder(shape=in_shape, dtype="float32", name="A") - B 
= tvm.placeholder(shape=(batch_size,), dtype="int32", name="B") + A = te.placeholder(shape=in_shape, dtype="float32", name="A") + B = te.placeholder(shape=(batch_size,), dtype="int32", name="B") C = topi.sequence_mask(A, B, axis=axis, mask_value=mask_value) A_data = np.random.normal(0, 1, in_shape).astype(np.float32) B_data = np.random.randint(1, max_length, (batch_size,)).astype(np.int32) @@ -823,7 +824,7 @@ def check_device(device): def test_ndarray_size(): in_shape = (5, 11, 7) dtype = "int32" - A = tvm.placeholder(shape=in_shape, dtype="float32", name="A") + A = te.placeholder(shape=in_shape, dtype="float32", name="A") B = topi.ndarray_size(A, dtype) input = np.random.uniform(size=in_shape).astype(A.dtype) @@ -857,13 +858,13 @@ def check_device(device): return print("Running on target: %s" % device) conv2d_compute, conv2d_schedule = topi.testing.get_conv2d_nchw_implement(device) - data = tvm.placeholder((2, 1, 2, 4), 'int8', 'data') - w = tvm.placeholder((3, 1, 2, 2), 'int8', 'w') + data = te.placeholder((2, 1, 2, 4), 'int8', 'data') + w = te.placeholder((3, 1, 2, 2), 'int8', 'w') conv1 = conv2d_compute(data, w, 1, 0, 1, 'int32') - zeros = topi.full((2, 3, 1, 3), 'int32', tvm.const(0, dtype='int32')) + zeros = topi.full((2, 3, 1, 3), 'int32', tvm.tir.const(0, dtype='int32')) gt = topi.greater_equal(conv1, zeros) - one = topi.full((2, 3, 1, 3), 'int32', tvm.const(1, dtype='int32')) - two = topi.full((2, 3, 1, 3), 'int32', tvm.const(2, dtype='int32')) + one = topi.full((2, 3, 1, 3), 'int32', tvm.tir.const(1, dtype='int32')) + two = topi.full((2, 3, 1, 3), 'int32', tvm.tir.const(2, dtype='int32')) where = topi.where(gt, one, two) add = topi.add(conv1, where) outs = [add] diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py index 003748719a0e..874471b830fd 100644 --- a/topi/tests/python/test_topi_upsampling.py +++ b/topi/tests/python/test_topi_upsampling.py @@ -17,6 +17,7 @@ """Test code for upsampling""" import numpy as np import tvm +from tvm import te import topi import topi.testing import math @@ -28,12 +29,12 @@ def verify_upsampling(batch, in_channel, in_height, in_width, scale_h, scale_w, layout='NCHW', method="nearest_neighbor", in_batch_block = 0, in_channel_block = 0): if layout == 'NCHW': - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') + A = te.placeholder((batch, in_channel, in_height, in_width), name='A') dtype = A.dtype out_shape = (batch, in_channel, int(round(in_height*scale_h)), int(round(in_width*scale_w))) a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype) elif nchw_pack_layout(layout): - A = tvm.placeholder((batch, in_channel, in_height, in_width, in_batch_block, in_channel_block), + A = te.placeholder((batch, in_channel, in_height, in_width, in_batch_block, in_channel_block), name='A') dtype = A.dtype out_shape = (batch, in_channel, int(round(in_height*scale_h)), int(round(in_width*scale_w)), @@ -41,7 +42,7 @@ def verify_upsampling(batch, in_channel, in_height, in_width, scale_h, scale_w, a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width, in_batch_block, in_channel_block)).astype(dtype) elif layout == 'NHWC': - A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A') + A = te.placeholder((batch, in_height, in_width, in_channel), name='A') dtype = A.dtype out_shape = (batch, int(round(in_height*scale_h)), int(round(in_width*scale_w)), in_channel) a_np = np.random.uniform(size=(batch, in_height, in_width, in_channel)).astype(dtype) 
@@ -115,13 +116,13 @@ def test_upsampling(): def verify_upsampling3d(batch, in_channel, in_depth, in_height, in_width, scale_d, scale_h, scale_w, layout='NCDHW', method="nearest_neighbor"): if layout == 'NCDHW': - A = tvm.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A') + A = te.placeholder((batch, in_channel, in_depth, in_height, in_width), name='A') dtype = A.dtype out_shape = (batch, in_channel, int(round(in_depth*scale_d)), int(round(in_height*scale_h)), int(round(in_width*scale_w))) a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height, in_width)).astype(dtype) elif layout == 'NDHWC': - A = tvm.placeholder((batch, in_depth, in_height, in_width, in_channel), name='A') + A = te.placeholder((batch, in_depth, in_height, in_width, in_channel), name='A') dtype = A.dtype out_shape = (batch, int(round(in_depth*scale_d)), int(round(in_height*scale_h)), int(round(in_width*scale_w)), in_channel) diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 7d27b8221a60..0aa410d7ea13 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -19,6 +19,7 @@ import math import numpy as np import tvm +from tvm import te import topi import topi.testing @@ -90,7 +91,7 @@ def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): fcompute, fschedule = topi.testing.dispatch(device, _get_valid_counts_implement) - data = tvm.placeholder(dshape, name="data", dtype=dtype) + data = te.placeholder(dshape, name="data", dtype=dtype) outs = fcompute(data, score_threshold, id_index, score_index) s = fschedule(outs) @@ -121,8 +122,8 @@ def verify_non_max_suppression(np_data, np_valid_count, np_result, np_indices_re dshape = np_data.shape batch, num_anchors, _ = dshape indices_dshape = (batch, num_anchors) - data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder((batch,), dtype="int32", name="valid_count") + data = te.placeholder(dshape, name="data") + valid_count = te.placeholder((batch,), dtype="int32", name="valid_count") def check_device(device): ctx = tvm.context(device, 0) @@ -182,7 +183,7 @@ def test_non_max_suppression(): def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): - data = tvm.placeholder(dshape, name="data") + data = te.placeholder(dshape, name="data") dtype = data.dtype input_data = np.random.uniform(size=dshape).astype(dtype) @@ -223,7 +224,7 @@ def check_device(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) - + fcompute, fschedule = topi.testing.dispatch(device, _multibox_prior_implement) with tvm.target.create(device): out = fcompute(data, sizes, ratios, steps, offsets, clip) @@ -249,9 +250,9 @@ def test_multibox_detection(): batch_size = 1 num_anchors = 3 num_classes = 3 - cls_prob = tvm.placeholder((batch_size, num_anchors, num_classes), name="cls_prob") - loc_preds = tvm.placeholder((batch_size, num_anchors * 4), name="loc_preds") - anchors = tvm.placeholder((1, num_anchors, 4), name="anchors") + cls_prob = te.placeholder((batch_size, num_anchors, num_classes), name="cls_prob") + loc_preds = te.placeholder((batch_size, num_anchors * 4), name="loc_preds") + anchors = te.placeholder((1, num_anchors, 4), name="anchors") # Manually create test case np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]) @@ -290,8 +291,8 @@ def verify_roi_align(batch, in_channel, in_size, num_roi, 
pooled_size, spatial_s a_shape = (batch, in_channel, in_size, in_size) rois_shape = (num_roi, 5) - a = tvm.placeholder(a_shape) - rois = tvm.placeholder(rois_shape) + a = te.placeholder(a_shape) + rois = te.placeholder(rois_shape) @memoize("topi.tests.test_topi_vision.verify_roi_align") def get_ref_data(): @@ -342,8 +343,8 @@ def verify_roi_pool(batch, in_channel, in_size, num_roi, pooled_size, spatial_sc a_shape = (batch, in_channel, in_size, in_size) rois_shape = (num_roi, 5) - a = tvm.placeholder(a_shape) - rois = tvm.placeholder(rois_shape) + a = te.placeholder(a_shape) + rois = te.placeholder(rois_shape) @memoize("topi.tests.test_topi_vision.verify_roi_pool") def get_ref_data(): @@ -387,9 +388,9 @@ def test_roi_pool(): def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): - cls_prob = tvm.placeholder(np_cls_prob.shape) - bbox_pred = tvm.placeholder(np_bbox_pred.shape) - im_info = tvm.placeholder(np_im_info.shape) + cls_prob = te.placeholder(np_cls_prob.shape) + bbox_pred = te.placeholder(np_bbox_pred.shape) + im_info = te.placeholder(np_im_info.shape) def check_device(device): ctx = tvm.context(device, 0) diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 0e26dcb97412..260cf5a4bb08 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -49,6 +49,7 @@ import numpy as np import tvm +from tvm import te import topi from topi.testing import conv2d_nchw_python @@ -82,10 +83,10 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): assert N == 1, "Only consider batch_size = 1 in this template" - data = tvm.placeholder((N, CI, H, W), name='data') - kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel') + data = te.placeholder((N, CI, H, W), name='data') + kernel = te.placeholder((CO, CI, KH, KW), name='kernel') conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32') - s = tvm.create_schedule([conv.op]) + s = te.create_schedule([conv.op]) ##### space definition begin ##### n, f, y, x = s[conv].op.axis @@ -123,15 +124,15 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) kernel_scope = n # this is the scope to attach global config inside this kernel - s[output].bind(bf, tvm.thread_axis("blockIdx.z")) - s[output].bind(by, tvm.thread_axis("blockIdx.y")) - s[output].bind(bx, tvm.thread_axis("blockIdx.x")) - s[output].bind(vf, tvm.thread_axis("vthread")) - s[output].bind(vy, tvm.thread_axis("vthread")) - s[output].bind(vx, tvm.thread_axis("vthread")) - s[output].bind(tf, tvm.thread_axis("threadIdx.z")) - s[output].bind(ty, tvm.thread_axis("threadIdx.y")) - s[output].bind(tx, tvm.thread_axis("threadIdx.x")) + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) s[OL].compute_at(s[output], tx) @@ -155,9 +156,9 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) tx, fused = 
s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, tvm.thread_axis("threadIdx.z")) - s[load].bind(ty, tvm.thread_axis("threadIdx.y")) - s[load].bind(tx, tvm.thread_axis("threadIdx.x")) + s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) # tune unroll s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) diff --git a/tutorials/autotvm/tune_relay_arm.py b/tutorials/autotvm/tune_relay_arm.py index ea24b1685788..ffd3e8b9b5cb 100644 --- a/tutorials/autotvm/tune_relay_arm.py +++ b/tutorials/autotvm/tune_relay_arm.py @@ -62,6 +62,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm import relay import tvm.relay.testing diff --git a/tutorials/autotvm/tune_relay_cuda.py b/tutorials/autotvm/tune_relay_cuda.py index 58c8751b73b9..4195075ca66d 100644 --- a/tutorials/autotvm/tune_relay_cuda.py +++ b/tutorials/autotvm/tune_relay_cuda.py @@ -60,6 +60,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm import relay import tvm.relay.testing diff --git a/tutorials/autotvm/tune_relay_mobile_gpu.py b/tutorials/autotvm/tune_relay_mobile_gpu.py index 5425f1b15715..ad7460829329 100644 --- a/tutorials/autotvm/tune_relay_mobile_gpu.py +++ b/tutorials/autotvm/tune_relay_mobile_gpu.py @@ -61,6 +61,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm import relay import tvm.relay.testing diff --git a/tutorials/autotvm/tune_relay_x86.py b/tutorials/autotvm/tune_relay_x86.py index f44773e544a7..15ce2de4b82f 100644 --- a/tutorials/autotvm/tune_relay_x86.py +++ b/tutorials/autotvm/tune_relay_x86.py @@ -28,6 +28,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm import relay from tvm.relay import testing diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index 8efeed487b43..dd3b9dce3d7a 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -55,6 +55,7 @@ import numpy as np import tvm +from tvm import te # the module is called `autotvm` from tvm import autotvm @@ -70,12 +71,12 @@ # Matmul V0: Constant tiling factor def matmul_v0(N, L, M, dtype): - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) - k = tvm.reduce_axis((0, L), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) + k = te.reduce_axis((0, L), name='k') + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') + s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis @@ -104,12 +105,12 @@ def matmul_v0(N, L, M, dtype): # Matmul V1: List candidate values @autotvm.register_customized_task("tutorial/matmul_v1") # 1. 
use a decorator def matmul_v1(N, L, M, dtype): - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) - k = tvm.reduce_axis((0, L), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) + k = te.reduce_axis((0, L), name='k') + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') + s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis @@ -184,12 +185,12 @@ def matmul_v1(N, L, M, dtype): @autotvm.register_customized_task("tutorial/matmul") def matmul(N, L, M, dtype): - A = tvm.placeholder((N, L), name='A', dtype=dtype) - B = tvm.placeholder((L, M), name='B', dtype=dtype) + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) - k = tvm.reduce_axis((0, L), name='k') - C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C') - s = tvm.create_schedule(C.op) + k = te.reduce_axis((0, L), name='k') + C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name='C') + s = te.create_schedule(C.op) # schedule y, x = s[C].op.axis diff --git a/tutorials/cross_compilation_and_rpc.py b/tutorials/cross_compilation_and_rpc.py index 2ffcb11989b8..553d77dd2023 100644 --- a/tutorials/cross_compilation_and_rpc.py +++ b/tutorials/cross_compilation_and_rpc.py @@ -96,13 +96,14 @@ import numpy as np import tvm +from tvm import te from tvm import rpc from tvm.contrib import util -n = tvm.convert(1024) -A = tvm.placeholder((n,), name='A') -B = tvm.compute((n,), lambda i: A[i] + 1.0, name='B') -s = tvm.create_schedule(B.op) +n = tvm.runtime.convert(1024) +A = te.placeholder((n,), name='A') +B = te.compute((n,), lambda i: A[i] + 1.0, name='B') +s = te.create_schedule(B.op) ###################################################################### # Then we cross compile the kernel. @@ -228,10 +229,10 @@ def run_opencl(): opencl_device_port = 9090 # create schedule for the above "add one" compute declaration - s = tvm.create_schedule(B.op) + s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, te.thread_axis("blockIdx.x")) + s[B].bind(xi, te.thread_axis("threadIdx.x")) func = tvm.build(s, [A, B], "opencl", target_host=target_host) remote = rpc.connect(opencl_device_host, opencl_device_port) diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py index 97c4a1f49a9a..298b24f6d046 100644 --- a/tutorials/dev/low_level_custom_pass.py +++ b/tutorials/dev/low_level_custom_pass.py @@ -43,6 +43,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### @@ -50,12 +51,12 @@ # our customized lowering pass to manipulate the IR directly instead of using schedule primitives. 
# -n = tvm.const(128, "int32") -a = tvm.placeholder((n, ), name="a") -b = tvm.placeholder((n, ), name="b") -c = tvm.compute((n, ), lambda i: a[i] + b[i], name='c') +n = tvm.tir.const(128, "int32") +a = te.placeholder((n, ), name="a") +b = te.placeholder((n, ), name="b") +c = te.compute((n, ), lambda i: a[i] + b[i], name='c') -sch = tvm.create_schedule(c.op) +sch = te.create_schedule(c.op) ir = tvm.lower(sch, [a, b, c], simple_mode=True) print(ir) @@ -71,7 +72,7 @@ # # IR Visitor # ~~~~~~~~~~ -# We can use ``tvm.ir_pass.PostOrderVisit(stmt, func)`` to gather information from the Halide IR. +# We can use ``tvm.tir.ir_pass.PostOrderVisit(stmt, func)`` to gather information from the Halide IR. # ``func`` is a function callback. This function will be called before exiting the current IR node, # i.e. post-order visit. Then we leverage side effects to store the result of IR visit, because the # return value of ``func`` will be ignored. @@ -111,8 +112,8 @@ def vectorize8(op): if op in loops: extent = op.extent.value name = op.loop_var.name - lo, li = tvm.var(name + '.outer'), tvm.var(name + '.inner') - body = tvm.ir_pass.Substitute(op.body, {op.loop_var: lo * 8 + li}) + lo, li = te.var(name + '.outer'), te.var(name + '.inner') + body = tvm.tir.ir_pass.Substitute(op.body, {op.loop_var: lo * 8 + li}) body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body) body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body) return body @@ -121,14 +122,14 @@ def vectorize8(op): def vectorize(stmt): global loops - tvm.ir_pass.PostOrderVisit(stmt, find_width8) + tvm.tir.ir_pass.PostOrderVisit(stmt, find_width8) if not loops: return stmt # The last list arugment indicates what kinds of nodes will be transformed. # Thus, in this case only `For` nodes will call `vectorize8` - stmt = tvm.ir_pass.IRTransform(stmt, None, vectorize8, ['For']) + stmt = tvm.tir.ir_pass.IRTransform(stmt, None, vectorize8, ['For']) return stmt @@ -158,15 +159,15 @@ def vectorize(stmt): # Thus, a good place to put this transformation pass is just after Phase 1. # -with tvm.build_config(add_lower_pass=[(1, vectorize)]) as cfg: +with tvm.target.build_config(add_lower_pass=[(1, vectorize)]) as cfg: print(tvm.lower(sch, [a, b, c], simple_mode=True)) ##################################################################### # Quick View # ---------- # This tutorial gives a quick view of writing a customized IR transformation pass: -# - Use ``tvm.ir_pass.PostOrderVisit`` to gather information on each IR nodes. -# - Use ``tvm.ir_pass.IRTransform`` to transform IR nodes. +# - Use ``tvm.tir.ir_pass.PostOrderVisit`` to gather information on each IR node. +# - Use ``tvm.tir.ir_pass.IRTransform`` to transform IR nodes. # - Wrap up two above to write an IR-transformation function. 
-# - Use ``tvm.build_config`` to put this function to TVM lowering pass +# - Use ``tvm.target.build_config`` to put this function into the TVM lowering pass # diff --git a/tutorials/dev/relay_pass_infra.py b/tutorials/dev/relay_pass_infra.py index 494593eeb5a1..7f818cfa3068 100644 --- a/tutorials/dev/relay_pass_infra.py +++ b/tutorials/dev/relay_pass_infra.py @@ -49,6 +49,7 @@ import numpy as np import tvm +from tvm import te import tvm.relay as relay ############################################################################### diff --git a/tutorials/frontend/build_gcn.py b/tutorials/frontend/build_gcn.py index d385dc9e72ab..e0d0aa074317 100644 --- a/tutorials/frontend/build_gcn.py +++ b/tutorials/frontend/build_gcn.py @@ -186,6 +186,7 @@ def evaluate(data, logits): from tvm import relay from tvm.contrib import graph_runtime import tvm +from tvm import te def GraphConv(layer_name, input_dim, diff --git a/tutorials/frontend/deploy_model_on_android.py b/tutorials/frontend/deploy_model_on_android.py index 3d0e83d5e450..f516004181c5 100644 --- a/tutorials/frontend/deploy_model_on_android.py +++ b/tutorials/frontend/deploy_model_on_android.py @@ -31,6 +31,7 @@ import keras from keras.applications.mobilenet_v2 import MobileNetV2 import tvm +from tvm import te import tvm.relay as relay from tvm import rpc from tvm.contrib import util, ndk, graph_runtime as runtime diff --git a/tutorials/frontend/deploy_model_on_rasp.py b/tutorials/frontend/deploy_model_on_rasp.py index e78c73659b98..ef707feedd2f 100644 --- a/tutorials/frontend/deploy_model_on_rasp.py +++ b/tutorials/frontend/deploy_model_on_rasp.py @@ -27,6 +27,7 @@ """ import tvm +from tvm import te import tvm.relay as relay from tvm import rpc from tvm.contrib import util, graph_runtime as runtime diff --git a/tutorials/frontend/deploy_quantized.py b/tutorials/frontend/deploy_quantized.py index 0e09ba9c9307..5af9fc950bc2 100644 --- a/tutorials/frontend/deploy_quantized.py +++ b/tutorials/frontend/deploy_quantized.py @@ -28,6 +28,7 @@ """ import tvm +from tvm import te from tvm import relay import mxnet as mx from tvm.contrib.download import download_testdata diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index 78bb0cafcfe8..6126df0e73ab 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -24,6 +24,7 @@ We will use GluonCV pre-trained SSD model and convert it to Relay IR """ import tvm +from tvm import te from matplotlib import pyplot as plt from tvm.relay.testing.config import ctx_list @@ -47,7 +48,7 @@ # # To get best inference performance on Intel graphics, # change target argument to :code:`opencl -device=intel_graphics`. -# But when using Intel graphics on Mac, target needs to +# But when using Intel graphics on Mac, target needs to # be set to `opencl` only for the reason that Intel subgroup # extension is not supported on Mac. # diff --git a/tutorials/frontend/from_caffe2.py b/tutorials/frontend/from_caffe2.py index aadee02f4b02..8fad80df1d1e 100644 --- a/tutorials/frontend/from_caffe2.py +++ b/tutorials/frontend/from_caffe2.py @@ -96,6 +96,7 @@ def transform_image(image): # --------------- # The process is no different from other examples. 
import tvm +from tvm import te from tvm.contrib import graph_runtime # context x86 CPU, use tvm.gpu(0) if you run on GPU ctx = tvm.cpu(0) diff --git a/tutorials/frontend/from_coreml.py b/tutorials/frontend/from_coreml.py index 2f70353501b8..2a0c8dbc93f2 100644 --- a/tutorials/frontend/from_coreml.py +++ b/tutorials/frontend/from_coreml.py @@ -35,6 +35,7 @@ https://github.com/apple/coremltools """ import tvm +from tvm import te import tvm.relay as relay from tvm.contrib.download import download_testdata import coremltools as cm diff --git a/tutorials/frontend/from_darknet.py b/tutorials/frontend/from_darknet.py index e90c8bb63b32..e2c1ea5aacbf 100644 --- a/tutorials/frontend/from_darknet.py +++ b/tutorials/frontend/from_darknet.py @@ -38,6 +38,7 @@ # tvm, relay import tvm +from tvm import te from tvm import relay from ctypes import * from tvm.contrib.download import download_testdata diff --git a/tutorials/frontend/from_keras.py b/tutorials/frontend/from_keras.py index c1f3471bb644..928a8acbefa7 100644 --- a/tutorials/frontend/from_keras.py +++ b/tutorials/frontend/from_keras.py @@ -35,6 +35,7 @@ https://keras.io/#installation """ import tvm +from tvm import te import tvm.relay as relay from tvm.contrib.download import download_testdata import keras diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py index d0e4c4ab0d18..bf53db532e1d 100644 --- a/tutorials/frontend/from_mxnet.py +++ b/tutorials/frontend/from_mxnet.py @@ -38,6 +38,7 @@ # some standard imports import mxnet as mx import tvm +from tvm import te import tvm.relay as relay import numpy as np diff --git a/tutorials/frontend/from_onnx.py b/tutorials/frontend/from_onnx.py index 7a615930a905..766451c2f8b1 100644 --- a/tutorials/frontend/from_onnx.py +++ b/tutorials/frontend/from_onnx.py @@ -35,6 +35,7 @@ import onnx import numpy as np import tvm +from tvm import te import tvm.relay as relay from tvm.contrib.download import download_testdata diff --git a/tutorials/frontend/from_tensorflow.py b/tutorials/frontend/from_tensorflow.py index 55eb3d014191..0ebd733ef9aa 100644 --- a/tutorials/frontend/from_tensorflow.py +++ b/tutorials/frontend/from_tensorflow.py @@ -26,6 +26,7 @@ # tvm, relay import tvm +from tvm import te from tvm import relay # os and numpy diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py index e93a71ce4a78..50fd69f3f92b 100644 --- a/tutorials/frontend/from_tflite.py +++ b/tutorials/frontend/from_tflite.py @@ -151,6 +151,7 @@ def extract(path): # Execute on TVM # -------------- import tvm +from tvm import te from tvm.contrib import graph_runtime as runtime # Create a runtime executor module diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py index 71acedaf181b..7063c0e27fea 100644 --- a/tutorials/frontend/using_external_lib.py +++ b/tutorials/frontend/using_external_lib.py @@ -32,6 +32,7 @@ To begin with, we import Relay and TVM. 
""" import tvm +from tvm import te import numpy as np from tvm.contrib import graph_runtime as runtime from tvm import relay diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py index 2ad3e3063415..64e9880dc20b 100644 --- a/tutorials/language/extern_op.py +++ b/tutorials/language/extern_op.py @@ -32,13 +32,14 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np from tvm.contrib import cblas ###################################################################### # Use Extern Tensor Function # -------------------------- -# In the example below, we use :any:`tvm.extern` to add an extern +# In the example below, we use :any:`te.extern` to add an extern # array function call. In the extern call, we declare the shape # of output tensors. In the second argument we provide the list of inputs. # @@ -53,15 +54,15 @@ n = 1024 l = 128 m = 235 -bias = tvm.var('bias', dtype=tvm.float32) -A = tvm.placeholder((n, l), name='A') -B = tvm.placeholder((l, m), name='B') -C = tvm.extern((n, m), [A, B], - lambda ins, outs: tvm.call_packed( +bias = te.var('bias', dtype="float32") +A = te.placeholder((n, l), name='A') +B = te.placeholder((l, m), name='B') +C = te.extern((n, m), [A, B], + lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False), name="C") -D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") -s = tvm.create_schedule(D.op) +D = te.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") +s = te.create_schedule(D.op) ###################################################################### # Verify the Result @@ -86,8 +87,8 @@ # from tvm.contrib import cblas C = cblas.matmul(A, B) -D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") -s = tvm.create_schedule(D.op) +D = te.compute(C.shape, lambda i, j: C[i,j] + bias, name="D") +s = te.create_schedule(D.op) ###################################################################### # Hook Python Function as Extern @@ -106,10 +107,10 @@ def my_tvm_addone(x, y): print("my_tvm_addone signatures: %s, %s" % (type(x), type(y))) tvm.nd.array(x.asnumpy() + 1).copyto(y) -A = tvm.placeholder((n,), name='A') -B = tvm.extern(A.shape, [A], lambda ins, outs: tvm.call_packed( +A = te.placeholder((n,), name='A') +B = te.extern(A.shape, [A], lambda ins, outs: tvm.tir.call_packed( "tvm.contrib.my_tvm_addone", ins[0], outs[0]), name="C") -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) f = tvm.build(s, [A, B], "llvm") a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx) @@ -119,7 +120,7 @@ def my_tvm_addone(x, y): ###################################################################### # Summary # ------- -# - TVM calls extern tensor function via :any:`tvm.extern` +# - TVM calls extern tensor function via :any:`te.extern` # - Use contrib wrappers for short sugars of extern tensor calls. # - We can hook front-end function as extern tensor callbacks. 
# diff --git a/tutorials/language/intrin_math.py b/tutorials/language/intrin_math.py index 59bf79d13092..eebab3f6c3c3 100644 --- a/tutorials/language/intrin_math.py +++ b/tutorials/language/intrin_math.py @@ -31,6 +31,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### @@ -38,19 +39,19 @@ # ------------------------------- # The most straight-forward way to call target specific function is via # extern function call construct in tvm. -# In the following example, we use :any:`tvm.call_pure_extern` to call +# In the following example, we use :any:`tvm.tir.call_pure_extern` to call # :code:`__expf` function, which is only available under CUDA. # -n = tvm.var("n") -A = tvm.placeholder((n,), name='A') -B = tvm.compute(A.shape, - lambda i: tvm.call_pure_extern("float32", "__expf", A[i]), +n = te.var("n") +A = te.placeholder((n,), name='A') +B = te.compute(A.shape, + lambda i: tvm.tir.call_pure_extern("float32", "__expf", A[i]), name="B") -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) -s[B].bind(bx, tvm.thread_axis("blockIdx.x")) -s[B].bind(tx, tvm.thread_axis("threadIdx.x")) +s[B].bind(bx, te.thread_axis("blockIdx.x")) +s[B].bind(tx, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B], "cuda", name="myexp") print(f.imported_modules[0].get_source()) @@ -64,22 +65,22 @@ # # TVM intrinsic provides the user a mechanism to achieve this, and this # is the recommended way to solve the problem. -# The following code use tvm.exp instead, which create an intrinsic call -# :any:`tvm.exp` to do the exponential. +# The following code uses te.exp instead, which creates an intrinsic call +# :any:`te.exp` to do the exponential. # -n = tvm.var("n") -A = tvm.placeholder((n,), name='A') -B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B") -s = tvm.create_schedule(B.op) +n = te.var("n") +A = te.placeholder((n,), name='A') +B = te.compute(A.shape, lambda i: te.exp(A[i]), name="B") +s = te.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) -s[B].bind(bx, tvm.thread_axis("blockIdx.x")) -s[B].bind(tx, tvm.thread_axis("threadIdx.x")) +s[B].bind(bx, te.thread_axis("blockIdx.x")) +s[B].bind(tx, te.thread_axis("threadIdx.x")) fcuda = tvm.build(s, [A, B], "cuda", name="myexp") print(fcuda.imported_modules[0].get_source()) ###################################################################### # We can find that the code works for both CUDA and opencl. -# The same tvm.exp can also be used for float64 data types. +# The same te.exp can also be used for float64 data types. # fopencl = tvm.build(s, [A, B], "opencl", name="myexp") print(fopencl.imported_modules[0].get_source()) @@ -87,7 +88,7 @@ ###################################################################### # Intrinsic Lowering Rule # ----------------------- -# When :any:`tvm.exp` is called, TVM creates an intrinsic Call Expr. +# When :any:`te.exp` is called, TVM creates an intrinsic Call Expr. # TVM uses transformation rules to transform the intrinsic # call to device specific extern calls. 
# @@ -101,10 +102,10 @@ def my_cuda_math_rule(op): assert isinstance(op, tvm.tir.Call) if op.dtype == "float32": # call float function - return tvm.call_pure_extern("float32", "%sf" % op.name, op.args[0]) + return tvm.tir.call_pure_extern("float32", "%sf" % op.name, op.args[0]) elif op.dtype == "float64": # call double function - return tvm.call_pure_extern("float32", op.name, op.args[0]) + return tvm.tir.call_pure_extern("float64", op.name, op.args[0]) else: # cannot do translation, return self. return op @@ -131,29 +132,29 @@ def my_cuda_math_rule(op): def mylog(x): """customized log intrinsic function""" - return tvm.call_pure_intrin(x.dtype, "mylog", x) + return tvm.tir.call_pure_intrin(x.dtype, "mylog", x) def my_cuda_mylog_rule(op): """CUDA lowering rule for log""" if op.dtype == "float32": - return tvm.call_pure_extern("float32", "logf", op.args[0]) + return tvm.tir.call_pure_extern("float32", "logf", op.args[0]) elif op.dtype == "float64": - return tvm.call_pure_extern("float64", "log", op.args[0]) + return tvm.tir.call_pure_extern("float64", "log", op.args[0]) else: return op tvm.target.register_intrin_rule("cuda", "mylog", my_cuda_mylog_rule, override=True) -n = tvm.var("n") -A = tvm.placeholder((n,), name='A') -B = tvm.compute(A.shape, lambda i: mylog(A[i]), name="B") -s = tvm.create_schedule(B.op) +n = te.var("n") +A = te.placeholder((n,), name='A') +B = te.compute(A.shape, lambda i: mylog(A[i]), name="B") +s = te.create_schedule(B.op) num_thread = 64 bx, tx = s[B].split(B.op.axis[0], factor=num_thread) -s[B].bind(bx, tvm.thread_axis("blockIdx.x")) -s[B].bind(tx, tvm.thread_axis("threadIdx.x")) +s[B].bind(bx, te.thread_axis("blockIdx.x")) +s[B].bind(tx, te.thread_axis("threadIdx.x")) fcuda = tvm.build(s, [A, B], "cuda", name="mylog") print(fcuda.imported_modules[0].get_source()) @@ -162,6 +163,6 @@ def my_cuda_mylog_rule(op): # ------- # - TVM can call extern target dependent math function. # - Use intrinsic to defined a unified interface for the functions. -# - For more intrinsics available in tvm, take a look at :any:`tvm.intrin` +# - For more intrinsics available in tvm, take a look at :any:`tvm.tir` # - You can customize the intrinsic behavior by defining your own rules. # diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py index 0b631cb077f4..cdfc94ef096f 100644 --- a/tutorials/language/reduction.py +++ b/tutorials/language/reduction.py @@ -28,6 +28,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### @@ -38,8 +39,8 @@ # # The following lines describe the row sum operation. # To create a reduction formula, we declare a reduction axis using -# :any:`tvm.reduce_axis`. :any:`tvm.reduce_axis` takes in the range of reductions. -# :any:`tvm.sum` takes in the expression to be reduced as well as the reduction +# :any:`te.reduce_axis`. :any:`te.reduce_axis` takes in the range of reductions. +# :any:`te.sum` takes in the expression to be reduced as well as the reduction # axis and compute the sum of value over all k in the declared range. 
# # The equivalent C code is as follows: @@ -53,11 +54,11 @@ # } # } # -n = tvm.var("n") -m = tvm.var("m") -A = tvm.placeholder((n, m), name='A') -k = tvm.reduce_axis((0, m), "k") -B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B") +n = te.var("n") +m = te.var("m") +A = te.placeholder((n, m), name='A') +k = te.reduce_axis((0, m), "k") +B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") ###################################################################### # Schedule the Reduction @@ -65,7 +66,7 @@ # There are several ways to schedule a reduction. # Before doing anything, let us print out the IR code of default schedule. # -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) print(tvm.lower(s, [A, B], simple_mode=True)) ###################################################################### @@ -81,8 +82,8 @@ ###################################################################### # If we are building a GPU kernel, we can bind the rows of B to GPU threads. -s[B].bind(xo, tvm.thread_axis("blockIdx.x")) -s[B].bind(xi, tvm.thread_axis("threadIdx.x")) +s[B].bind(xo, te.thread_axis("blockIdx.x")) +s[B].bind(xi, te.thread_axis("threadIdx.x")) print(tvm.lower(s, [A, B], simple_mode=True)) ###################################################################### @@ -97,7 +98,7 @@ # In the following schedule, the result of B is written to a temporary # result B.rf. The factored dimension becomes the first dimension of B.rf. # -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) BF = s.rfactor(B, ki) print(tvm.lower(s, [A, B], simple_mode=True)) @@ -122,9 +123,9 @@ # columns by threadIdx.x and finally do a cross thread reduction over threadIdx.x # xo, xi = s[B].split(s[B].op.axis[0], factor=32) -s[B].bind(xo, tvm.thread_axis("blockIdx.x")) -s[B].bind(xi, tvm.thread_axis("threadIdx.y")) -tx = tvm.thread_axis("threadIdx.x") +s[B].bind(xo, te.thread_axis("blockIdx.x")) +s[B].bind(xi, te.thread_axis("threadIdx.y")) +tx = te.thread_axis("threadIdx.x") s[B].bind(s[B].op.reduce_axis[0], tx) s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) s[B].set_store_predicate(tx.var.equal(0)) @@ -148,16 +149,16 @@ # In TVM, we can describe convolution via 2D reduction in a simple way. # Here is an example for 2D convolution with filter size = [3, 3] and strides = [1, 1]. # -n = tvm.var('n') -Input = tvm.placeholder((n, n), name='Input') -Filter = tvm.placeholder((3, 3), name='Filter') -di = tvm.reduce_axis((0, 3), name='di') -dj = tvm.reduce_axis((0, 3), name='dj') -Output = tvm.compute( +n = te.var('n') +Input = te.placeholder((n, n), name='Input') +Filter = te.placeholder((3, 3), name='Filter') +di = te.reduce_axis((0, 3), name='di') +dj = te.reduce_axis((0, 3), name='dj') +Output = te.compute( (n - 2, n - 2), - lambda i, j: tvm.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]), + lambda i, j: te.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]), name='Output') -s = tvm.create_schedule(Output.op) +s = te.create_schedule(Output.op) print(tvm.lower(s, [Input, Filter, Output], simple_mode=True)) ###################################################################### @@ -165,18 +166,18 @@ # # Define General Commutative Reduction Operation # ---------------------------------------------- -# Besides the built-in reduction operations like :any:`tvm.sum`, -# :any:`tvm.min` and :any:`tvm.max`, you can also define your -# commutative reduction operation by :any:`tvm.comm_reducer`. 
+# Besides the built-in reduction operations like :any:`te.sum`, +# :any:`tvm.te.min` and :any:`tvm.te.max`, you can also define your +# commutative reduction operation by :any:`te.comm_reducer`. # -n = tvm.var('n') -m = tvm.var('m') -product = tvm.comm_reducer(lambda x, y: x*y, - lambda t: tvm.const(1, dtype=t), name="product") -A = tvm.placeholder((n, m), name='A') -k = tvm.reduce_axis((0, m), name='k') -B = tvm.compute((n,), lambda i: product(A[i, k], axis=k), name='B') +n = te.var('n') +m = te.var('m') +product = te.comm_reducer(lambda x, y: x*y, + lambda t: tvm.tir.const(1, dtype=t), name="product") +A = te.placeholder((n, m), name='A') +k = te.reduce_axis((0, m), name='k') +B = te.compute((n,), lambda i: product(A[i, k], axis=k), name='B') ###################################################################### # .. note:: @@ -192,4 +193,4 @@ # # - Describe reduction with reduce_axis. # - Use rfactor to factor out axis if we need parallelism. -# - Define new reduction operation by :any:`tvm.comm_reducer` +# - Define new reduction operation by :any:`te.comm_reducer` diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py index 2fa9c210ead2..73790da27500 100644 --- a/tutorials/language/scan.py +++ b/tutorials/language/scan.py @@ -25,6 +25,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### @@ -46,13 +47,13 @@ # The result of the scan is a tensor, giving the result of :code:`s_state` after the # update over the time domain. # -m = tvm.var("m") -n = tvm.var("n") -X = tvm.placeholder((m, n), name="X") -s_state = tvm.placeholder((m, n)) -s_init = tvm.compute((1, n), lambda _, i: X[0, i]) -s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) -s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X]) +m = te.var("m") +n = te.var("n") +X = te.placeholder((m, n), name="X") +s_state = te.placeholder((m, n)) +s_init = te.compute((1, n), lambda _, i: X[0, i]) +s_update = te.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i]) +s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X]) ###################################################################### # Schedule the Scan Cell @@ -62,10 +63,10 @@ # first iteration dimension of the update part. # To split on the time iteration, user can schedule on scan_op.scan_axis instead. # -s = tvm.create_schedule(s_scan.op) +s = te.create_schedule(s_scan.op) num_thread = 256 -block_x = tvm.thread_axis("blockIdx.x") -thread_x = tvm.thread_axis("threadIdx.x") +block_x = te.thread_axis("blockIdx.x") +thread_x = te.thread_axis("threadIdx.x") xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread) s[s_init].bind(xo, block_x) s[s_init].bind(xi, thread_x) @@ -100,21 +101,21 @@ # The following lines demonstrate a scan with two stage operations # in the scan cell. 
# -m = tvm.var("m") -n = tvm.var("n") -X = tvm.placeholder((m, n), name="X") -s_state = tvm.placeholder((m, n)) -s_init = tvm.compute((1, n), lambda _, i: X[0, i]) -s_update_s1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name="s1") -s_update_s2 = tvm.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2") -s_scan = tvm.scan(s_init, s_update_s2, s_state, inputs=[X]) +m = te.var("m") +n = te.var("n") +X = te.placeholder((m, n), name="X") +s_state = te.placeholder((m, n)) +s_init = te.compute((1, n), lambda _, i: X[0, i]) +s_update_s1 = te.compute((m, n), lambda t, i: s_state[t-1, i] * 2, name="s1") +s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2") +s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X]) ###################################################################### # These intermediate tensors can also be scheduled normally. # To ensure correctness, TVM creates a group constraint to forbid # the body of scan to be compute_at locations outside the scan loop. # -s = tvm.create_schedule(s_scan.op) +s = te.create_schedule(s_scan.op) xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32) s[s_update_s1].compute_at(s[s_update_s2], xo) print(tvm.lower(s, [X, s_scan], simple_mode=True)) @@ -126,20 +127,20 @@ # recurrent state. Scan support multiple recurrent states. # The following example demonstrates how we can build recurrence with two states. # -m = tvm.var("m") -n = tvm.var("n") -l = tvm.var("l") -X = tvm.placeholder((m, n), name="X") -s_state1 = tvm.placeholder((m, n)) -s_state2 = tvm.placeholder((m, l)) -s_init1 = tvm.compute((1, n), lambda _, i: X[0, i]) -s_init2 = tvm.compute((1, l), lambda _, i: 0.0) -s_update1 = tvm.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i]) -s_update2 = tvm.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0]) -s_scan1, s_scan2 = tvm.scan([s_init1, s_init2], +m = te.var("m") +n = te.var("n") +l = te.var("l") +X = te.placeholder((m, n), name="X") +s_state1 = te.placeholder((m, n)) +s_state2 = te.placeholder((m, l)) +s_init1 = te.compute((1, n), lambda _, i: X[0, i]) +s_init2 = te.compute((1, l), lambda _, i: 0.0) +s_update1 = te.compute((m, n), lambda t, i: s_state1[t-1, i] + X[t, i]) +s_update2 = te.compute((m, l), lambda t, i: s_state2[t-1, i] + s_state1[t-1, 0]) +s_scan1, s_scan2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2], inputs=[X]) -s = tvm.create_schedule(s_scan1.op) +s = te.create_schedule(s_scan1.op) print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True)) ###################################################################### diff --git a/tutorials/language/schedule_primitives.py b/tutorials/language/schedule_primitives.py index e59264f29898..61bfcad1f3a9 100644 --- a/tutorials/language/schedule_primitives.py +++ b/tutorials/language/schedule_primitives.py @@ -27,6 +27,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### @@ -41,19 +42,19 @@ # # declare some variables for use later -n = tvm.var('n') -m = tvm.var('m') +n = te.var('n') +m = te.var('m') ###################################################################### # A schedule can be created from a list of ops, by default the # schedule computes tensor in a serial manner in a row-major order. 
# declare a matrix element-wise multiply -A = tvm.placeholder((m, n), name='A') -B = tvm.placeholder((m, n), name='B') -C = tvm.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C') +A = te.placeholder((m, n), name='A') +B = te.placeholder((m, n), name='B') +C = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C') -s = tvm.create_schedule([C.op]) +s = te.create_schedule([C.op]) # lower will transform the computation from definition to the real # callable function. With argument `simple_mode=True`, it will # return you a readable C like statement, we use it here to print the @@ -70,20 +71,20 @@ # ----- # :code:`split` can split a specified axis into two axises by # :code:`factor`. -A = tvm.placeholder((m,), name='A') -B = tvm.compute((m,), lambda i: A[i]*2, name='B') +A = te.placeholder((m,), name='A') +B = te.compute((m,), lambda i: A[i]*2, name='B') -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=32) print(tvm.lower(s, [A, B], simple_mode=True)) ###################################################################### # You can also split a axis by :code:`nparts`, which splits the axis # contrary with :code:`factor`. -A = tvm.placeholder((m,), name='A') -B = tvm.compute((m,), lambda i: A[i], name='B') +A = te.placeholder((m,), name='A') +B = te.compute((m,), lambda i: A[i], name='B') -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], nparts=32) print(tvm.lower(s, [A, B], simple_mode=True)) @@ -92,10 +93,10 @@ # ---- # :code:`tile` help you execute the computation tile by tile over two # axises. -A = tvm.placeholder((m, n), name='A') -B = tvm.compute((m, n), lambda i, j: A[i, j], name='B') +A = te.placeholder((m, n), name='A') +B = te.compute((m, n), lambda i, j: A[i, j], name='B') -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) print(tvm.lower(s, [A, B], simple_mode=True)) @@ -103,10 +104,10 @@ # fuse # ---- # :code:`fuse` can fuse two consecutive axises of one computation. -A = tvm.placeholder((m, n), name='A') -B = tvm.compute((m, n), lambda i, j: A[i, j], name='B') +A = te.placeholder((m, n), name='A') +B = te.compute((m, n), lambda i, j: A[i, j], name='B') -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) # tile to four axises first: (i.outer, j.outer, i.inner, j.inner) xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) # then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused) @@ -117,10 +118,10 @@ # reorder # ------- # :code:`reorder` can reorder the axises in the specified order. -A = tvm.placeholder((m, n), name='A') -B = tvm.compute((m, n), lambda i, j: A[i, j], name='B') +A = te.placeholder((m, n), name='A') +B = te.compute((m, n), lambda i, j: A[i, j], name='B') -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) # tile to four axises first: (i.outer, j.outer, i.inner, j.inner) xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) # then reorder the axises: (i.inner, j.outer, i.outer, j.inner) @@ -132,13 +133,13 @@ # ---- # :code:`bind` can bind a specified axis with a thread axis, often used # in gpu programming. 
-A = tvm.placeholder((n,), name='A') -B = tvm.compute(A.shape, lambda i: A[i] * 2, name='B') +A = te.placeholder((n,), name='A') +B = te.compute(A.shape, lambda i: A[i] * 2, name='B') -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) bx, tx = s[B].split(B.op.axis[0], factor=64) -s[B].bind(bx, tvm.thread_axis("blockIdx.x")) -s[B].bind(tx, tvm.thread_axis("threadIdx.x")) +s[B].bind(bx, te.thread_axis("blockIdx.x")) +s[B].bind(tx, te.thread_axis("threadIdx.x")) print(tvm.lower(s, [A, B], simple_mode=True)) ###################################################################### @@ -146,21 +147,21 @@ # ---------- # For a schedule that consists of multiple operators, TVM will compute # tensors at the root separately by default. -A = tvm.placeholder((m,), name='A') -B = tvm.compute((m,), lambda i: A[i]+1, name='B') -C = tvm.compute((m,), lambda i: B[i]*2, name='C') +A = te.placeholder((m,), name='A') +B = te.compute((m,), lambda i: A[i]+1, name='B') +C = te.compute((m,), lambda i: B[i]*2, name='C') -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### # :code:`compute_at` can move computation of `B` into the first axis # of computation of `C`. -A = tvm.placeholder((m,), name='A') -B = tvm.compute((m,), lambda i: A[i]+1, name='B') -C = tvm.compute((m,), lambda i: B[i]*2, name='C') +A = te.placeholder((m,), name='A') +B = te.compute((m,), lambda i: A[i]+1, name='B') +C = te.compute((m,), lambda i: B[i]*2, name='C') -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) s[B].compute_at(s[C], C.op.axis[0]) print(tvm.lower(s, [A, B, C], simple_mode=True)) @@ -170,11 +171,11 @@ # :code:`compute_inline` can mark one stage as inline, then the body of # computation will be expanded and inserted at the address where the # tensor is required. -A = tvm.placeholder((m,), name='A') -B = tvm.compute((m,), lambda i: A[i]+1, name='B') -C = tvm.compute((m,), lambda i: B[i]*2, name='C') +A = te.placeholder((m,), name='A') +B = te.compute((m,), lambda i: A[i]+1, name='B') +C = te.compute((m,), lambda i: B[i]*2, name='C') -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) s[B].compute_inline() print(tvm.lower(s, [A, B, C], simple_mode=True)) @@ -182,11 +183,11 @@ # compute_root # ------------ # :code:`compute_root` can move computation of one stage to the root. -A = tvm.placeholder((m,), name='A') -B = tvm.compute((m,), lambda i: A[i]+1, name='B') -C = tvm.compute((m,), lambda i: B[i]*2, name='C') +A = te.placeholder((m,), name='A') +B = te.compute((m,), lambda i: A[i]+1, name='B') +C = te.compute((m,), lambda i: B[i]*2, name='C') -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) s[B].compute_at(s[C], C.op.axis[0]) s[B].compute_root() print(tvm.lower(s, [A, B, C], simple_mode=True)) diff --git a/tutorials/language/tedd.py b/tutorials/language/tedd.py index 36146880d150..a6cd8019e31e 100644 --- a/tutorials/language/tedd.py +++ b/tutorials/language/tedd.py @@ -37,9 +37,8 @@ how to use TEDD and how to interpret the rendered graphs. 
""" -from __future__ import absolute_import, print_function - import tvm +from tvm import te import topi from tvm.contrib import tedd @@ -58,11 +57,13 @@ stride = 1 padding = "SAME" dilation=1 -A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A') -W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W') -B = tvm.placeholder((1, num_filter, 1), name='bias') + +A = te.placeholder((in_size, in_size, in_channel, batch), name='A') +W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W') +B = te.placeholder((1, num_filter, 1), name='bias') + with tvm.target.create("llvm"): - t_conv = topi.nn.conv2d(A, W, stride, padding, dilation, layout='HWCN') + t_conv = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation) t_bias = topi.add(t_conv, B) t_relu = topi.nn.relu(t_bias) s = topi.generic.schedule_conv2d_hwcn([t_relu]) diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py index afc708e5d1d3..429060626837 100644 --- a/tutorials/language/tensorize.py +++ b/tutorials/language/tensorize.py @@ -35,6 +35,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### @@ -46,12 +47,12 @@ # The following lines describe the computation :code:`A * B^T` in TVM. # N, M, L = 1024, 512, 64 -A = tvm.placeholder((N, L), name='A') -B = tvm.placeholder((M, L), name='B') -k = tvm.reduce_axis((0, L), name='k') -C = tvm.compute((N, M), lambda i, j: - tvm.sum(A[i, k] * B[j, k], axis=k), name='C') -s = tvm.create_schedule(C.op) +A = te.placeholder((N, L), name='A') +B = te.placeholder((M, L), name='B') +k = te.reduce_axis((0, L), name='k') +C = te.compute((N, M), lambda i, j: + te.sum(A[i, k] * B[j, k], axis=k), name='C') +s = te.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### @@ -88,37 +89,37 @@ # which is done in :code:`intrin_func` below. 
# def intrin_gemv(m, l): - a = tvm.placeholder((l,), name='a') - b = tvm.placeholder((m, l), name='b') - k = tvm.reduce_axis((0, l), name='k') - c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c') - Ab = tvm.decl_buffer(a.shape, a.dtype, + a = te.placeholder((l,), name='a') + b = te.placeholder((m, l), name='b') + k = te.reduce_axis((0, l), name='k') + c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name='c') + Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1]) - Bb = tvm.decl_buffer(b.shape, b.dtype, + Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, - strides=[tvm.var("s1"), 1]) - Cb = tvm.decl_buffer(c.shape, c.dtype, + strides=[te.var("s1"), 1]) + Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1]) def intrin_func(ins, outs): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() aa, bb = ins cc = outs[0] - ib.emit(tvm.call_extern("int32", "gemv_update", + ib.emit(tvm.tir.call_extern("int32", "gemv_update", cc.access_ptr("w"), aa.access_ptr("r"), bb.access_ptr("r"), m, l, bb.strides[0])) return ib.get() - with tvm.build_config(offset_factor=1): - return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb}) + with tvm.target.build_config(offset_factor=1): + return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb}) ###################################################################### -# Here :code:`tvm.decl_tensor_intrin` declares how to execute the computation :code:`c.op`. +# Here :code:`te.decl_tensor_intrin` declares how to execute the computation :code:`c.op`. # Our implementation simply takes the inputs and outputs, # converts them to pointers and emit an external function call. # Note that tensorization requires user to specify :code:`offset_factor`, @@ -134,7 +135,7 @@ def intrin_func(ins, outs): # For now :code:`bb.strides[0] == l`, # but later we will see how they can differ with more complicated schedules. # -# Note that we use :code:`tvm.var("s1")` as the first stride dimension for :code:`B`. +# Note that we use :code:`te.var("s1")` as the first stride dimension for :code:`B`. # If the strides can be inferred # - in this case, TVM knows tensor B is compact thus the strides are :code:`[L, 1]` - # such placeholder can be put to let TVM automatically bind the inferred value for us. 
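######################################################################
# For reference, a minimal sketch of how such an intrinsic is applied
# (assuming the :code:`A`, :code:`B`, :code:`C` and schedule :code:`s`
# defined above; the factor 16 is an illustrative choice, and 64 is the
# reduction length :code:`L`):
#
factor = 16
x, y = C.op.axis
z, = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
gemv = intrin_gemv(factor, 64)  # intrinsic tile must match the inner loop extents
s[C].tensorize(yi, gemv)        # the (yi, z) loop nest becomes one gemv_update call
print(tvm.lower(s, [A, B, C], simple_mode=True))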
@@ -233,20 +234,20 @@ def gemv_impl(): return ll_code def intrin_gemv(m, l): - a = tvm.placeholder((l,), name='a') - b = tvm.placeholder((m, l), name='b') - k = tvm.reduce_axis((0, l), name='k') - c = tvm.compute((m,), lambda i: - tvm.sum(a[k] * b[i, k], axis=k), name='c') - Ab = tvm.decl_buffer(a.shape, a.dtype, + a = te.placeholder((l,), name='a') + b = te.placeholder((m, l), name='b') + k = te.reduce_axis((0, l), name='k') + c = te.compute((m,), lambda i: + te.sum(a[k] * b[i, k], axis=k), name='c') + Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1]) - Bb = tvm.decl_buffer(b.shape, b.dtype, + Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, - strides=[tvm.var("s1"), 1]) - Cb = tvm.decl_buffer(c.shape, c.dtype, + strides=[te.var("s1"), 1]) + Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1]) @@ -254,22 +255,22 @@ def intrin_func(ins, outs): aa, bb = ins cc = outs[0] def _body(): - ib = tvm.ir_builder.create() - ib.emit(tvm.call_extern("int32", "gemv_update", + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_extern("int32", "gemv_update", cc.access_ptr("w"), aa.access_ptr("r"), bb.access_ptr("r"), m, l, bb.strides[0])) return ib.get() def _reduce_reset(): - ib = tvm.ir_builder.create() - ib.emit(tvm.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m)) + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m)) return ib.get() def _reduce_update(): return _body() return _body(), _reduce_reset(), _reduce_update() - with tvm.build_config(offset_factor=1): - return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb}) + with tvm.target.build_config(offset_factor=1): + return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb}) ###################################################################### # Note that :code:`intrin_func` now returns a triplet: diff --git a/tutorials/language/tuple_inputs.py b/tutorials/language/tuple_inputs.py index 715e2ef36f7e..828797ac1b1d 100644 --- a/tutorials/language/tuple_inputs.py +++ b/tutorials/language/tuple_inputs.py @@ -28,23 +28,24 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np ###################################################################### # Describe Batchwise Computation # ------------------------------ # For operators which have the same shape, we can put them together as -# the inputs of :any:`tvm.compute`, if we want them to be scheduled +# the inputs of :any:`te.compute`, if we want them to be scheduled # together in the next schedule procedure. # -n = tvm.var("n") -m = tvm.var("m") -A0 = tvm.placeholder((m, n), name='A0') -A1 = tvm.placeholder((m, n), name='A1') -B0, B1 = tvm.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name='B') +n = te.var("n") +m = te.var("m") +A0 = te.placeholder((m, n), name='A0') +A1 = te.placeholder((m, n), name='A1') +B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name='B') # The generated IR code would be: -s = tvm.create_schedule(B0.op) +s = te.create_schedule(B0.op) print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True)) ###################################################################### @@ -56,7 +57,7 @@ # operators, and the inputs will collaborate together, e.g. :code:`argmax`. # In the reduction procedure, :code:`argmax` need to compare the value of # operands, also need to keep the index of operand. 
It can be expressed
-# with :py:func:`tvm.comm_reducer` as below:
+# with :py:func:`te.comm_reducer` as below:
# x and y are the operands of reduction, each of them is a tuple of index
# and value.
@@ -68,20 +69,20 @@ def fcombine(x, y):
# our identity element also needs to be a tuple, so `fidentity` accepts
# two types as inputs.
def fidentity(t0, t1):
-    return tvm.const(-1, t0), tvm.min_value(t1)
+    return tvm.tir.const(-1, t0), tvm.te.min_value(t1)
-argmax = tvm.comm_reducer(fcombine, fidentity, name='argmax')
+argmax = te.comm_reducer(fcombine, fidentity, name='argmax')
# describe the reduction computation
-m = tvm.var('m')
-n = tvm.var('n')
-idx = tvm.placeholder((m, n), name='idx', dtype='int32')
-val = tvm.placeholder((m, n), name='val', dtype='int32')
-k = tvm.reduce_axis((0, n), 'k')
-T0, T1 = tvm.compute((m, ), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name='T')
+m = te.var('m')
+n = te.var('n')
+idx = te.placeholder((m, n), name='idx', dtype='int32')
+val = te.placeholder((m, n), name='val', dtype='int32')
+k = te.reduce_axis((0, n), 'k')
+T0, T1 = te.compute((m, ), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name='T')
# the generated IR code would be:
-s = tvm.create_schedule(T0.op)
+s = te.create_schedule(T0.op)
print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True))
######################################################################
@@ -97,14 +98,14 @@ def fidentity(t0, t1):
# with one batch operation, but they can only be scheduled together
# in terms of operation.
-n = tvm.var("n")
-m = tvm.var("m")
-A0 = tvm.placeholder((m, n), name='A0')
-B0, B1 = tvm.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')
-A1 = tvm.placeholder((m, n), name='A1')
-C = tvm.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')
+n = te.var("n")
+m = te.var("m")
+A0 = te.placeholder((m, n), name='A0')
+B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')
+A1 = te.placeholder((m, n), name='A1')
+C = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')
-s = tvm.create_schedule(C.op)
+s = te.create_schedule(C.op)
s[B0].compute_at(s[C], C.op.axis[0])
# as you can see in the generated IR code below:
print(tvm.lower(s, [A0, A1, C], simple_mode=True))
diff --git a/tutorials/optimize/opt_conv_cuda.py b/tutorials/optimize/opt_conv_cuda.py
index 74d1e6d8b6a0..025e53eb012a 100644
--- a/tutorials/optimize/opt_conv_cuda.py
+++ b/tutorials/optimize/opt_conv_cuda.py
@@ -42,6 +42,7 @@
import numpy as np
import tvm
+from tvm import te
# The sizes of inputs and filters
batch = 256
@@ -53,25 +54,25 @@
stride = 1
# Algorithm
-A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
-W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
+A = te.placeholder((in_size, in_size, in_channel, batch), name='A')
+W = te.placeholder((kernel, kernel, in_channel, out_channel), name='W')
out_size = (in_size - kernel + 2*pad) // stride + 1
# Pad input
-Apad = tvm.compute(
+Apad = te.compute(
    (in_size + 2*pad, in_size + 2*pad, in_channel, batch),
-    lambda yy, xx, cc, nn: tvm.if_then_else(
-        tvm.all(yy >= pad, yy - pad < in_size,
+    lambda yy, xx, cc, nn: tvm.tir.if_then_else(
+        tvm.tir.all(yy >= pad, yy - pad < in_size,
                xx >= pad, xx - pad < in_size),
-        A[yy - pad, xx - pad, cc, nn], tvm.const(0., "float32")),
+        A[yy - pad, xx - pad, cc, nn], tvm.tir.const(0., "float32")),
    name='Apad')
# Create reduction variables
-rc = tvm.reduce_axis((0, in_channel), name='rc')
-ry = tvm.reduce_axis((0, kernel), name='ry')
-rx = 
tvm.reduce_axis((0, kernel), name='rx') +rc = te.reduce_axis((0, in_channel), name='rc') +ry = te.reduce_axis((0, kernel), name='ry') +rx = te.reduce_axis((0, kernel), name='rx') # Compute the convolution -B = tvm.compute( +B = te.compute( (out_size, out_size, out_channel, batch), - lambda yy, xx, ff, nn: tvm.sum( + lambda yy, xx, ff, nn: te.sum( Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff], axis=[ry, rx, rc]), name='B') @@ -101,7 +102,7 @@ # # Designate the memory hierarchy -s = tvm.create_schedule(B.op) +s = te.create_schedule(B.op) s[Apad].compute_inline() # compute Apad inline AA = s.cache_read(Apad, 'shared', [B]) WW = s.cache_read(W, "shared", [B]) @@ -135,13 +136,13 @@ vthread = 2 # Get the GPU thread indices -block_x = tvm.thread_axis("blockIdx.x") -block_y = tvm.thread_axis("blockIdx.y") -block_z = tvm.thread_axis("blockIdx.z") -thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x") -thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y") -thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx") -thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy") +block_x = te.thread_axis("blockIdx.x") +block_y = te.thread_axis("blockIdx.y") +block_z = te.thread_axis("blockIdx.z") +thread_x = te.thread_axis((0, num_thread), "threadIdx.x") +thread_y = te.thread_axis((0, num_thread), "threadIdx.y") +thread_xz = te.thread_axis((0, vthread), "vthread", name="vx") +thread_yz = te.thread_axis((0, vthread), "vthread", name="vy") # Split the workloads hi, wi, fi, ni = s[B].op.axis diff --git a/tutorials/optimize/opt_conv_tensorcore.py b/tutorials/optimize/opt_conv_tensorcore.py index ef840892d7d5..44b9de3b99ff 100644 --- a/tutorials/optimize/opt_conv_tensorcore.py +++ b/tutorials/optimize/opt_conv_tensorcore.py @@ -52,6 +52,7 @@ # NHWCnc memory layout.The following code defines the convolution algorithm in TVM. 
import tvm +from tvm import te import numpy as np from tvm.contrib import nvcc @@ -98,30 +99,30 @@ block_size) # Reduction axes -kh = tvm.reduce_axis((0, kernel_h), name='kh') -kw = tvm.reduce_axis((0, kernel_w), name='kw') -ic = tvm.reduce_axis((0, in_channels // block_size), name='ic') -ii = tvm.reduce_axis((0, block_size), name='ii') +kh = te.reduce_axis((0, kernel_h), name='kh') +kw = te.reduce_axis((0, kernel_w), name='kw') +ic = te.reduce_axis((0, in_channels // block_size), name='ic') +ii = te.reduce_axis((0, block_size), name='ii') # Algorithm -A = tvm.placeholder(data_shape, name='A', dtype="float16") -W = tvm.placeholder(kernel_shape, name='W', dtype="float16") -Apad = tvm.compute( +A = te.placeholder(data_shape, name='A', dtype="float16") +W = te.placeholder(kernel_shape, name='W', dtype="float16") +Apad = te.compute( (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size, block_size), - lambda n, h, w, i, nn, ii: tvm.if_then_else( - tvm.all(h >= pad_h, h - pad_h < height, + lambda n, h, w, i, nn, ii: tvm.tir.if_then_else( + tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width), - A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.const(0., "float16")), + A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.tir.const(0., "float16")), name='Apad') -Conv = tvm.compute(output_shape, - lambda n, h, w, o, nn, oo: tvm.sum( +Conv = te.compute(output_shape, + lambda n, h, w, o, nn, oo: te.sum( Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") * W[kh, kw, ic, o, ii, oo].astype("float32"), axis=[ic, kh, kw, ii]), name="Conv") -s = tvm.create_schedule(Conv.op) +s = te.create_schedule(Conv.op) s[Apad].compute_inline() ############################################################################### @@ -152,49 +153,49 @@ def intrin_wmma_load_matrix(scope): n = 16 - A = tvm.placeholder((n, n), name='A', dtype='float16') - BA = tvm.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256) - C = tvm.compute((n, n), lambda i, j: A[i, j], name='C') - BC = tvm.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256) + A = te.placeholder((n, n), name='A', dtype='float16') + BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='shared', data_alignment=32, offset_factor=256) + C = te.compute((n, n), lambda i, j: A[i, j], name='C') + BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256) def intrin_func(ins, outs): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() BA = ins[0] BC = outs[0] - ib.emit(tvm.call_intrin('handle', 'tvm_load_matrix_sync', + ib.emit(tvm.tir.call_intrin('handle', 'tvm_load_matrix_sync', BC.data, n, n, n, BC.elem_offset // 256, BA.access_ptr('r'), n, 'row_major')) return ib.get() - return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) + return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) def intrin_wmma_gemm(): n = 16 - A = tvm.placeholder((n, n), name='A', dtype='float16') - B = tvm.placeholder((n, n), name='B', dtype='float16') - k = tvm.reduce_axis((0, n), name="k") - C = tvm.compute((n, n), + A = te.placeholder((n, n), name='A', dtype='float16') + B = te.placeholder((n, n), name='B', dtype='float16') + k = te.reduce_axis((0, n), name="k") + C = te.compute((n, n), lambda ii, jj: - tvm.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k), + te.sum(A[ii, k].astype('float') * B[k, jj].astype('float'), axis=k), name='C') - BA = tvm.decl_buffer(A.shape, 
A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256) - BB = tvm.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256) - BC = tvm.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256) + BA = tvm.tir.decl_buffer(A.shape, A.dtype, name='BA', scope='wmma.matrix_a', data_alignment=32, offset_factor=256) + BB = tvm.tir.decl_buffer(B.shape, B.dtype, name='BB', scope='wmma.matrix_b', data_alignment=32, offset_factor=256) + BC = tvm.tir.decl_buffer(C.shape, C.dtype, name='BC', scope='wmma.accumulator', data_alignment=32, offset_factor=256) def intrin_func(ins, outs): BA, BB = ins BC, = outs def init(): - ib = tvm.ir_builder.create() - ib.emit(tvm.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0)) + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_intrin('handle', 'tvm_fill_fragment', BC.data, n, n, n, BC.elem_offset // 256, 0.0)) return ib.get() def update(): - ib = tvm.ir_builder.create() - ib.emit(tvm.call_intrin('handle', 'tvm_mma_sync', + ib = tvm.tir.ir_builder.create() + ib.emit(tvm.tir.call_intrin('handle', 'tvm_mma_sync', BC.data, BC.elem_offset // 256, BA.data, BA.elem_offset // 256, BB.data, BB.elem_offset // 256, @@ -203,26 +204,26 @@ def update(): return update(), init(), update() - return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC}) + return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC}) def intrin_wmma_store_matrix(): n = 16 - A = tvm.placeholder((n, n), name='A', dtype='float32') - BA = tvm.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256) - C = tvm.compute((n, n), lambda i, j: A[i, j], name='C') - BC = tvm.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256) + A = te.placeholder((n, n), name='A', dtype='float32') + BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope='wmma.accumulator', data_alignment=32, offset_factor=256) + C = te.compute((n, n), lambda i, j: A[i, j], name='C') + BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope='global', data_alignment=32, offset_factor=256) def intrin_func(ins, outs): - ib = tvm.ir_builder.create() + ib = tvm.tir.ir_builder.create() BA = ins[0] BC = outs[0] - ib.emit(tvm.call_intrin('handle', 'tvm_store_matrix_sync', + ib.emit(tvm.tir.call_intrin('handle', 'tvm_store_matrix_sync', BA.data, n, n, n, BA.elem_offset // 256, BC.access_ptr('w'), n, 'row_major')) return ib.get() - return tvm.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) + return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) ############################################################################### # Scheduling the Computation @@ -255,12 +256,12 @@ def intrin_func(ins, outs): warp_size = 32 chunk = 2 -block_x = tvm.thread_axis('blockIdx.x') -block_y = tvm.thread_axis('blockIdx.y') -block_z = tvm.thread_axis('blockIdx.z') -thread_x = tvm.thread_axis('threadIdx.x') -thread_y = tvm.thread_axis('threadIdx.y') -thread_z = tvm.thread_axis('threadIdx.z') +block_x = te.thread_axis('blockIdx.x') +block_y = te.thread_axis('blockIdx.y') +block_z = te.thread_axis('blockIdx.z') +thread_x = te.thread_axis('threadIdx.x') +thread_y = te.thread_axis('threadIdx.y') +thread_z = te.thread_axis('threadIdx.z') nc, hc, wc, oc, nnc, ooc = Conv.op.axis block_k = s[Conv].fuse(hc, wc) @@ -330,7 +331,7 @@ def intrin_func(ins, outs): ctx = tvm.gpu(0) if 
nvcc.have_tensorcore(ctx.compute_version): - with tvm.build_config(auto_unroll_max_step=16): + with tvm.target.build_config(auto_unroll_max_step=16): func = tvm.build(s, [A, W, Conv], 'cuda') a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 8ed152aee918..daca89b23a52 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -56,6 +56,7 @@ # Then we write a baseline implementation, the simplest way to write a matrix multiplication in TVM. import tvm +from tvm import te import numpy import timeit @@ -94,16 +95,16 @@ answer = numpy.dot(a.asnumpy(), b.asnumpy()) # Algorithm -k = tvm.reduce_axis((0, K), 'k') -A = tvm.placeholder((M, K), name='A') -B = tvm.placeholder((K, N), name='B') -C = tvm.compute( +k = te.reduce_axis((0, K), 'k') +A = te.placeholder((M, K), name='A') +B = te.placeholder((K, N), name='B') +C = te.compute( (M, N), - lambda x, y: tvm.sum(A[x, k] * B[k, y], axis=k), + lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name='C') # Default schedule -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) func = tvm.build(s, [A, B, C], target=target, name='mmult') assert func @@ -129,7 +130,7 @@ # fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB (L1 data cache) bn = 32 -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) # Blocking by loop tiling xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) @@ -165,7 +166,7 @@ # # In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) k, = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=4) @@ -199,7 +200,7 @@ # which is not cache friendly. If we change the nested loop order of ki and inner axes xi, # the access pattern for A matrix is more cache friendly. -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) k, = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=4) @@ -244,12 +245,12 @@ # # We have to re-write the algorithm slightly. -packedB = tvm.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB') -C = tvm.compute((M, N), - lambda x, y: tvm.sum(A[x, k] * packedB[y // bn, k, tvm.indexmod(y, bn)], axis=k), +packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name='packedB') +C = te.compute((M, N), + lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k), name = 'C') -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) k, = s[C].op.reduce_axis @@ -285,7 +286,7 @@ # write to C when all the block results are ready. # -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) # Allocate write cache CC = s.cache_write(C, 'global') @@ -328,7 +329,7 @@ # -------- # Futhermore, we can also utilize multi-core processors to do the thread-level parallelization. 
-s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) CC = s.cache_write(C, 'global') diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py index a4658eba2bee..490ccdb9373a 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -46,12 +46,13 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.contrib import nvcc def matmul_nn(A, B, L, dtype='float16', layout='NN'): - k = tvm.reduce_axis((0, L), name='k') + k = te.reduce_axis((0, L), name='k') if dtype == 'float16': out_type = 'float' elif dtype == 'int8': @@ -59,13 +60,13 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'): elif dtype == 'int4' or dtype == 'int1': out_type = 'int' if (layout == 'NN'): - return tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k)) + return te.compute((N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[k, j].astype(out_type), axis=k)) if (layout == 'NT'): - return tvm.compute((N, M), lambda i, j: tvm.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k)) + return te.compute((N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[k, j].astype(out_type), axis=k)) if (layout == 'TN'): - return tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k)) + return te.compute((N, M), lambda i, j: te.sum(A[i, k].astype(out_type) * B[j, k].astype(out_type), axis=k)) if (layout == 'TT'): - return tvm.compute((N, M), lambda i, j: tvm.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k)) + return te.compute((N, M), lambda i, j: te.sum(A[k, i].astype(out_type) * B[j, k].astype(out_type), axis=k)) ############################################################################### # Scheduling the Computation @@ -85,7 +86,7 @@ def matmul_nn(A, B, L, dtype='float16', layout='NN'): # (2) The warp tile size is not 16x16x16 on CUDA9, or not one of {16x16x16, 32x8x16, 8x32x16} on CUDA version >= 10.0. # # In this schedule, storage_align is used to reduce bank conflicts of shared memory. Please refer to this -# `doc `_ +# `doc `_ # for the usage of storage_align primitive. In short, we need to add an offset to some shared memory buffer # to reduce bank conflicts. 
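# As a rough sketch of the idea (the names here are placeholders rather
# than the exact stages used below): for a shared-memory cache stage
# :code:`AS` whose rows hold :code:`row_elems` values, one might write
#
#     offset = 8                              # assumed pad, in elements
#     AS_align = row_elems + offset
#     s[AS].storage_align(AS.op.axis[-2], AS_align - 1, AS_align)
#
# :code:`storage_align(axis, factor, offset)` constrains the stride of
# :code:`axis` to :code:`k * factor + offset`, so consecutive rows start
# in different shared memory banks.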
# According to the `wmma doc `_, @@ -111,11 +112,11 @@ def test_gemm(N, L, M, dtype, layout): else: print ("Unsupported layout:", layout) sys.exit(1); - A = tvm.placeholder(shape_a, name='A', dtype=dtype) - B = tvm.placeholder(shape_b, name='B', dtype=dtype) + A = te.placeholder(shape_a, name='A', dtype=dtype) + B = te.placeholder(shape_b, name='B', dtype=dtype) C = matmul_nn(A, B, L, dtype, layout) - s = tvm.create_schedule(C.op) + s = te.create_schedule(C.op) y, x = s[C].op.axis k = s[C].op.reduce_axis[0] @@ -182,11 +183,11 @@ def test_gemm(N, L, M, dtype, layout): tz, xi = s[C].split(xi, WX) tx, xi = s[C].split(xi, TX) s[C].reorder(yo, xo, tz, ty, tx, yi, xi) - s[C].bind(yo, tvm.thread_axis("blockIdx.y")) - s[C].bind(xo, tvm.thread_axis("blockIdx.x")) - s[C].bind(ty, tvm.thread_axis("threadIdx.y")) - s[C].bind(tz, tvm.thread_axis("threadIdx.z")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) + s[C].bind(yo, te.thread_axis("blockIdx.y")) + s[C].bind(xo, te.thread_axis("blockIdx.x")) + s[C].bind(ty, te.thread_axis("threadIdx.y")) + s[C].bind(tz, te.thread_axis("threadIdx.z")) + s[C].bind(tx, te.thread_axis("threadIdx.x")) # schedule for CL stage ko, ki = s[CL].split(k, step_k * warp_tile_k) @@ -202,9 +203,9 @@ def test_gemm(N, L, M, dtype, layout): tx, vec = s[AA].split(tx, factor=v) fused = s[AA].fuse(s[AA].op.axis[0], xo) _, ty = s[AA].split(fused, factor=by) - s[AA].bind(ty, tvm.thread_axis("threadIdx.y")) - s[AA].bind(tz, tvm.thread_axis("threadIdx.z")) - s[AA].bind(tx, tvm.thread_axis("threadIdx.x")) + s[AA].bind(ty, te.thread_axis("threadIdx.y")) + s[AA].bind(tz, te.thread_axis("threadIdx.z")) + s[AA].bind(tx, te.thread_axis("threadIdx.x")) # vectorization is very important for float16/int8 inputs s[AA].vectorize(vec) @@ -215,9 +216,9 @@ def test_gemm(N, L, M, dtype, layout): tx, vec = s[BB].split(tx, factor=v) fused = s[BB].fuse(s[BB].op.axis[0], xo) _, ty = s[BB].split(fused, factor=by) - s[BB].bind(ty, tvm.thread_axis("threadIdx.y")) - s[BB].bind(tz, tvm.thread_axis("threadIdx.z")) - s[BB].bind(tx, tvm.thread_axis("threadIdx.x")) + s[BB].bind(ty, te.thread_axis("threadIdx.y")) + s[BB].bind(tz, te.thread_axis("threadIdx.z")) + s[BB].bind(tx, te.thread_axis("threadIdx.x")) s[BB].vectorize(vec) s[AL].compute_at(s[CL], kl) @@ -286,7 +287,7 @@ def tune_and_evaluate(M, N, L, dtype, layout): print(best_config) with autotvm.apply_history_best('matmul.log'): with tvm.target.create("cuda"): - with tvm.build_config(): + with tvm.target.build_config(): s, arg_bufs = test_gemm(N, L, M, dtype, layout) print(tvm.lower(s, arg_bufs, simple_mode=True)) func = tvm.build(s, arg_bufs) diff --git a/tutorials/relay_quick_start.py b/tutorials/relay_quick_start.py index d272a0e315b8..b258d1bf3338 100644 --- a/tutorials/relay_quick_start.py +++ b/tutorials/relay_quick_start.py @@ -42,6 +42,7 @@ from tvm import relay from tvm.relay import testing import tvm +from tvm import te from tvm.contrib import graph_runtime ###################################################################### diff --git a/tutorials/tensor_expr_get_started.py b/tutorials/tensor_expr_get_started.py index ca92b3b3ab95..ecd3f2b89528 100644 --- a/tutorials/tensor_expr_get_started.py +++ b/tutorials/tensor_expr_get_started.py @@ -28,6 +28,7 @@ from __future__ import absolute_import, print_function import tvm +from tvm import te import numpy as np # Global declarations of environment. @@ -62,10 +63,10 @@ # No computation happens during this phase, as we are only declaring how # the computation should be done. 
#
-n = tvm.var("n")
-A = tvm.placeholder((n,), name='A')
-B = tvm.placeholder((n,), name='B')
-C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+n = te.var("n")
+A = te.placeholder((n,), name='A')
+B = te.placeholder((n,), name='B')
+C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")
print(type(C))
######################################################################
@@ -88,7 +89,7 @@
# C[i] = A[i] + B[i];
# }
#
-s = tvm.create_schedule(C.op)
+s = te.create_schedule(C.op)
######################################################################
# We used the split construct to split the first axis of C,
@@ -114,8 +115,8 @@
# to generate code that runs on GPU.
#
if tgt == "cuda" or tgt == "rocm" or tgt.startswith('opencl'):
-  s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
-  s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
+  s[C].bind(bx, te.thread_axis("blockIdx.x"))
+  s[C].bind(tx, te.thread_axis("threadIdx.x"))
######################################################################
# Compilation
@@ -188,7 +189,7 @@
# arrays with different shapes into fadd, an error will be raised.
#
# We can do more specializations. For example, we can write
-# :code:`n = tvm.convert(1024)` instead of :code:`n = tvm.var("n")`,
+# :code:`n = tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`,
# in the computation declaration. The generated function will
# only take vectors with length 1024.
#
diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py
index 2e049828e5cc..5bb5f0a66e30 100644
--- a/tutorials/topi/intro_topi.py
+++ b/tutorials/topi/intro_topi.py
@@ -26,6 +26,7 @@
from __future__ import absolute_import, print_function
import tvm
+from tvm import te
import topi
import numpy as np
@@ -36,12 +37,12 @@
# To compute the sum of rows of a two-dimensional TVM tensor A, we should
# specify the symbolic operation as well as the schedule, as follows
#
-n = tvm.var("n")
-m = tvm.var("m")
-A = tvm.placeholder((n, m), name='A')
-k = tvm.reduce_axis((0, m), "k")
-B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
-s = tvm.create_schedule(B.op)
+n = te.var("n")
+m = te.var("m")
+A = te.placeholder((n, m), name='A')
+k = te.reduce_axis((0, m), "k")
+B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
+s = te.create_schedule(B.op)
######################################################################
# and to examine the IR code in human-readable format, we can do
@@ -50,11 +51,11 @@
######################################################################
# However, for such a common operation we had to define the reduce axis ourselves, as well as the explicit computation, with
-# :code:`tvm.compute`. Imagine for more complicated operations how much details we need to provide.
+# :code:`te.compute`. Imagine how many details we would need to provide for more complicated operations.
# Fortunately, we can replace those two lines with a simple :code:`topi.sum`, much like :code:`numpy.sum`
#
C = topi.sum(A, axis=1)
-ts = tvm.create_schedule(C.op)
+ts = te.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))
######################################################################
@@ -64,8 +65,8 @@
# Even shorter, TOPI provides operator overloading for such common operations. 
For example, # x, y = 100, 10 -a = tvm.placeholder((x, y, y), name="a") -b = tvm.placeholder((y, y), name="b") +a = te.placeholder((x, y, y), name="a") +b = te.placeholder((y, y), name="b") c = a + b # same as topi.broadcast_add d = a * b # same as topi.broadcast_mul @@ -110,7 +111,7 @@ ###################################################################### # TOPI also provides common neural nets operations such as _softmax_ with optimized schedule # -tarray = tvm.placeholder((512, 512), name="tarray") +tarray = te.placeholder((512, 512), name="tarray") softmax_topi = topi.nn.softmax(tarray) with tvm.target.create("cuda"): sst = topi.cuda.schedule_softmax(softmax_topi) @@ -129,8 +130,8 @@ # compute declaration and schedule. TVM will choose the right function to call with # the target information. -data = tvm.placeholder((1, 3, 224, 224)) -kernel = tvm.placeholder((10, 3, 5, 5)) +data = te.placeholder((1, 3, 224, 224)) +kernel = te.placeholder((10, 3, 5, 5)) with tvm.target.create("cuda"): conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1) diff --git a/vta/apps/gemm/python/tsim.py b/vta/apps/gemm/python/tsim.py index c0f7b136e11b..85fd463e3278 100644 --- a/vta/apps/gemm/python/tsim.py +++ b/vta/apps/gemm/python/tsim.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import ctypes import os.path as osp from sys import platform diff --git a/vta/apps/gemm/tests/python/chisel_accel.py b/vta/apps/gemm/tests/python/chisel_accel.py index 4666661f9bc9..441f36d8de09 100644 --- a/vta/apps/gemm/tests/python/chisel_accel.py +++ b/vta/apps/gemm/tests/python/chisel_accel.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np import tsim import sys @@ -32,7 +33,7 @@ """ def slice(A, slice_width): assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported" - dtype = type(A[0]) + dtype = type(A[0]) row = 0 # currently only supports uint if dtype is np.uint8: row = 8 // slice_width @@ -45,7 +46,7 @@ def slice(A, slice_width): else: dtype = 'uint8' - C = np.zeros((row, len(A))).astype(dtype) # sliced and transform + C = np.zeros((row, len(A))).astype(dtype) # sliced and transform # create mask slice_mask = 2**(slice_width)-1 @@ -57,7 +58,7 @@ def slice(A, slice_width): def slice_mat(A, slice_width): assert np.log2(slice_width) % 1 == 0, "only power of 2 is supported" - dtype = type(A[0][0]) + dtype = type(A[0][0]) row = 0 # currently only supports uint if dtype is np.uint8: row = 8 // slice_width @@ -71,7 +72,7 @@ def slice_mat(A, slice_width): dtype = 'uint8' # 3d array (bits, row, clmn) - C = np.zeros((row, A.shape[0], A.shape[1])).astype(dtype) # sliced and transform + C = np.zeros((row, A.shape[0], A.shape[1])).astype(dtype) # sliced and transform # create mask slice_mask = 2**(slice_width)-1 @@ -162,16 +163,16 @@ def test_accel(A, B, i_width, w_width): for i in range(len(a_arr)): for j in range(len(b_arr)): shift = np.uint8(i*i_width + j*w_width) - if i == 0 and j == 0: + if i == 0 and j == 0: cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(1)) # reset accumulator - else: + else: cycles += f(b_arr[j], a_arr[i], shift, accum, np.uint32(0)) # no reset return (accum.asnumpy(), cycles) """ Matrix Generator Parameters ----------- +---------- dtype : String, datatype generated (supports only uint) i_width : weight bit slices(needs to be less than actual bit width) w_width : activation bit slices(needs to be less than actual bit width) @@ -179,9 +180,9 @@ def test_accel(A, B, i_width, w_width): def top_test(dtype, i_width, w_width): # 
only supports positive values (up to 2**(bits-1)) - rmax = 127 + rmax = 127 # (m,16) * (16,16) GEMM - rrow = np.random.randint(7) + 1 + rrow = np.random.randint(7) + 1 clmn = 16 A = np.random.randint(rmax, size=(rrow,clmn)).astype(dtype) B = np.random.randint(rmax, size=(clmn,clmn)).astype(dtype) @@ -196,8 +197,8 @@ def top_test(dtype, i_width, w_width): for i in range(1): # reg1 and reg2 bits in hardware/chisel/src/main/Compute.scala must be modified for slices greater than 8 bits if sys.argv[1] == 'serial': - # generates a random uint8 GEMM with 2-bit(8/4) input and 4-bit(8/2) weight + # generates a random uint8 GEMM with 2-bit(8/4) input and 4-bit(8/2) weight top_test("uint8", 4, 2) elif sys.argv[1] == 'parallel': - # generates a random uint8 GEMM with 8-bit input and 8-bit weight (bit parallel) + # generates a random uint8 GEMM with 8-bit input and 8-bit weight (bit parallel) top_test('uint8', 8, 8) diff --git a/vta/apps/tsim_example/python/tsim.py b/vta/apps/tsim_example/python/tsim.py index c0f7b136e11b..85fd463e3278 100644 --- a/vta/apps/tsim_example/python/tsim.py +++ b/vta/apps/tsim_example/python/tsim.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import ctypes import os.path as osp from sys import platform diff --git a/vta/apps/tsim_example/tests/python/chisel_accel.py b/vta/apps/tsim_example/tests/python/chisel_accel.py index 1749aaa3b54d..370ac4068e18 100644 --- a/vta/apps/tsim_example/tests/python/chisel_accel.py +++ b/vta/apps/tsim_example/tests/python/chisel_accel.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np import tsim diff --git a/vta/apps/tsim_example/tests/python/verilog_accel.py b/vta/apps/tsim_example/tests/python/verilog_accel.py index 578a7c3a0020..3489ff2f6fed 100644 --- a/vta/apps/tsim_example/tests/python/verilog_accel.py +++ b/vta/apps/tsim_example/tests/python/verilog_accel.py @@ -16,6 +16,7 @@ # under the License. import tvm +from tvm import te import numpy as np import tsim diff --git a/vta/python/vta/build_module.py b/vta/python/vta/build_module.py index 4a62d354eb41..4c33d36d69b5 100644 --- a/vta/python/vta/build_module.py +++ b/vta/python/vta/build_module.py @@ -16,8 +16,6 @@ # under the License. # pylint: disable=unused-argument """VTA specific buildin for runtime.""" -from __future__ import absolute_import as _abs - import tvm from . 
import ir_pass from .environment import get_env @@ -26,13 +24,13 @@ def lift_coproc_scope(x): """Lift coprocessings cope to the """ x = ir_pass.lift_alloc_to_scope_begin(x) - x = tvm.ir_pass.LiftAttrScope(x, "coproc_scope", False) + x = tvm.tir.ir_pass.LiftAttrScope(x, "coproc_scope", False) return x def early_rewrite(stmt): """Try to do storage rewrite in early pass.""" try: - return tvm.ir_pass.StorageRewrite(stmt) + return tvm.tir.ir_pass.StorageRewrite(stmt) except tvm.error.TVMError: return stmt @@ -63,7 +61,7 @@ def build_config(debug_flag=0, **kwargs): """ env = get_env() def add_debug(stmt): - debug = tvm.call_extern( + debug = tvm.tir.call_extern( "int32", "VTASetDebugMode", env.dev.command_handle, debug_flag) @@ -73,17 +71,17 @@ def add_debug(stmt): (1, ir_pass.inject_dma_intrin), (1, ir_pass.inject_skip_copy), (1, ir_pass.annotate_alu_coproc_scope), - (1, lambda x: tvm.ir_pass.LiftAttrScope(x, "coproc_uop_scope", True)), + (1, lambda x: tvm.tir.ir_pass.LiftAttrScope(x, "coproc_uop_scope", True)), (1, lift_coproc_scope), (1, ir_pass.inject_coproc_sync), (1, early_rewrite)] if debug_flag: pass_list.append((1, add_debug)) pass_list.append((2, ir_pass.inject_alu_intrin)) - pass_list.append((3, tvm.ir_pass.LowerStorageAccessInfo)) + pass_list.append((3, tvm.tir.ir_pass.LowerStorageAccessInfo)) pass_list.append((3, ir_pass.fold_uop_loop)) pass_list.append((3, ir_pass.cpu_access_rewrite)) - return tvm.build_config(add_lower_pass=pass_list, **kwargs) + return tvm.target.build_config(add_lower_pass=pass_list, **kwargs) def lower(*args, **kwargs): diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index 8d58958410e5..49b78b321bd2 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -22,6 +22,7 @@ import json import copy import tvm +from tvm import te from . 
import intrin from .pkg_config import PkgConfig @@ -61,9 +62,9 @@ class DevContext(object): QID_COMPUTE = 2 def __init__(self, env): - self.vta_axis = tvm.thread_axis("vta") + self.vta_axis = te.thread_axis("vta") self.vta_push_uop = tvm.tir.StringImm("VTAPushGEMMOp") - ctx = tvm.call_extern("handle", "VTATLSCommandHandle") + ctx = tvm.tir.call_extern("handle", "VTATLSCommandHandle") self.command_handle = tvm.tir.Call( "handle", "tvm_thread_context", [ctx], tvm.tir.Call.Intrinsic, None, 0) @@ -284,14 +285,14 @@ def mem_info_acc_buffer(): @tvm.register_func("tvm.intrin.rule.default.vta.coproc_sync") def coproc_sync(op): _ = op - return tvm.call_extern( + return tvm.tir.call_extern( "int32", "VTASynchronize", get_env().dev.command_handle, 1<<31) @tvm.register_func("tvm.intrin.rule.default.vta.coproc_dep_push") def coproc_dep_push(op): - return tvm.call_extern( + return tvm.tir.call_extern( "int32", "VTADepPush", get_env().dev.command_handle, op.args[0], op.args[1]) @@ -299,7 +300,7 @@ def coproc_dep_push(op): @tvm.register_func("tvm.intrin.rule.default.vta.coproc_dep_pop") def coproc_dep_pop(op): - return tvm.call_extern( + return tvm.tir.call_extern( "int32", "VTADepPop", get_env().dev.command_handle, op.args[0], op.args[1]) diff --git a/vta/python/vta/intrin.py b/vta/python/vta/intrin.py index a43fc75a92d0..8532ffa318b5 100644 --- a/vta/python/vta/intrin.py +++ b/vta/python/vta/intrin.py @@ -18,6 +18,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te def gemm(env, mock=False): """Matrix-matrix multiply intrinsic @@ -45,26 +46,26 @@ def gemm(env, mock=False): out_shape = (env.BATCH, env.BLOCK_OUT) assert out_shape[0] * out_shape[1] == out_lanes - wgt = tvm.placeholder((wgt_shape[0], wgt_shape[1]), - dtype="int%d" % env.WGT_WIDTH, - name=env.wgt_scope) - inp = tvm.placeholder((inp_shape[0], inp_shape[1]), - dtype="int%d" % env.INP_WIDTH, - name=env.inp_scope) - k = tvm.reduce_axis((0, wgt_shape[1]), name="k") + wgt = te.placeholder((wgt_shape[0], wgt_shape[1]), + dtype="int%d" % env.WGT_WIDTH, + name=env.wgt_scope) + inp = te.placeholder((inp_shape[0], inp_shape[1]), + dtype="int%d" % env.INP_WIDTH, + name=env.inp_scope) + k = te.reduce_axis((0, wgt_shape[1]), name="k") out_dtype = "int%d" % env.ACC_WIDTH - out = tvm.compute((out_shape[0], out_shape[1]), - lambda i, j: tvm.sum(inp[i, k].astype(out_dtype) * - wgt[j, k].astype(out_dtype), - axis=[k]), - name="out") - wgt_layout = tvm.decl_buffer( + out = te.compute((out_shape[0], out_shape[1]), + lambda i, j: te.sum(inp[i, k].astype(out_dtype) * + wgt[j, k].astype(out_dtype), + axis=[k]), + name="out") + wgt_layout = tvm.tir.decl_buffer( wgt.shape, wgt.dtype, env.wgt_scope, scope=env.wgt_scope, offset_factor=wgt_lanes, data_alignment=wgt_lanes) - inp_layout = tvm.decl_buffer( + inp_layout = tvm.tir.decl_buffer( inp.shape, inp.dtype, env.inp_scope, scope=env.inp_scope, offset_factor=inp_lanes, data_alignment=inp_lanes) - out_layout = tvm.decl_buffer( + out_layout = tvm.tir.decl_buffer( out.shape, out.dtype, env.acc_scope, scope=env.acc_scope, offset_factor=out_lanes, data_alignment=out_lanes) @@ -74,14 +75,14 @@ def intrin_func(ins, outs): dout = outs[0] def instr(index): """Generate matrix-matrix multiply VTA instruction""" - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() dev = env.dev irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) if index in (0, 2): - irb.emit(tvm.call_extern( + 
irb.emit(tvm.tir.call_extern( "int32", "VTAUopPush", 0, 0, dout.access_ptr("rw", "int32"), @@ -89,7 +90,7 @@ def instr(index): dwgt.access_ptr("r", "int32"), 0, 0, 0)) else: - irb.emit(tvm.call_extern( + irb.emit(tvm.tir.call_extern( "int32", "VTAUopPush", 0, 1, dout.access_ptr("rw", "int32"), @@ -103,8 +104,8 @@ def instr(index): return (nop, nop, nop) return (instr(0), instr(1), instr(2)) - return tvm.decl_tensor_intrin(out.op, intrin_func, - name="GEMM", - binds={inp: inp_layout, - wgt: wgt_layout, - out: out_layout}) + return te.decl_tensor_intrin(out.op, intrin_func, + name="GEMM", + binds={inp: inp_layout, + wgt: wgt_layout, + out: out_layout}) diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 0c9b2eac2df7..4f8deff285a6 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -17,6 +17,7 @@ """Additional IR Pass for VTA""" # pylint: disable=len-as-condition, no-else-return import tvm +from tvm import te from topi import util from .environment import get_env @@ -82,7 +83,7 @@ def _post_order(op): fail[0] = True return op if gemm_offsets[i] is not None: - if not tvm.ir_pass.Equal(m[0], gemm_offsets[i]): + if not tvm.tir.ir_pass.Equal(m[0], gemm_offsets[i]): fail[0] = True return op args.append(m[1]) @@ -90,23 +91,23 @@ def _post_order(op): gemm_offsets[i] = m[0] args.append(m[1]) args += op.args[base_args+3:] - return tvm.call_extern("int32", "VTAUopPush", *args) + return tvm.tir.call_extern("int32", "VTAUopPush", *args) if op.name not in ("VTATLSCommandHandle", "tvm_thread_context"): raise RuntimeError("unexpected op %s" % op) return op - ret = tvm.ir_pass.IRTransform( + ret = tvm.tir.ir_pass.IRTransform( stmt.body, None, _post_order, ["Call"]) if not fail[0] and all(x is not None for x in gemm_offsets): def _visit(op): if op.same_as(loop_var): fail[0] = True - tvm.ir_pass.PostOrderVisit(ret, _visit) + tvm.tir.ir_pass.PostOrderVisit(ret, _visit) if not fail[0]: - begin = tvm.call_extern( + begin = tvm.tir.call_extern( "int32", "VTAUopLoopBegin", stmt.extent, *gemm_offsets) - end = tvm.call_extern("int32", "VTAUopLoopEnd") + end = tvm.tir.call_extern("int32", "VTAUopLoopEnd") return [begin, ret, end] raise ValueError("Failed to fold the GEMM instructions..") @@ -137,7 +138,7 @@ def _do_fold(stmt): return tvm.tir.AttrStmt( stmt.node, stmt.attr_key, stmt.value, body) return None - out = tvm.ir_pass.IRTransform( + out = tvm.tir.ir_pass.IRTransform( stmt_in, _do_fold, None, ["AttrStmt"]) return out @@ -169,7 +170,7 @@ def _post_order(op): return None new_var = rw_info[buffer_var] let_stmt = tvm.tir.LetStmt( - new_var, tvm.call_extern( + new_var, tvm.tir.call_extern( "handle", "VTABufferCPUPtr", env.dev.command_handle, buffer_var), op.body) @@ -181,23 +182,23 @@ def _post_order(op): if isinstance(op, tvm.tir.Load): buffer_var = op.buffer_var if not buffer_var in rw_info: - rw_info[buffer_var] = tvm.var( + rw_info[buffer_var] = te.var( buffer_var.name + "_ptr", "handle") new_var = rw_info[buffer_var] return tvm.tir.Load(op.dtype, new_var, op.index) if isinstance(op, tvm.tir.Store): buffer_var = op.buffer_var if not buffer_var in rw_info: - rw_info[buffer_var] = tvm.var( + rw_info[buffer_var] = te.var( buffer_var.name + "_ptr", "handle") new_var = rw_info[buffer_var] return tvm.tir.Store(new_var, op.value, op.index) raise RuntimeError("not reached") - stmt = tvm.ir_pass.IRTransform( + stmt = tvm.tir.ir_pass.IRTransform( stmt_in, None, _post_order, ["Allocate", "Load", "Store"]) for buffer_var, new_var in rw_info.items(): stmt = tvm.tir.LetStmt( - 
new_var, tvm.call_extern( + new_var, tvm.tir.call_extern( "handle", "VTABufferCPUPtr", env.dev.command_handle, buffer_var), stmt) @@ -259,7 +260,7 @@ def _post_order(op): if isinstance(op, tvm.tir.For): return _merge_block(lift_stmt.pop() + [op], op.body) raise RuntimeError("not reached") - stmt = tvm.ir_pass.IRTransform( + stmt = tvm.tir.ir_pass.IRTransform( stmt_in, _pre_order, _post_order, ["Allocate", "AttrStmt", "For"]) assert len(lift_stmt) == 1 return _merge_block(lift_stmt[0], stmt) @@ -282,7 +283,7 @@ def _do_fold(stmt): if _match_pragma(stmt, "skip_dma_copy"): return tvm.tir.Evaluate(0) return None - return tvm.ir_pass.IRTransform( + return tvm.tir.ir_pass.IRTransform( stmt_in, _do_fold, None, ["AttrStmt"]) @@ -313,9 +314,9 @@ def _do_fold(stmt): op.loop_var, op.min, 2, op.for_type, op.device_api, op.body) return None - stmt = tvm.ir_pass.IRTransform( + stmt = tvm.tir.ir_pass.IRTransform( stmt_in, None, _do_fold, ["AttrStmt"]) - stmt = tvm.ir_pass.CoProcSync(stmt) + stmt = tvm.tir.ir_pass.CoProcSync(stmt) return stmt @@ -333,12 +334,12 @@ def inject_dma_intrin(stmt_in): Transformed statement """ env = get_env() - idxd = tvm.indexdiv - idxm = tvm.indexmod + idxd = tvm.tir.indexdiv + idxm = tvm.tir.indexmod def _check_compact(buf): ndim = len(buf.shape) - size = tvm.const(1, buf.shape[0].dtype) + size = tvm.tir.const(1, buf.shape[0].dtype) for i in reversed(range(ndim)): if not util.equal_const_int(size - buf.strides[i], 0): raise RuntimeError( @@ -380,7 +381,7 @@ def _fold_buffer_dim(buf, scope, elem_block): break x_size = x_size * buf.shape[k] next_base = i + 1 - shape.append(tvm.ir_pass.Simplify(x_size)) + shape.append(tvm.tir.ir_pass.Simplify(x_size)) strides.append(x_stride) assert next_base != base base = next_base @@ -491,10 +492,10 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): _check_compact(src) x_size, y_size, x_stride, offset = _get_2d_pattern( dst, elem_width, elem_bytes, data_type, src.scope, allow_fold=True) - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) - irb.emit(tvm.call_extern( + irb.emit(tvm.tir.call_extern( "int32", "VTAStoreBuffer2D", env.dev.command_handle, src.access_ptr("r", "int32"), @@ -561,11 +562,11 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): src, elem_width, elem_bytes, data_type, dst.scope, allow_fold=allow_fold) - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(task_qid)) - irb.emit(tvm.call_extern( + irb.emit(tvm.tir.call_extern( "int32", "VTALoadBuffer2D", env.dev.command_handle, src.data, offset, x_size, y_size, x_stride, @@ -577,7 +578,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value): else: raise RuntimeError("Do not support copy %s->%s" % (src.scope, dst.scope)) - return tvm.ir_pass.InjectCopyIntrin(stmt_in, "dma_copy", _inject_copy) + return tvm.tir.ir_pass.InjectCopyIntrin(stmt_in, "dma_copy", _inject_copy) def _get_gemm_intrin_buffer(): @@ -594,26 +595,26 @@ def _get_gemm_intrin_buffer(): assert out_lanes == env.BATCH * env.BLOCK_OUT out_shape = (env.BATCH, env.BLOCK_OUT) assert out_shape[0] * out_shape[1] == out_lanes - wgt = tvm.placeholder((wgt_shape[0], wgt_shape[1]), - dtype="int%d" % env.WGT_WIDTH, - name=env.wgt_scope) - inp = tvm.placeholder((inp_shape[0], inp_shape[1]), - dtype="int%d" % env.INP_WIDTH, - name=env.inp_scope) - k = tvm.reduce_axis((0, wgt_shape[1]), name="k") + wgt = 
te.placeholder((wgt_shape[0], wgt_shape[1]), + dtype="int%d" % env.WGT_WIDTH, + name=env.wgt_scope) + inp = te.placeholder((inp_shape[0], inp_shape[1]), + dtype="int%d" % env.INP_WIDTH, + name=env.inp_scope) + k = te.reduce_axis((0, wgt_shape[1]), name="k") out_dtype = "int%d" % env.ACC_WIDTH - out = tvm.compute((out_shape[0], out_shape[1]), - lambda i, j: tvm.sum(inp[i, k].astype(out_dtype) * - wgt[j, k].astype(out_dtype), - axis=[k]), - name="out") - wgt_layout = tvm.decl_buffer( + out = te.compute((out_shape[0], out_shape[1]), + lambda i, j: te.sum(inp[i, k].astype(out_dtype) * + wgt[j, k].astype(out_dtype), + axis=[k]), + name="out") + wgt_layout = tvm.tir.decl_buffer( wgt.shape, wgt.dtype, env.wgt_scope, scope=env.wgt_scope, offset_factor=wgt_lanes, data_alignment=wgt_lanes) - inp_layout = tvm.decl_buffer( + inp_layout = tvm.tir.decl_buffer( inp.shape, inp.dtype, env.inp_scope, scope=env.inp_scope, offset_factor=inp_lanes, data_alignment=inp_lanes) - out_layout = tvm.decl_buffer( + out_layout = tvm.tir.decl_buffer( out.shape, out.dtype, env.acc_scope, scope=env.acc_scope, offset_factor=out_lanes, data_alignment=out_lanes) @@ -648,30 +649,30 @@ def _find_basics(op): def _do_fold(op): if _match_pragma(op, "conv2d_transpose_gemm"): is_init = ".init" in str(op) - tvm.ir_pass.PostOrderVisit(op, _find_basics) + tvm.tir.ir_pass.PostOrderVisit(op, _find_basics) if is_init: # create inner most block - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() dev = env.dev irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) - irb.emit(tvm.call_extern("int32", "VTAUopPush", - 0, 1, - dout.access_ptr("rw", "int32"), - 0, 0, - 0, 0, 0)) + irb.emit(tvm.tir.call_extern("int32", "VTAUopPush", + 0, 1, + dout.access_ptr("rw", "int32"), + 0, 0, + 0, 0, 0)) inner = irb.get() # TODO(@tmoreau89): This is only a temporary fix, please take a look. 
body = op.body.body - while isinstance(body, tvm.stmt.IfThenElse): + while isinstance(body, tvm.tir.IfThenElse): body = body.then_case args = body.args res_tensor = body.func.output(0) tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_OUT) inner = tvm.tir.AttrStmt( [dout, res_tensor], 'buffer_bind_scope', - tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + tvm.tir.call_intrin('handle', 'tvm_tuple', *tpl), inner) return inner else: conv_call, data_call, kernel_call = calls[-3:] @@ -682,20 +683,20 @@ def _do_fold(op): if selects: condition = selects[0].condition else: - condition = tvm.const(1, 'int') + condition = tvm.tir.const(1, 'int') # create inner most block - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() with irb.if_scope(condition): dev = env.dev irb.scope_attr(dev.vta_axis, "coproc_scope", dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) - irb.emit(tvm.call_extern("int32", "VTAUopPush", - 0, 0, - dout.access_ptr("rw", "int32"), - dinp.access_ptr("r", "int32"), - dwgt.access_ptr("r", "int32"), - 0, 0, 0)) + irb.emit(tvm.tir.call_extern("int32", "VTAUopPush", + 0, 0, + dout.access_ptr("rw", "int32"), + dinp.access_ptr("r", "int32"), + dwgt.access_ptr("r", "int32"), + 0, 0, 0)) inner = irb.get() args = conv_call.args @@ -703,22 +704,22 @@ def _do_fold(op): 1, 0, 1, 0, env.BLOCK_OUT) inner = tvm.tir.AttrStmt( [dout, res_tensor], 'buffer_bind_scope', - tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + tvm.tir.call_intrin('handle', 'tvm_tuple', *tpl), inner) args = kernel_call.args tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, env.BLOCK_OUT, 0, env.BLOCK_IN) inner = tvm.tir.AttrStmt( [dwgt, kernel_tensor], 'buffer_bind_scope', - tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + tvm.tir.call_intrin('handle', 'tvm_tuple', *tpl), inner) args = data_call.args tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0, env.BLOCK_IN) inner = tvm.tir.AttrStmt( [dinp, pad_data_tensor], 'buffer_bind_scope', - tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner) + tvm.tir.call_intrin('handle', 'tvm_tuple', *tpl), inner) return inner return None - ret = tvm.ir_pass.IRTransform( + ret = tvm.tir.ir_pass.IRTransform( stmt_in, _do_fold, None, ["AttrStmt"]) return ret @@ -739,7 +740,7 @@ def annotate_alu_coproc_scope(stmt_in): env = get_env() def _do_fold(stmt): if _match_pragma(stmt, "alu"): - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() irb.scope_attr(env.dev.vta_axis, "coproc_scope", env.dev.get_task_qid(env.dev.QID_COMPUTE)) irb.scope_attr(env.dev.vta_axis, "coproc_uop_scope", @@ -750,7 +751,7 @@ def _do_fold(stmt): return tvm.tir.Evaluate(0) return stmt - stmt_out = tvm.ir_pass.IRTransform( + stmt_out = tvm.tir.ir_pass.IRTransform( stmt_in, None, _do_fold, ["AttrStmt"]) return stmt_out @@ -770,11 +771,11 @@ def inject_alu_intrin(stmt_in): Transformed statement """ env = get_env() - idxm = tvm.indexmod + idxm = tvm.tir.indexmod def _do_fold(stmt): def _equal(x, y): - return tvm.ir_pass.Equal(tvm.ir_pass.Simplify(x - y), 0) + return tvm.tir.ir_pass.Equal(tvm.tir.ir_pass.Simplify(x - y), 0) def _flatten_loop(src_coeff, dst_coeff, extents): src_coeff = list(src_coeff) @@ -793,7 +794,7 @@ def _flatten_loop(src_coeff, dst_coeff, extents): next_ext = extents.pop() if _equal(next_src, vsrc * vext) and _equal(next_dst, vdst * vext): - vext = tvm.ir_pass.Simplify(vext * next_ext) + vext = tvm.tir.ir_pass.Simplify(vext * next_ext) else: rev_src_coeff.append(vsrc) 
rev_dst_coeff.append(vdst) @@ -853,7 +854,7 @@ def _flatten_loop(src_coeff, dst_coeff, extents): if loop_body.value.name == 'shift_left': alu_opcode = env.dev.ALU_OPCODE_SHR lhs = loop_body.value.args[0] - rhs = tvm.ir_pass.Simplify(-loop_body.value.args[1]) + rhs = tvm.tir.ir_pass.Simplify(-loop_body.value.args[1]) elif loop_body.value.name == 'shift_right': alu_opcode = env.dev.ALU_OPCODE_SHR lhs = loop_body.value.args[0] @@ -864,7 +865,7 @@ def _flatten_loop(src_coeff, dst_coeff, extents): elif isinstance(loop_body.value, tvm.tir.Load): alu_opcode = env.dev.ALU_OPCODE_SHR lhs = loop_body.value - rhs = tvm.const(0, "int32") + rhs = tvm.tir.const(0, "int32") else: raise RuntimeError( "Expression not recognized %s, %s, %s" % ( @@ -894,9 +895,9 @@ def _flatten_loop(src_coeff, dst_coeff, extents): lhs_equal = True rhs_equal = True for i, coef in enumerate(dst_coeff): - if not tvm.ir_pass.Equal(coef, src_lhs_coeff[i]): + if not tvm.tir.ir_pass.Equal(coef, src_lhs_coeff[i]): lhs_equal = False - if not tvm.ir_pass.Equal(coef, src_rhs_coeff[i]): + if not tvm.tir.ir_pass.Equal(coef, src_rhs_coeff[i]): rhs_equal = False # Make sure at least one of the source is identical to the # destination (in-place computation) @@ -915,20 +916,20 @@ def _flatten_loop(src_coeff, dst_coeff, extents): assert len(src_coeff) > 1 assert len(dst_coeff) > 1 assert len(extents) != 0 - assert tvm.ir_pass.Equal( - tvm.ir_pass.Simplify( + assert tvm.tir.ir_pass.Equal( + tvm.tir.ir_pass.Simplify( idxm(src_coeff[-1], env.BATCH * env.BLOCK_OUT)), 0) - assert tvm.ir_pass.Equal( - tvm.ir_pass.Simplify( + assert tvm.tir.ir_pass.Equal( + tvm.tir.ir_pass.Simplify( idxm(dst_coeff[-1], env.BATCH * env.BLOCK_OUT)), 0) - assert tvm.ir_pass.Equal(src_coeff[-2], 1) - assert tvm.ir_pass.Equal(dst_coeff[-2], 1) + assert tvm.tir.ir_pass.Equal(src_coeff[-2], 1) + assert tvm.tir.ir_pass.Equal(dst_coeff[-2], 1) if env.BATCH > 1: assert len(src_coeff) > 2 assert len(dst_coeff) > 2 assert len(extents) > 1 - assert tvm.ir_pass.Equal(src_coeff[-3], env.BLOCK_OUT) - assert tvm.ir_pass.Equal(dst_coeff[-3], env.BLOCK_OUT) + assert tvm.tir.ir_pass.Equal(src_coeff[-3], env.BLOCK_OUT) + assert tvm.tir.ir_pass.Equal(dst_coeff[-3], env.BLOCK_OUT) # Apply tensorization of the loop coefficients src_offset = src_coeff[-1] @@ -944,22 +945,22 @@ def _flatten_loop(src_coeff, dst_coeff, extents): src_coeff.append(src_offset) dst_coeff.append(dst_offset) src_coeff = [ - tvm.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT)) for c in src_coeff] + tvm.tir.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT)) for c in src_coeff] dst_coeff = [ - tvm.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT)) for c in dst_coeff] + tvm.tir.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT)) for c in dst_coeff] # Flatten the outer loops if extents: src_coeff, dst_coeff, extents = _flatten_loop(src_coeff, dst_coeff, extents) # Insert ALU micro-ops - irb = tvm.ir_builder.create() + irb = tvm.tir.ir_builder.create() for idx, extent in enumerate(extents): - irb.emit(tvm.call_extern( + irb.emit(tvm.tir.call_extern( "int32", "VTAUopLoopBegin", extent, dst_coeff[idx], src_coeff[idx], 0)) use_imm = int(use_imm) - irb.emit(tvm.call_extern( + irb.emit(tvm.tir.call_extern( "int32", "VTAUopPush", 1, 0, dst_coeff[len(dst_coeff)-1], @@ -967,12 +968,12 @@ def _flatten_loop(src_coeff, dst_coeff, extents): 0, alu_opcode, use_imm, imm_val)) for extent in extents: - irb.emit(tvm.call_extern( + irb.emit(tvm.tir.call_extern( "int32", "VTAUopLoopEnd")) return irb.get() return stmt - stmt_out = 
tvm.ir_pass.IRTransform( + stmt_out = tvm.tir.ir_pass.IRTransform( stmt_in, None, _do_fold, ["AttrStmt"]) return stmt_out diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py index 0720e2fc9734..0516e839484a 100644 --- a/vta/python/vta/pkg_config.py +++ b/vta/python/vta/pkg_config.py @@ -193,7 +193,7 @@ def __init__(self, cfg, proj_root): self.inp_mem_size = 1 << cfg["LOG_INP_BUFF_SIZE"] # bytes self.inp_mem_banks = (inp_mem_bus_width + \ max_bus_width - 1) // \ - max_bus_width + max_bus_width self.inp_mem_width = min(inp_mem_bus_width, max_bus_width) self.inp_mem_depth = self.inp_mem_size * 8 // inp_mem_bus_width self.inp_mem_axi_ratio = self.inp_mem_width // mem_bus_width @@ -204,7 +204,7 @@ def __init__(self, cfg, proj_root): self.wgt_mem_size = 1 << cfg["LOG_WGT_BUFF_SIZE"] # bytes self.wgt_mem_banks = (wgt_mem_bus_width + \ max_bus_width - 1) // \ - max_bus_width + max_bus_width self.wgt_mem_width = min(wgt_mem_bus_width, max_bus_width) self.wgt_mem_depth = self.wgt_mem_size * 8 // wgt_mem_bus_width self.wgt_mem_axi_ratio = self.wgt_mem_width // mem_bus_width @@ -215,7 +215,7 @@ def __init__(self, cfg, proj_root): self.out_mem_size = 1 << cfg["LOG_OUT_BUFF_SIZE"] # bytes self.out_mem_banks = (out_mem_bus_width + \ max_bus_width - 1) // \ - max_bus_width + max_bus_width self.out_mem_width = min(out_mem_bus_width, max_bus_width) self.out_mem_depth = self.out_mem_size * 8 // out_mem_bus_width self.out_mem_axi_ratio = self.out_mem_width // mem_bus_width @@ -235,23 +235,23 @@ def __init__(self, cfg, proj_root): self.macro_defs.append("-DVTA_STORE_ADDR=%s" % (self.store_base_addr)) # IP register offsets self.macro_defs.append("-DVTA_FETCH_INSN_COUNT_OFFSET=%s" % \ - (self.fetch_insn_count_offset)) + (self.fetch_insn_count_offset)) self.macro_defs.append("-DVTA_FETCH_INSN_ADDR_OFFSET=%s" % \ - (self.fetch_insn_addr_offset)) + (self.fetch_insn_addr_offset)) self.macro_defs.append("-DVTA_LOAD_INP_ADDR_OFFSET=%s" % \ - (self.load_inp_addr_offset)) + (self.load_inp_addr_offset)) self.macro_defs.append("-DVTA_LOAD_WGT_ADDR_OFFSET=%s" % \ - (self.load_wgt_addr_offset)) + (self.load_wgt_addr_offset)) self.macro_defs.append("-DVTA_COMPUTE_DONE_WR_OFFSET=%s" % \ - (self.compute_done_wr_offet)) + (self.compute_done_wr_offet)) self.macro_defs.append("-DVTA_COMPUTE_DONE_RD_OFFSET=%s" % \ - (self.compute_done_rd_offet)) + (self.compute_done_rd_offet)) self.macro_defs.append("-DVTA_COMPUTE_UOP_ADDR_OFFSET=%s" % \ - (self.compute_uop_addr_offset)) + (self.compute_uop_addr_offset)) self.macro_defs.append("-DVTA_COMPUTE_BIAS_ADDR_OFFSET=%s" % \ - (self.compute_bias_addr_offset)) + (self.compute_bias_addr_offset)) self.macro_defs.append("-DVTA_STORE_OUT_ADDR_OFFSET=%s" % \ - (self.store_out_addr_offset)) + (self.store_out_addr_offset)) # Coherency if coherent: self.macro_defs.append("-DVTA_COHERENT_ACCESSES=true") diff --git a/vta/python/vta/top/bitpack.py b/vta/python/vta/top/bitpack.py index 6e9d57bc0001..7a0710053b87 100644 --- a/vta/python/vta/top/bitpack.py +++ b/vta/python/vta/top/bitpack.py @@ -20,6 +20,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te from topi import util from tvm.relay.op.op import register_compute, register_injective_schedule @@ -59,7 +60,7 @@ def bitpack(data, bits, pack_type="int8", name="bitpack"): def _bitpack(*indices): ret = None - mask = tvm.const((1 << bits) - 1, pack_type) + mask = tvm.tir.const((1 << bits) - 1, pack_type) for k in range(lanes): idx = list(indices) idx[-1] = idx[-1] * lanes + k @@ -67,11 +68,11 @@ def 
_bitpack(*indices): if k == 0: ret = elem & mask else: - val = (elem & mask) << tvm.const(k * bits, pack_type) + val = (elem & mask) << tvm.tir.const(k * bits, pack_type) ret = ret | val return ret - return tvm.compute( + return te.compute( oshape, _bitpack, name=name, tag='bitpack') diff --git a/vta/python/vta/top/op.py b/vta/python/vta/top/op.py index 04e14b1e2bdd..2198ed4c191f 100644 --- a/vta/python/vta/top/op.py +++ b/vta/python/vta/top/op.py @@ -19,6 +19,7 @@ from __future__ import absolute_import as _abs import tvm +from tvm import te import topi from tvm.relay.op import op as reg @@ -42,13 +43,13 @@ def compute_clip_vta(attrs, inputs, output_type): x = inputs[0] a_min = attrs.a_min a_max = attrs.a_max - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - with tvm.tag_scope(topi.tag.ELEMWISE): - x = tvm.compute( - x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute( - x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + with tvm.te.tag_scope(topi.tag.ELEMWISE): + x = te.compute( + x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute( + x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return [x] def clip_strategy_vta(attrs, inputs, out_type, target): diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index ba93b05ca232..5b23ddeba1c1 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi @@ -44,14 +45,14 @@ def conv2d_packed(cfg, data, kernel, strides, padding, dilation, layout, out_dty ishape = topi.util.get_const_tuple(data.shape) kshape = topi.util.get_const_tuple(kernel.shape) - d_i = tvm.reduce_axis((0, kshape[2]), name='d_i') - d_j = tvm.reduce_axis((0, kshape[3]), name='d_j') - k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') - k_i = tvm.reduce_axis((0, ishape[-1]), name='k_i') + d_i = te.reduce_axis((0, kshape[2]), name='d_i') + d_j = te.reduce_axis((0, kshape[3]), name='d_j') + k_o = te.reduce_axis((0, ishape[1]), name='k_o') + k_i = te.reduce_axis((0, ishape[-1]), name='k_i') hstride, wstride = strides - res = tvm.compute( + res = te.compute( oshape, - lambda b_o, c_o, i, j, b_i, c_i: tvm.sum( + lambda b_o, c_o, i, j, b_i, c_i: te.sum( pad_data[b_o, k_o, i*hstride+d_i, j*wstride+d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), axis=[k_o, d_i, d_j, k_i]), @@ -81,7 +82,7 @@ def _traverse(op): else: ewise_ops.append(op) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) else: _traverse(tensor.op) @@ -92,7 +93,7 @@ def _traverse(op): _traverse(output.op) assert len(conv2d_res) == 1 conv2d_stage = conv2d_res[0].output(0) - s = tvm.create_schedule(output.op) + s = te.create_schedule(output.op) ##### space definition begin ##### b, c_o, x_i, x_j, _, _ = s[conv2d_stage].op.axis @@ -107,7 +108,7 @@ def _traverse(op): ###### space definition end ###### data, kernel = conv2d_stage.op.input_tensors - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] pad_data = data data = temp @@ -160,13 +161,13 @@ def _traverse(op): if cfg['oc_nthread'].val > 1: _, v_t = 
s[output].split(x_co0, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) # virtual threading along spatial rows if cfg['h_nthread'].val > 1: _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis diff --git a/vta/python/vta/top/vta_conv2d_transpose.py b/vta/python/vta/top/vta_conv2d_transpose.py index a3fd7ac92cd3..4f213f64d0da 100644 --- a/vta/python/vta/top/vta_conv2d_transpose.py +++ b/vta/python/vta/top/vta_conv2d_transpose.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi from topi.util import get_const_tuple @@ -52,14 +53,14 @@ def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype): out_h = (i_h - 1) * stride_h - fpad_top - fpad_bottom + k_h out_w = (i_w - 1) * stride_w - fpad_left - fpad_right + k_w oshape = (b, c_o, out_h, out_w, t_b, t_co) - d_c = tvm.reduce_axis((0, c_i), name='d_c') - d_h = tvm.reduce_axis((0, k_h), name='d_h') - d_w = tvm.reduce_axis((0, k_w), name='d_w') - d_ci = tvm.reduce_axis((0, t_ci), name='d_ci') + d_c = te.reduce_axis((0, c_i), name='d_c') + d_h = te.reduce_axis((0, k_h), name='d_h') + d_w = te.reduce_axis((0, k_w), name='d_w') + d_ci = te.reduce_axis((0, t_ci), name='d_ci') - out = tvm.compute( + out = te.compute( oshape, - lambda i_n, i_c, i_h, i_w, j_n, j_c: tvm.sum( + lambda i_n, i_c, i_h, i_w, j_n, j_c: te.sum( data_pad(i_n, d_c, i_h + d_h, i_w + d_w, j_n, d_ci).astype(out_dtype) * kernel[i_c, d_c, d_h, d_w, j_c, d_ci].astype(out_dtype), axis=[d_c, d_h, d_w, d_ci]), @@ -87,7 +88,7 @@ def _traverse(op): if not op.same_as(output.op): ewise_ops.append(op) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) else: _traverse(tensor.op) @@ -98,7 +99,7 @@ def _traverse(op): _traverse(output.op) assert len(conv2d_res) == 1 conv2d_stage = conv2d_res[0].output(0) - s = tvm.create_schedule(output.op) + s = te.create_schedule(output.op) ##### space definition begin ##### b, c_o, x_i, x_j, _, c_i = s[conv2d_stage].op.axis @@ -113,7 +114,7 @@ def _traverse(op): ###### space definition end ###### data, kernel = conv2d_stage.op.input_tensors - if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] pad_data = data data = temp @@ -162,13 +163,13 @@ def _traverse(op): if cfg['oc_nthread'].val > 1: _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) # virtual threading along spatial rows if cfg['h_nthread'].val > 1: _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis diff --git a/vta/python/vta/top/vta_dense.py b/vta/python/vta/top/vta_dense.py index e23910447ba8..912f41f30dfb 100644 --- a/vta/python/vta/top/vta_dense.py +++ 
b/vta/python/vta/top/vta_dense.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi @@ -48,11 +49,11 @@ def dense_packed(cfg, data, weight, bias=None, out_dtype=None): # Reduction axes (input channel) assert ishape[1] == wshape[1] assert ishape[3] == wshape[3] - k_o = tvm.reduce_axis((0, ishape[1]), name='k_o') - k_i = tvm.reduce_axis((0, ishape[3]), name='k_i') - res = tvm.compute( + k_o = te.reduce_axis((0, ishape[1]), name='k_o') + k_i = te.reduce_axis((0, ishape[3]), name='k_i') + res = te.compute( oshape, - lambda b_o, c_o, b_i, c_i: tvm.sum( + lambda b_o, c_o, b_i, c_i: te.sum( data[b_o, k_o, b_i, k_i].astype(out_dtype) * weight[c_o, k_o, c_i, k_i].astype(out_dtype), axis=[k_o, k_i]), @@ -83,7 +84,7 @@ def _traverse(op): else: ewise_ops.append(op) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) else: _traverse(tensor.op) @@ -94,7 +95,7 @@ def _traverse(op): _traverse(output.op) assert len(dense_res) == 1 dense_stage = dense_res[0].output(0) - s = tvm.create_schedule(output.op) + s = te.create_schedule(output.op) ##### space definition begin ##### b, c_o, _, _ = s[dense_stage].op.axis @@ -147,7 +148,7 @@ def _traverse(op): if cfg['oc_nthread'].val > 1: _, v_t = s[output].split(x_co, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) x_bo, x_co, x_bi, _ = s[dense_stage].op.axis k_o, _ = s[dense_stage].op.reduce_axis diff --git a/vta/python/vta/top/vta_group_conv2d.py b/vta/python/vta/top/vta_group_conv2d.py index aa06c61c3ec0..d470fb77038b 100644 --- a/vta/python/vta/top/vta_group_conv2d.py +++ b/vta/python/vta/top/vta_group_conv2d.py @@ -19,6 +19,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm import topi @@ -54,14 +55,14 @@ def group_conv2d_packed(cfg, kshape = topi.util.get_const_tuple(kernel.shape) assert group * kshape[1] == ishape[1] assert kshape[0] % group == 0 - d_i = tvm.reduce_axis((0, kshape[2]), name='d_i') - d_j = tvm.reduce_axis((0, kshape[3]), name='d_j') - k_o = tvm.reduce_axis((0, kshape[1]), name='k_o') - k_i = tvm.reduce_axis((0, kshape[-1]), name='k_i') + d_i = te.reduce_axis((0, kshape[2]), name='d_i') + d_j = te.reduce_axis((0, kshape[3]), name='d_j') + k_o = te.reduce_axis((0, kshape[1]), name='k_o') + k_i = te.reduce_axis((0, kshape[-1]), name='k_i') hstride, wstride = strides - out = tvm.compute( + out = te.compute( oshape, - lambda b_o, c_o, i, j, b_i, c_i: tvm.sum( + lambda b_o, c_o, i, j, b_i, c_i: te.sum( pad_data[b_o, c_o // (kshape[0] // group) * kshape[1] + k_o, i * hstride + d_i, j * wstride + d_j, b_i, k_i].astype(out_dtype) * kernel[c_o, k_o, d_i, d_j, c_i, k_i].astype(out_dtype), @@ -95,7 +96,7 @@ def _traverse(op): else: ewise_ops.append(op) for tensor in op.input_tensors: - if isinstance(tensor.op, tvm.tensor.PlaceholderOp): + if isinstance(tensor.op, tvm.te.PlaceholderOp): ewise_inputs.append((op, tensor)) else: _traverse(tensor.op) @@ -106,7 +107,7 @@ def _traverse(op): _traverse(output.op) assert len(conv2d_res) == 1 conv2d_stage = conv2d_res[0].output(0) - s = tvm.create_schedule(output.op) + s = te.create_schedule(output.op) ##### space definition begin ##### b, c_o, x_i, x_j, _, _ = s[conv2d_stage].op.axis @@ -121,7 +122,7 @@ def _traverse(op): ###### space definition end ###### data, kernel = conv2d_stage.op.input_tensors - if 
isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag: + if isinstance(data.op, tvm.te.ComputeOp) and "pad" in data.op.tag: temp = data.op.input_tensors[0] pad_data = data data = temp @@ -174,13 +175,13 @@ def _traverse(op): if cfg['oc_nthread'].val > 1: _, v_t = s[output].split(x_co0, factor=cfg['oc_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) # virtual threading along spatial rows if cfg['h_nthread'].val > 1: _, v_t = s[output].split(x_i0, factor=cfg['h_nthread'].val) s[output].reorder(v_t, x_bo) - s[output].bind(v_t, tvm.thread_axis("cthread")) + s[output].bind(v_t, te.thread_axis("cthread")) x_bo, x_co, x_i, x_j, x_bi, x_ci = s[conv2d_stage].op.axis k_o, d_i, d_j, k_i = s[conv2d_stage].op.reduce_axis diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 265a6392b054..6d0b5d435b3b 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -22,6 +22,7 @@ import os import tvm +from tvm import te from tvm import autotvm import topi import vta @@ -48,13 +49,13 @@ ('resnet-18.C11', Workload(env.BATCH, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)), ] -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation): @@ -62,9 +63,9 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation): kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) bias_shape = (N//env.BATCH, CO//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) with tvm.target.vta(): res = topi.nn.conv2d( @@ -83,7 +84,7 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation): if tvm.target.Target.current().device_name == 'vta': s = topi.generic.schedule_conv2d_nchw([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [data, kernel, bias, res] diff --git a/vta/scripts/tune_conv2d_transpose.py b/vta/scripts/tune_conv2d_transpose.py index d6475abff667..087136797c5a 100644 --- a/vta/scripts/tune_conv2d_transpose.py +++ b/vta/scripts/tune_conv2d_transpose.py @@ -22,6 +22,7 @@ import os import tvm +from tvm import te from tvm import autotvm import topi import vta @@ -41,21 +42,21 @@ ('DCGAN.CT3', Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2)), ] -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two 
stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding): data_shape = (N//env.BATCH, CI//env.BLOCK_IN, H, W, env.BATCH, env.BLOCK_IN) kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) with tvm.target.vta(): res = topi.nn.conv2d_transpose_nchw( @@ -71,7 +72,7 @@ def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding): if tvm.target.Target.current().device_name == 'vta': s = topi.generic.schedule_conv2d_transpose_nchw([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [data, kernel, res] diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py index fa49be7f9b27..e54de1d4ea70 100644 --- a/vta/scripts/tune_dense.py +++ b/vta/scripts/tune_dense.py @@ -22,6 +22,7 @@ import os import tvm +from tvm import te from tvm import autotvm import topi import vta @@ -37,21 +38,21 @@ ('lstm.dense.4', Workload(4, 256, 128)), ] -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def dense(N, CI, CO): data_shape = (N//env.BATCH, CI//env.BLOCK_IN, env.BATCH, env.BLOCK_IN) kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN) - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) with tvm.target.vta(): res = topi.nn.dense(data, kernel, None, 'int32') @@ -62,7 +63,7 @@ def dense(N, CI, CO): if tvm.target.Target.current().device_name == 'vta': s = topi.generic.schedule_dense([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [data, kernel, res] diff --git a/vta/scripts/tune_group_conv2d.py b/vta/scripts/tune_group_conv2d.py index 555154d708fc..72f9525320ef 100644 --- a/vta/scripts/tune_group_conv2d.py +++ b/vta/scripts/tune_group_conv2d.py @@ -22,6 +22,7 @@ import os import tvm +from tvm import te from tvm import autotvm import topi import vta @@ -46,13 +47,13 @@ ('mobilenet.D9', Workload(env.BATCH, 7, 7, 
1024, 1024, 64, 3, 3, 1, 1, 1, 1)), ] -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, group): @@ -62,9 +63,9 @@ def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, group): kernel_shape = (CO//env.BLOCK_OUT, CI_G//env.BLOCK_IN, KH, KW, env.BLOCK_OUT, env.BLOCK_IN) bias_shape = (N//env.BATCH, CO//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT) - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) with tvm.target.vta(): res = topi.nn.group_conv2d_nchw( @@ -83,7 +84,7 @@ def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, group): if tvm.target.Target.current().device_name == 'vta': s = topi.generic.schedule_group_conv2d_nchw([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [data, kernel, bias, res] diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index cf6f42654e6e..1de35c024203 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -24,6 +24,7 @@ import topi import tvm +from tvm import te from tvm import rpc, autotvm, relay from tvm.autotvm.measure.measure_methods import request_remote from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner @@ -60,13 +61,13 @@ def parse_arguments(): def register_vta_tuning_tasks(): from tvm.autotvm.task.topi_integration import TaskExtractEnv, deserialize_args - @tvm.tag_scope(tag=topi.tag.ELEMWISE) + @tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x # init autotvm env to register VTA operator @@ -87,7 +88,7 @@ def _topi_nn_conv2d(*args, **kwargs): if tvm.target.Target.current().device_name == 'vta': s = topi.generic.schedule_conv2d_nchw([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [A, W, res] @autotvm.task.register("topi_nn_dense", override=True) @@ -105,7 +106,7 @@ def _topi_nn_dense(*args, **kwargs): if 
tvm.target.Target.current().device_name == 'vta': s = topi.generic.schedule_dense([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [A, W, res] diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py index d4eed91aa1c7..e023c2017acf 100644 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ b/vta/tests/python/integration/test_benchmark_gemm.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. import tvm +from tvm import te import numpy as np from tvm.contrib import util import vta.testing @@ -38,37 +39,37 @@ def run_gemm_packed(env, remote, batch_size, channel, block): # To compute number of ops, use a x2 factor for FMA num_ops = 2 * channel * channel * batch_size - ko = tvm.reduce_axis((0, channel // env.BLOCK_IN), name='ko') - ki = tvm.reduce_axis((0, env.BLOCK_IN), name='ki') + ko = te.reduce_axis((0, channel // env.BLOCK_IN), name='ko') + ki = te.reduce_axis((0, env.BLOCK_IN), name='ki') - data = tvm.placeholder(data_shape, + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) - weight = tvm.placeholder(weight_shape, + weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype) - data_buf = tvm.compute(data_shape, + data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf") - weight_buf = tvm.compute(weight_shape, + weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf") - res_gem = tvm.compute(res_shape, - lambda bo, co, bi, ci: tvm.sum( + res_gem = te.compute(res_shape, + lambda bo, co, bi, ci: te.sum( data_buf[bo, ko, bi, ki].astype(env.acc_dtype) * weight_buf[co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki]), name="res_gem") - res_shf = tvm.compute(res_shape, + res_shf = te.compute(res_shape, lambda *i: res_gem(*i)>>8, name="res_shf") - res_max = tvm.compute(res_shape, - lambda *i: tvm.max(res_shf(*i), 0), + res_max = te.compute(res_shape, + lambda *i: tvm.te.max(res_shf(*i), 0), "res_max") #relu - res_min = tvm.compute(res_shape, - lambda *i: tvm.min(res_max(*i), (1<<(env.INP_WIDTH-1))-1), + res_min = te.compute(res_shape, + lambda *i: tvm.te.min(res_max(*i), (1<<(env.INP_WIDTH-1))-1), "res_min") #relu - res = tvm.compute(res_shape, + res = te.compute(res_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") @@ -128,7 +129,7 @@ def run_schedule(load_inp, store_out, print_ir, check_correctness): - s = tvm.create_schedule(res.op) + s = te.create_schedule(res.op) s[data_buf].set_scope(env.inp_scope) s[weight_buf].set_scope(env.wgt_scope) s[res_gem].set_scope(env.acc_scope) diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py index 6935e4794c4e..b3c36e85d56b 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py @@ -25,6 +25,7 @@ from collections import namedtuple import tvm +from tvm import te from tvm import relay from tvm import autotvm from tvm.contrib import util @@ -61,13 +62,13 @@ ] # FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), 
name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def run_conv2d(env, remote, wl, target, @@ -104,9 +105,9 @@ def run_conv2d(env, remote, wl, target, data_shape = a_shape kernel_shape = w_shape bias_shape = b_shape - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py index 2d96a7313480..90cc21fc8405 100644 --- a/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py +++ b/vta/tests/python/integration/test_benchmark_topi_conv2d_transpose.py @@ -25,6 +25,7 @@ from collections import namedtuple import tvm +from tvm import te from tvm import relay from tvm import autotvm from tvm.contrib import util @@ -53,13 +54,13 @@ ] # FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x # Helper function to get factors @@ -102,8 +103,8 @@ def run_conv2d_transpose(env, remote, wl, target, else: data_shape = a_shape kernel_shape = w_shape - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule diff --git a/vta/tests/python/integration/test_benchmark_topi_dense.py b/vta/tests/python/integration/test_benchmark_topi_dense.py index a0acdc34acef..95c491a6d723 100644 --- a/vta/tests/python/integration/test_benchmark_topi_dense.py +++ b/vta/tests/python/integration/test_benchmark_topi_dense.py @@ -24,6 +24,7 @@ import numpy as np import tvm +from tvm import te from tvm import autotvm from tvm.contrib import util from tvm.contrib.pickle_memoize import memoize @@ -35,13 +36,13 @@ from vta.testing import simulator # FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def 
my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def run_gemm(env, remote, target, @@ -70,8 +71,8 @@ def run_gemm(env, remote, target, kernel_shape = w_shape fcompute = topi.x86.dense_nopack fschedule = topi.x86.schedule_dense_nopack - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) # Define base computation schedule with target: diff --git a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py index 31fef4923328..1d5838ce8cda 100644 --- a/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py +++ b/vta/tests/python/integration/test_benchmark_topi_group_conv2d.py @@ -25,6 +25,7 @@ from collections import namedtuple import tvm +from tvm import te from tvm import relay from tvm import autotvm from tvm.contrib import util @@ -57,13 +58,13 @@ ] # FIXME: we need a custom clip operator to circumvent a pattern detection limitation -@tvm.tag_scope(tag=topi.tag.ELEMWISE) +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x def run_group_conv2d(env, remote, wl, target, @@ -101,9 +102,9 @@ def run_group_conv2d(env, remote, wl, target, data_shape = a_shape kernel_shape = w_shape bias_shape = b_shape - data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) - kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) - bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) + data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) + kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) + bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype) padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad)) # Define base computation schedule diff --git a/vta/tests/python/pynq/test_program_rpc.py b/vta/tests/python/pynq/test_program_rpc.py index 2d8da5acc16a..fb0873586d44 100644 --- a/vta/tests/python/pynq/test_program_rpc.py +++ b/vta/tests/python/pynq/test_program_rpc.py @@ -16,6 +16,7 @@ # under the License. 
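[Editor's illustration — not part of the patch. Every my_clip hunk in the tuning and benchmark scripts above applies the same mechanical rename: constants move from tvm.const to tvm.tir.const, tensor declarations from tvm.placeholder/tvm.compute to te.placeholder/te.compute, and min/max to tvm.te.min/tvm.te.max. A minimal self-contained sketch of the migrated two-stage clip, with an illustrative shape and dtype not taken from any script in this series:

    import tvm
    from tvm import te

    def my_clip(x, a_min, a_max):
        # put min and max into two stages, as the scripts above do
        const_min = tvm.tir.const(a_min, x.dtype)
        const_max = tvm.tir.const(a_max, x.dtype)
        x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
        x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
        return x

    a = te.placeholder((8, 8), name="a", dtype="int32")
    b = my_clip(a, 0, 127)
    s = te.create_schedule(b.op)
    print(tvm.lower(s, [a, b], simple_mode=True))
]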
import os import tvm +from tvm import te from tvm import rpc from vta import get_bitstream_path, download_bitstream, program_fpga, reconfig_runtime diff --git a/vta/tests/python/unittest/test_vta_insn.py b/vta/tests/python/unittest/test_vta_insn.py index ef3c45ce58d6..c76636a4d242 100644 --- a/vta/tests/python/unittest/test_vta_insn.py +++ b/vta/tests/python/unittest/test_vta_insn.py @@ -16,6 +16,7 @@ # under the License. """Unit test VTA's instructions """ import tvm +from tvm import te import numpy as np import topi from tvm.contrib import util @@ -30,22 +31,22 @@ def test_save_load_out(): """Test save/store output command""" def _run(env, remote): n = 6 - x = tvm.placeholder( + x = te.placeholder( (n, n, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype) - x_buf = tvm.compute( + x_buf = te.compute( (n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x(*i), "x_buf") # insert no-op that won't be optimized away - y_buf = tvm.compute( + y_buf = te.compute( (n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x_buf(*i)>>0, "y_buf") - y = tvm.compute( + y = te.compute( (n, n, env.BATCH, env.BLOCK_OUT), lambda *i: y_buf(*i).astype(env.inp_dtype), "y") # schedule - s = tvm.create_schedule(y.op) + s = te.create_schedule(y.op) s[x_buf].set_scope(env.acc_scope) s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy) s[y_buf].set_scope(env.acc_scope) @@ -93,22 +94,22 @@ def check_padded_load(pad_before, pad_after, test_name=None): # declare n = 3 m = 5 - x = tvm.placeholder( + x = te.placeholder( (n, m, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype) x_buf = topi.nn.pad(x, pad_before, pad_after, name="y") # insert no-op that won't be optimized away - y_buf = tvm.compute((n + pad_before[0] + pad_after[0], + y_buf = te.compute((n + pad_before[0] + pad_after[0], m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT), lambda *i: x_buf(*i)>>0, "y_buf") - y = tvm.compute((n + pad_before[0] + pad_after[0], + y = te.compute((n + pad_before[0] + pad_after[0], m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT), lambda *i: y_buf(*i).astype(env.inp_dtype), "y") # schedule - s = tvm.create_schedule(y.op) + s = te.create_schedule(y.op) s[x_buf].set_scope(env.acc_scope) s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy) s[y_buf].set_scope(env.acc_scope) @@ -167,32 +168,32 @@ def _run(env, remote): o = 4 n = 1 m = 4 - x = tvm.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype) - w = tvm.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype) - x_buf = tvm.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf") - w_buf = tvm.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf") - ko = tvm.reduce_axis((0, n), name="ko") - ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki") - y_gem = tvm.compute( + x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype) + w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype) + x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf") + w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf") + ko = te.reduce_axis((0, n), name="ko") + ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") + y_gem = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda bo, co, bi, ci: - tvm.sum(x_buf[bo, ko, bi, ki].astype(env.acc_dtype) * + te.sum(x_buf[bo, ko, bi, ki].astype(env.acc_dtype) * w_buf[co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki]), name="y_gem") - y_shf = tvm.compute( + y_shf = te.compute( (o, m, 
env.BATCH, env.BLOCK_OUT), lambda *i: y_gem(*i)>>8, name="y_shf") - y_max = tvm.compute( + y_max = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm.max(y_shf(*i), 0), + lambda *i: tvm.te.max(y_shf(*i), 0), "y_max") #relu - y_min = tvm.compute( + y_min = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm.min(y_max(*i), (1<<(env.INP_WIDTH-1))-1), + lambda *i: tvm.te.min(y_max(*i), (1<<(env.INP_WIDTH-1))-1), "y_min") #relu - y = tvm.compute( + y = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_min(*i).astype(env.inp_dtype), name="y") @@ -240,7 +241,7 @@ def verify(s, name=None): def test_schedule1(): # default schedule with no smt - s = tvm.create_schedule(y.op) + s = te.create_schedule(y.op) # set the scope of the SRAM buffers s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) @@ -270,7 +271,7 @@ def test_schedule1(): def test_smt(): # test smt schedule - s = tvm.create_schedule(y.op) + s = te.create_schedule(y.op) s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) @@ -279,7 +280,7 @@ def test_smt(): s[y_min].set_scope(env.acc_scope) abo, aco, abi, aci = s[y].op.axis abo1, abo2 = s[y].split(abo, nparts=2) - s[y].bind(abo1, tvm.thread_axis("cthread")) + s[y].bind(abo1, te.thread_axis("cthread")) s[y_gem].compute_at(s[y], abo1) s[y_shf].compute_at(s[y], abo1) s[y_max].compute_at(s[y], abo1) @@ -315,38 +316,38 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): n = 8 imm = np.random.randint(1,5) # compute - a = tvm.placeholder( + a = te.placeholder( (m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) - a_buf = tvm.compute( + a_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf") #DRAM->SRAM if use_imm: - res_buf = tvm.compute( + res_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf") #compute else: - b = tvm.placeholder( + b = te.placeholder( (m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype) - b_buf = tvm.compute( + b_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf") #DRAM->SRAM - res_buf = tvm.compute( + res_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), b_buf(*i)), "res_buf") #compute5B - res = tvm.compute( + res = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_buf(*i).astype(env.inp_dtype), "res") #SRAM->DRAM # schedule - s = tvm.create_schedule(res.op) + s = te.create_schedule(res.op) s[a_buf].set_scope(env.acc_scope) # SRAM s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM s[res_buf].set_scope(env.acc_scope) # SRAM @@ -402,8 +403,8 @@ def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None): print("\t{:<16}: {:>16}".format(k, v)) check_alu(lambda x, y: x << y, np.left_shift, use_imm=True, test_name="SHL") - check_alu(tvm.max, np.maximum, use_imm=True, test_name="MAX") - check_alu(tvm.max, np.maximum, test_name="MAX") + check_alu(tvm.te.max, np.maximum, use_imm=True, test_name="MAX") + check_alu(tvm.te.max, np.maximum, test_name="MAX") check_alu(lambda x, y: x + y, use_imm=True, test_name="ADD") check_alu(lambda x, y: x + y, test_name="ADD") check_alu(lambda x, y: x >> y, np.right_shift, use_imm=True, test_name="SHR") @@ -417,28 +418,28 @@ def _run(env, remote): m = 8 n = 10 # compute - a = tvm.placeholder( + a = te.placeholder( (m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) - a_buf = tvm.compute( + a_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda 
*i: a(*i), "a_buf") # DRAM->SRAM - max_buf = tvm.compute( + max_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm.max(a_buf(*i), 0), + lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf") # relu - min_buf = tvm.compute( + min_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), - lambda *i: tvm.min(max_buf(*i), (1<<(env.INP_WIDTH-1))-1), + lambda *i: tvm.te.min(max_buf(*i), (1<<(env.INP_WIDTH-1))-1), "max_buf") # relu - res = tvm.compute( + res = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: min_buf(*i).astype(env.inp_dtype), "min_buf") # SRAM->DRAM # schedule - s = tvm.create_schedule(res.op) + s = te.create_schedule(res.op) s[a_buf].set_scope(env.acc_scope) # SRAM s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM s[max_buf].set_scope(env.acc_scope) # SRAM @@ -488,27 +489,27 @@ def _run(env, remote): imm_shift = np.random.randint(0,8) imm_scale = np.random.randint(1,5) # compute - a = tvm.placeholder( + a = te.placeholder( (m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype) - a_buf = tvm.compute( + a_buf = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf") # DRAM->SRAM - res_shift = tvm.compute( + res_shift = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a_buf(*i)+imm_shift, "res_shift") # compute - res_scale = tvm.compute( + res_scale = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_shift(*i)>>imm_scale, "res_scale") # compute - res = tvm.compute( + res = te.compute( (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_scale(*i).astype(env.inp_dtype), "res") # SRAM->DRAM # schedule - s = tvm.create_schedule(res.op) + s = te.create_schedule(res.op) s[a_buf].set_scope(env.acc_scope) # SRAM s[res_shift].set_scope(env.acc_scope) # SRAM s[res_scale].set_scope(env.acc_scope) # SRAM diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index a20b8ec8d3d3..16c8b3e1ad88 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -60,6 +60,7 @@ import topi import tvm +from tvm import te from tvm import rpc, autotvm, relay from tvm.contrib import graph_runtime, util, download from tvm.autotvm.measure.measure_methods import request_remote @@ -297,13 +298,13 @@ def tune_tasks(tasks, def register_vta_tuning_tasks(): from tvm.autotvm.task import TaskExtractEnv - @tvm.tag_scope(tag=topi.tag.ELEMWISE) + @tvm.te.tag_scope(tag=topi.tag.ELEMWISE) def my_clip(x, a_min, a_max): """Unlike topi's current clip, put min and max into two stages.""" - const_min = tvm.const(a_min, x.dtype) - const_max = tvm.const(a_max, x.dtype) - x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") - x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") + const_min = tvm.tir.const(a_min, x.dtype) + const_max = tvm.tir.const(a_max, x.dtype) + x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") + x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") return x # init autotvm env to register VTA operator @@ -323,7 +324,7 @@ def _topi_nn_conv2d(*args, **kwargs): if tvm.target.Target.current().device_name == 'vta': s = vta.top.schedule_conv2d_packed([res]) else: - s = tvm.create_schedule([res.op]) + s = te.create_schedule([res.op]) return s, [A, W, res] diff --git a/vta/tutorials/frontend/deploy_classification.py b/vta/tutorials/frontend/deploy_classification.py index df02b4842488..15cba4373056 100644 --- a/vta/tutorials/frontend/deploy_classification.py +++ 
b/vta/tutorials/frontend/deploy_classification.py @@ -50,6 +50,7 @@ from matplotlib import pyplot as plt import tvm +from tvm import te from tvm import rpc, autotvm, relay from tvm.contrib import graph_runtime, util, download from tvm.contrib.debugger import debug_runtime diff --git a/vta/tutorials/matrix_multiply.py b/vta/tutorials/matrix_multiply.py index 3e46b427baf6..444762684bb9 100644 --- a/vta/tutorials/matrix_multiply.py +++ b/vta/tutorials/matrix_multiply.py @@ -36,6 +36,7 @@ import os import tvm +from tvm import te import vta import numpy as np from tvm import rpc @@ -167,13 +168,13 @@ # Batch factor o (we use single batch inference) o = 1 # A placeholder tensor in tiled data format -A = tvm.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype) +A = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype) # B placeholder tensor in tiled data format -B = tvm.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype) +B = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype) # A copy buffer -A_buf = tvm.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf") +A_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf") # B copy buffer -B_buf = tvm.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf") +B_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf") ###################################################################### # Matrix Multiplication @@ -186,8 +187,8 @@ # In order to implement matrix multiplication, the lambda function needs to # include a reduction formula over the input channel dimension axes. # To create a reduction formula, we can declare a reduction axis using -# :code:`tvm.reduce_axis`, which takes in the range of reductions. -# :code:`tvm.sum` takes in the expression to be reduced as well as +# :code:`te.reduce_axis`, which takes in the range of reductions. +# :code:`te.sum` takes in the expression to be reduced as well as # the reduction axes to compute the sum of value over all k in the declared # ranges. # @@ -198,14 +199,14 @@ # the computation should be done. # Outer input feature reduction axis -ko = tvm.reduce_axis((0, n), name="ko") +ko = te.reduce_axis((0, n), name="ko") # Inner input feature reduction axis -ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki") +ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") # Describe the in-VTA matrix multiplication -C_buf = tvm.compute( +C_buf = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda bo, co, bi, ci: - tvm.sum(A_buf[bo, ko, bi, ki].astype(env.acc_dtype) * + te.sum(A_buf[bo, ko, bi, ki].astype(env.acc_dtype) * B_buf[co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki]), name="C_buf") @@ -234,7 +235,7 @@ # input activation data format. 
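[Editor's illustration — not part of the patch. Before the cast stage below, the te.reduce_axis/te.sum pairing used for C_buf above is easier to see on a plain 2-D example; this sketch uses made-up shapes rather than VTA's tiled layout:

    from tvm import te

    n = 16
    A = te.placeholder((n, n), name="A", dtype="int8")
    B = te.placeholder((n, n), name="B", dtype="int8")
    k = te.reduce_axis((0, n), name="k")  # the reduced (inner-product) axis
    # C[i, j] = sum over k of A[i, k] * B[j, k], accumulated in int32
    C = te.compute(
        (n, n),
        lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k),
        name="C")
]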
# Cast to output type, and send to main memory -C = tvm.compute( +C = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C") @@ -265,7 +266,7 @@ # :code:`C` in the following way: # Let's take a look at the generated schedule -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) ###################################################################### diff --git a/vta/tutorials/optimize/convolution_opt.py b/vta/tutorials/optimize/convolution_opt.py index e5cf8e595b15..2616fb28c89a 100644 --- a/vta/tutorials/optimize/convolution_opt.py +++ b/vta/tutorials/optimize/convolution_opt.py @@ -39,6 +39,7 @@ import os import tvm +from tvm import te import vta import numpy as np @@ -167,16 +168,16 @@ env.BLOCK_OUT) # Convolution reduction axes -dy = tvm.reduce_axis((0, kernel_h), name='dy') -dx = tvm.reduce_axis((0, kernel_w), name='dx') -ic = tvm.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic') -ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns') +dy = te.reduce_axis((0, kernel_h), name='dy') +dx = te.reduce_axis((0, kernel_w), name='dx') +ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic') +ic_tns = te.reduce_axis((0, env.BLOCK_IN), name='ic_tns') # Input placeholder tensors -data = tvm.placeholder(data_shape, +data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) -kernel = tvm.placeholder(kernel_shape, +kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) @@ -185,33 +186,33 @@ data_buf = topi.nn.pad(data, [0, 0, pad_h, pad_w, 0, 0], name="data_buf") -kernel_buf = tvm.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf") +kernel_buf = te.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf") # Declare 2D convolution -res_conv = tvm.compute( +res_conv = te.compute( output_shape, - lambda bo, co, i, j, bi, ci: tvm.sum( + lambda bo, co, i, j, bi, ci: te.sum( data_buf[bo, ic, i*stride_h+dy, j*stride_w+dx, bi, ic_tns].astype(env.acc_dtype) * kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype), axis=[ic, dy, dx, ic_tns]), name="res_conv") # Add shift stage for fix-point normalization -res_shr = tvm.compute(output_shape, +res_shr = te.compute(output_shape, lambda *i: res_conv(*i) >> 8, name="res_shr") # Apply clipping between (0, input max value) inp_max = (1 << (env.INP_WIDTH - 1)) - 1 -res_max = tvm.compute(output_shape, - lambda *i: tvm.max(res_shr(*i), 0), +res_max = te.compute(output_shape, + lambda *i: tvm.te.max(res_shr(*i), 0), "res_max") -res_min = tvm.compute(output_shape, - lambda *i: tvm.min(res_max(*i), inp_max), +res_min = te.compute(output_shape, + lambda *i: tvm.te.min(res_max(*i), inp_max), "res_min") # Result Tensor -res = tvm.compute(output_shape, +res = te.compute(output_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") @@ -228,7 +229,7 @@ # - Lowering to VTA hardware intrinsics # Create TVM schedule -s = tvm.create_schedule(res.op) +s = te.create_schedule(res.op) # Let's look at the default TVM schedule print(tvm.lower(s, [data, kernel, res], simple_mode=True)) @@ -306,7 +307,7 @@ # Perform virtual thread split along output channel outer axis _, tx = s[res].split(oc_out, factor=v_threads) s[res].reorder(tx, b_out) -s[res].bind(tx, tvm.thread_axis("cthread")) +s[res].bind(tx, te.thread_axis("cthread")) # Let's look at the current TVM schedule after blocking and virtual threading print(tvm.lower(s, [data, kernel, res], simple_mode=True)) diff --git a/vta/tutorials/optimize/matrix_multiply_opt.py 
b/vta/tutorials/optimize/matrix_multiply_opt.py index 2722af594c03..597a7e8ecf7f 100644 --- a/vta/tutorials/optimize/matrix_multiply_opt.py +++ b/vta/tutorials/optimize/matrix_multiply_opt.py @@ -39,6 +39,7 @@ import os import tvm +from tvm import te import vta import numpy as np from tvm import rpc @@ -119,45 +120,45 @@ num_ops = in_channels * out_channels * batch_size * 2 # Reduction axes -ic = tvm.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic') -ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns') +ic = te.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic') +ic_tns = te.reduce_axis((0, env.BLOCK_IN), name='ic_tns') # Input placeholder tensors -data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) -weight = tvm.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype) +data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) +weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype) # Copy buffers -data_buf = tvm.compute(data_shape, +data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf") -weight_buf = tvm.compute(weight_shape, +weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf") # Declare matrix multiply computation -res_gemm = tvm.compute(output_shape, - lambda bo, co, bi, ci: tvm.sum( +res_gemm = te.compute(output_shape, + lambda bo, co, bi, ci: te.sum( data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) * weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype), axis=[ic, ic_tns]), name="res_gem") # Add shift stage for fix-point normalization -res_shr = tvm.compute(output_shape, +res_shr = te.compute(output_shape, lambda *i: res_gemm(*i) >> env.INP_WIDTH, name="res_shr") # Apply clipping between (0, input max value) inp_max = (1<<(env.INP_WIDTH-1))-1 -res_max = tvm.compute(output_shape, - lambda *i: tvm.max(res_shr(*i), 0), +res_max = te.compute(output_shape, + lambda *i: tvm.te.max(res_shr(*i), 0), "res_max") -res_min = tvm.compute(output_shape, - lambda *i: tvm.min(res_max(*i), inp_max), +res_min = te.compute(output_shape, + lambda *i: tvm.te.min(res_max(*i), inp_max), "res_min") # Apply typecast to input data type before sending results back -res = tvm.compute(output_shape, +res = te.compute(output_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") @@ -173,7 +174,7 @@ # Create TVM schedule -s = tvm.create_schedule(res.op) +s = te.create_schedule(res.op) # Let's look at the default TVM schedule print(tvm.lower(s, [data, weight, res], simple_mode=True)) diff --git a/vta/tutorials/vta_get_started.py b/vta/tutorials/vta_get_started.py index dd305154a733..3dd1f8c8753a 100644 --- a/vta/tutorials/vta_get_started.py +++ b/vta/tutorials/vta_get_started.py @@ -36,6 +36,7 @@ import os import tvm +from tvm import te import vta import numpy as np @@ -137,9 +138,9 @@ # Batch factor o - total 1 x 1 = 1 o = 1 # A placeholder tensor in tiled data format -A = tvm.placeholder((o, m, env.BATCH, env.BLOCK_OUT), name="A", dtype=env.acc_dtype) +A = te.placeholder((o, m, env.BATCH, env.BLOCK_OUT), name="A", dtype=env.acc_dtype) # B placeholder tensor in tiled data format -B = tvm.placeholder((o, m, env.BATCH, env.BLOCK_OUT), name="B", dtype=env.acc_dtype) +B = te.placeholder((o, m, env.BATCH, env.BLOCK_OUT), name="B", dtype=env.acc_dtype) ###################################################################### # Copy Buffers @@ -158,9 +159,9 @@ # This can later be interpreted by the compiler as a cached read operation. 
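[Editor's illustration — not part of the patch. Nothing about the copy stage itself is special: it is an identity te.compute that the schedule later pins to on-chip memory and lowers to a DMA transfer, as the unit tests earlier in this series do with set_scope and the dma_copy pragma. A hedged sketch of that pairing, assuming a VTA environment (e.g. the simulator) is available:

    import vta
    from tvm import te

    env = vta.get_env()
    A = te.placeholder((1, 16, env.BATCH, env.BLOCK_OUT), name="A", dtype=env.acc_dtype)
    # identity stage: on its own this merely reads A(*i)
    A_buf = te.compute(A.shape, lambda *i: A(*i), "A_buf")

    s = te.create_schedule(A_buf.op)
    s[A_buf].set_scope(env.acc_scope)                   # pin the stage to on-chip SRAM
    s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)  # lower the copy to a DMA load
]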
# A copy buffer -A_buf = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: A(*i), "A_buf") +A_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: A(*i), "A_buf") # B copy buffer -B_buf = tvm.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: B(*i), "B_buf") +B_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: B(*i), "B_buf") ###################################################################### # Vector Addition @@ -174,7 +175,7 @@ # the computation should be done. # Describe the in-VTA vector addition -C_buf = tvm.compute( +C_buf = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype), name="C_buf") @@ -199,7 +200,7 @@ # input activation data format. # Cast to output type, and send to main memory -C = tvm.compute( +C = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C") @@ -231,7 +232,7 @@ # :code:`C` in the following way: # Let's take a look at the generated schedule -s = tvm.create_schedule(C.op) +s = te.create_schedule(C.op) print(tvm.lower(s, [A, B, C], simple_mode=True)) From 8c6a772365109216aa26b942666317f5624375a1 Mon Sep 17 00:00:00 2001 From: zhengdi Date: Fri, 28 Feb 2020 00:43:10 +0800 Subject: [PATCH 49/73] [RELAY] fix error message (#4945) --- python/tvm/ir/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/ir/transform.py b/python/tvm/ir/transform.py index 4519fb630c2a..a35feb30dccb 100644 --- a/python/tvm/ir/transform.py +++ b/python/tvm/ir/transform.py @@ -82,7 +82,7 @@ def __init__(self, elif isinstance(fallback_device, TVMContext): fallback_device = fallback_device.device_type if not isinstance(fallback_device, int): - raise TypeError("required_pass is expected to be the type of " + + raise TypeError("fallback_device is expected to be the type of " + "int/str/TVMContext.") required = list(required_pass) if required_pass else [] From 6b1136dd311190a70d22fd13751b4f64ac3e075a Mon Sep 17 00:00:00 2001 From: maheshambule <15611578+maheshambule@users.noreply.github.com> Date: Fri, 28 Feb 2020 01:10:30 +0530 Subject: [PATCH 50/73] [Frontend] [MXNet] make_loss operator support (#4930) * make_loss test case * mxnet frontend make_loss support * added comment for make_loss * pylint fix * Update mxnet.py --- python/tvm/relay/frontend/mxnet.py | 8 ++++++++ tests/python/frontend/mxnet/test_forward.py | 9 ++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 2787cd6d4647..8510adb8589c 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -644,6 +644,13 @@ def _mx_arange(inputs, attrs): return _op.arange(**new_attrs) +# pylint: disable=unused-argument +def _mx_make_loss(inputs, attrs): + # while doing inference make_loss does not have any effect + # and it should be mapped to identity + return inputs[0] + + def _mx_repeat(inputs, attrs): assert len(inputs) == 1 new_attrs = {} @@ -1822,6 +1829,7 @@ def _get_bias_requantize_scale(_inputs, _data_scale, _kernel_scale): "SoftmaxActivation" : _mx_softmax_activation, "LinearRegressionOutput" : _mx_linear_regression_output, "smooth_l1" : _mx_smooth_l1, + "make_loss" : _mx_make_loss, "_contrib_div_sqrt_dim": _mx_contrib_div_sqrt_dim, "one_hot" : _mx_one_hot, # vision diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index f676295b324d..b81fbab8423b 100644 --- 
a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -201,6 +201,12 @@ def test_forward_ones_like(): mx_sym = mx.sym.ones_like(data, dtype='float32') verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) +def test_forward_make_loss(): + data = mx.sym.var('data') + ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32') + mx_sym = mx.sym.make_loss((data-ones)**2/2, dtype='float32') + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) + def test_forward_zeros_like(): data = mx.sym.var('data') mx_sym = mx.sym.zeros_like(data, dtype='float32') @@ -996,4 +1002,5 @@ def verify(a_np, b_np): test_forward_one_hot() test_forward_convolution() test_forward_deconvolution() - test_forward_cond() \ No newline at end of file + test_forward_cond() + test_forward_make_loss() \ No newline at end of file From 81ff0613e15eb5bc4e2d02dc74c14c5b6403f46d Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 27 Feb 2020 12:38:54 -0800 Subject: [PATCH 51/73] Move Ops in relay.op.contrib.* (#4942) * move contrib * lint * address comment * address comment --- python/tvm/relay/frontend/mxnet.py | 2 +- python/tvm/relay/frontend/pytorch.py | 4 +- python/tvm/relay/op/__init__.py | 1 - python/tvm/relay/op/_tensor.py | 2 + python/tvm/relay/op/contrib/__init__.py | 2 - python/tvm/relay/op/contrib/_contrib.py | 36 -------- python/tvm/relay/op/contrib/_make.py | 20 ----- python/tvm/relay/op/contrib/contrib.py | 112 ------------------------ python/tvm/relay/op/nn/_nn.py | 10 +++ python/tvm/relay/op/nn/nn.py | 94 ++++++++++++++++++++ python/tvm/relay/op/tensor.py | 19 ++++ src/relay/op/nn/pooling.cc | 16 ++-- src/relay/op/tensor/unary.cc | 6 +- tests/python/relay/test_op_level10.py | 4 +- 14 files changed, 141 insertions(+), 187 deletions(-) delete mode 100644 python/tvm/relay/op/contrib/_contrib.py delete mode 100644 python/tvm/relay/op/contrib/_make.py diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 8510adb8589c..0020a63be625 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -313,7 +313,7 @@ def _pool2d(new_op, is_avg): def _mx_adaptive_avg_pooling(inputs, attrs): output_size = attrs.get_int_tuple("output_size", []) - return _op.contrib.adaptive_avg_pool2d(inputs[0], output_size) + return _op.nn.adaptive_avg_pool2d(inputs[0], output_size) def _mx_dropout(inputs, attrs): diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 0b766a17aa1b..edd6ad84ae3e 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -151,7 +151,7 @@ def _impl(inputs, input_types): data = inputs[0] output_size = _infer_shape(inputs[1]) - return _op.contrib.contrib.adaptive_avg_pool2d( + return _op.nn.adaptive_avg_pool2d( data, output_size=output_size) return _impl @@ -161,7 +161,7 @@ def _impl(inputs, input_types): data = inputs[0] output_size = _infer_shape(inputs[1]) - return _op.contrib.contrib.adaptive_max_pool2d( + return _op.nn.adaptive_max_pool2d( data, output_size=output_size) return _impl diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 7427c63a14c1..4a4823d7d23b 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -32,7 +32,6 @@ from . import memory from . import image from . import vision -from . import contrib from . 
import op_attrs diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 0fbbaef374df..9f0906b113ba 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -70,6 +70,8 @@ register_injective_schedule("right_shift") register_injective_schedule("left_shift") register_injective_schedule("shape_of") +register_injective_schedule("ndarray_size") + # zeros @register_compute("zeros") diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index 3159006486b3..c6e086aecd1d 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -17,5 +17,3 @@ # pylint: disable=wildcard-import """Neural network related operators.""" from __future__ import absolute_import as _abs -from .contrib import * -from . import _contrib diff --git a/python/tvm/relay/op/contrib/_contrib.py b/python/tvm/relay/op/contrib/_contrib.py deleted file mode 100644 index 3927cef69706..000000000000 --- a/python/tvm/relay/op/contrib/_contrib.py +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name, unused-argument -"""Backend compiler related feature registration""" -from __future__ import absolute_import - -from .. import op as reg -from .. import strategy -from ..op import OpPattern - - -# adaptive_max_pool2d -reg.register_schedule("contrib.adaptive_max_pool2d", strategy.schedule_adaptive_pool) -reg.register_pattern("contrib.adaptive_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) - - -# adaptive_avg_pool2d -reg.register_schedule("contrib.adaptive_avg_pool2d", strategy.schedule_adaptive_pool) -reg.register_pattern("contrib.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) - -# relay.contrib.ndarray_size -reg.register_injective_schedule("contrib.ndarray_size") diff --git a/python/tvm/relay/op/contrib/_make.py b/python/tvm/relay/op/contrib/_make.py deleted file mode 100644 index 9d3369ebe7b2..000000000000 --- a/python/tvm/relay/op/contrib/_make.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -"""Constructor APIs""" -import tvm._ffi - -tvm._ffi._init_api("relay.op.contrib._make", __name__) diff --git a/python/tvm/relay/op/contrib/contrib.py b/python/tvm/relay/op/contrib/contrib.py index 7114b7e712db..cb7e5d407e10 100644 --- a/python/tvm/relay/op/contrib/contrib.py +++ b/python/tvm/relay/op/contrib/contrib.py @@ -17,115 +17,3 @@ #pylint: disable=invalid-name, too-many-lines """Contrib operations.""" from __future__ import absolute_import as _abs -from . import _make - - -def adaptive_max_pool2d(data, - output_size=None, - layout="NCHW"): - r"""2D adaptive max pooling operator. This operator is experimental. - - This operator takes data as input and does 2D max value calculation - across each window represented by WxH. - - - In the default case, where the data_layout is `NCHW` - a data Tensor with shape `(batch_size, in_channels, height, width)`, - to produce an output Tensor with shape - (batch_size, in_channels, output_height, output_width). - - The pooling kernel and stride sizes are automatically chosen for - desired output sizes. - - For output_size: - If this argument is not provided, input height and width will be used - as output height and width. - - If a single integer is provided for output_size, the output size is - (N x C x output_size x output_size) for any input (NCHW). - - If a tuple of integers (height, width) are provided for output_size, - the output size is (N x C x height x width) for any input (NCHW). - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - output_size : tuple of int. optional - Output height and width. - - layout : str, optional - Layout of the input. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - output_size = [] or output_size - return _make.adaptive_max_pool2d(data, output_size, layout) - -def adaptive_avg_pool2d(data, - output_size=None, - layout="NCHW"): - r"""2D adaptive average pooling operator. This operator is experimental. - - This operator takes data as input and does 2D average value calculation - across each window represented by WxH. - - - In the default case, where the data_layout is `NCHW` - a data Tensor with shape `(batch_size, in_channels, height, width)`, - to produce an output Tensor with shape - (batch_size, in_channels, output_height, output_width). - - The pooling kernel and stride sizes are automatically chosen for - desired output sizes. - - For output_size: - If this argument is not provided, input height and width will be used - as output height and width. - - If a single integer is provided for output_size, the output size is - (N x C x output_size x output_size) for any input (NCHW). - - If a tuple of integers (height, width) are provided for output_size, - the output size is (N x C x height x width) for any input (NCHW). - - Parameters - ---------- - data : tvm.relay.Expr - The input data to the operator. - - output_size : tuple of int. optional - Output height and width. - - layout : str, optional - Layout of the input. - - Returns - ------- - result : tvm.relay.Expr - The computed result. - """ - output_size = [] or output_size - return _make.adaptive_avg_pool2d(data, output_size, layout) - -def ndarray_size(data, dtype="int32"): - """Get number of elements of input tensor. - - Parameters - ---------- - data : tvm.relay.Expr - The input tensor. - - dtype : str, optional - The target data type. 
-
-    Returns
-    -------
-    result : tvm.relay.Expr
-        The number of elements of input tensor.
-    """
-    return _make.ndarray_size(data, dtype)
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index a4fde283daad..c522ef907cbf 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -247,6 +247,16 @@ def legalize_conv2d_transpose(attrs, inputs, types):
 reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)

+# adaptive_max_pool2d
+reg.register_schedule("nn.adaptive_max_pool2d", strategy.schedule_adaptive_pool)
+reg.register_pattern("nn.adaptive_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# adaptive_avg_pool2d
+reg.register_schedule("nn.adaptive_avg_pool2d", strategy.schedule_adaptive_pool)
+reg.register_pattern("nn.adaptive_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
 # leaky_relu
 reg.register_broadcast_schedule("nn.leaky_relu")
 reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE)
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 9ecb5af8b551..30918a4183b1 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -2277,3 +2277,97 @@ def space_to_depth(data, block_size, layout='NCHW'):
         in_height / block_size, in_width / block_size]
     """
     return _make.space_to_depth(data, block_size, layout)
+
+
+def adaptive_max_pool2d(data,
+                        output_size=None,
+                        layout="NCHW"):
+    r"""2D adaptive max pooling operator. This operator is experimental.
+
+    This operator takes data as input and does 2D max value calculation
+    across each window represented by WxH.
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with shape
+    (batch_size, in_channels, output_height, output_width).
+
+    The pooling kernel and stride sizes are automatically chosen for
+    desired output sizes.
+
+    For output_size:
+    If this argument is not provided, input height and width will be used
+    as output height and width.
+
+    If a single integer is provided for output_size, the output size is
+    (N x C x output_size x output_size) for any input (NCHW).
+
+    If a tuple of integers (height, width) is provided for output_size,
+    the output size is (N x C x height x width) for any input (NCHW).
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    output_size : tuple of int, optional
+        Output height and width.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    output_size = output_size or []
+    return _make.adaptive_max_pool2d(data, output_size, layout)
+
+
+def adaptive_avg_pool2d(data,
+                        output_size=None,
+                        layout="NCHW"):
+    r"""2D adaptive average pooling operator. This operator is experimental.
+
+    This operator takes data as input and does 2D average value calculation
+    across each window represented by WxH.
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with shape
+    (batch_size, in_channels, output_height, output_width).
+
+    The pooling kernel and stride sizes are automatically chosen for
+    desired output sizes.
+
+    For output_size:
+    If this argument is not provided, input height and width will be used
+    as output height and width.
+
+    If a single integer is provided for output_size, the output size is
+    (N x C x output_size x output_size) for any input (NCHW).
+
+    If a tuple of integers (height, width) is provided for output_size,
+    the output size is (N x C x height x width) for any input (NCHW).
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    output_size : tuple of int, optional
+        Output height and width.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    output_size = output_size or []
+    return _make.adaptive_avg_pool2d(data, output_size, layout)
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 898038dea004..ada1f5e85cac 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -974,3 +974,22 @@ def shape_of(data, dtype="int32"):
         The shape tensor.
     """
     return _make.shape_of(data, dtype)
+
+
+def ndarray_size(data, dtype="int32"):
+    """Get number of elements of input tensor.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input tensor.
+
+    dtype : str, optional
+        The target data type.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The number of elements of input tensor.
+    """
+    return _make.ndarray_size(data, dtype)
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
index 77baae567ab6..f174882f8a3e 100644
--- a/src/relay/op/nn/pooling.cc
+++ b/src/relay/op/nn/pooling.cc
@@ -499,21 +499,21 @@ Array AdaptivePool2DCompute(const Attrs& attrs,
                      mode, layout.name()) };
 }

-// relay.contrib.adaptive_avg_pool2d
+// relay.nn.adaptive_avg_pool2d
 Expr MakeAdaptiveAvgPool2D(Expr data,
                            Array output_size,
                            std::string layout) {
   auto attrs = make_object();
   attrs->output_size = std::move(output_size);
   attrs->layout = std::move(layout);
-  static const Op& op = Op::Get("contrib.adaptive_avg_pool2d");
+  static const Op& op = Op::Get("nn.adaptive_avg_pool2d");
   return CallNode::make(op, {data}, Attrs(attrs), {});
 }

-TVM_REGISTER_GLOBAL("relay.op.contrib._make.adaptive_avg_pool2d")
+TVM_REGISTER_GLOBAL("relay.op.nn._make.adaptive_avg_pool2d")
.set_body_typed(MakeAdaptiveAvgPool2D);

-RELAY_REGISTER_OP("contrib.adaptive_avg_pool2d")
+RELAY_REGISTER_OP("nn.adaptive_avg_pool2d")
.describe(R"code(Adaptive average pooling operation for 2D data.

- **data**: This depends on the `layout` parameter. Input is 4D array of shape
@@ -538,21 +538,21 @@ RELAY_REGISTER_OP("contrib.adaptive_avg_pool2d")
.set_attr("FTVMCompute", AdaptivePool2DCompute);

-// relay.contrib.adaptive_max_pool2d
+// relay.nn.adaptive_max_pool2d
 Expr MakeAdaptiveMaxPool2D(Expr data,
                            Array output_size,
                            std::string layout) {
   auto attrs = make_object();
   attrs->output_size = std::move(output_size);
   attrs->layout = std::move(layout);
-  static const Op& op = Op::Get("contrib.adaptive_max_pool2d");
+  static const Op& op = Op::Get("nn.adaptive_max_pool2d");
   return CallNode::make(op, {data}, Attrs(attrs), {});
 }

-TVM_REGISTER_GLOBAL("relay.op.contrib._make.adaptive_max_pool2d")
+TVM_REGISTER_GLOBAL("relay.op.nn._make.adaptive_max_pool2d")
.set_body_typed(MakeAdaptiveMaxPool2D);

-RELAY_REGISTER_OP("contrib.adaptive_max_pool2d")
+RELAY_REGISTER_OP("nn.adaptive_max_pool2d")
.describe(R"code(Adaptive max pooling operation for 2D data.

- **data**: This depends on the `layout` parameter. 
Input is 4D array of shape
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
index caa6451542c9..2c7345865095 100644
--- a/src/relay/op/tensor/unary.cc
+++ b/src/relay/op/tensor/unary.cc
@@ -359,15 +359,15 @@ Array NdarraySizeCompute(const Attrs& attrs,
   return Array{topi::ndarray_size(inputs[0], param->dtype)};
 }

-TVM_REGISTER_GLOBAL("relay.op.contrib._make.ndarray_size")
+TVM_REGISTER_GLOBAL("relay.op._make.ndarray_size")
.set_body_typed([](Expr data, DataType dtype) {
   auto attrs = make_object();
   attrs->dtype = dtype;
-  static const Op& op = Op::Get("contrib.ndarray_size");
+  static const Op& op = Op::Get("ndarray_size");
   return CallNode::make(op, {data}, Attrs(attrs), {});
 });

-RELAY_REGISTER_OP("contrib.ndarray_size")
+RELAY_REGISTER_OP("ndarray_size")
.describe(R"code(Returns a tensor representing the number of elements of input tensor.
)code" TVM_ADD_FILELINE)
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 1e4be742ff25..2e6ed62a3014 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -335,7 +335,7 @@ def test_shape_of():
 def test_ndarray_size():
     def verify_ndarray_size(shape):
         x = relay.var("x", shape=shape)
-        func = relay.Function([x], relay.op.contrib.ndarray_size(x))
+        func = relay.Function([x], relay.op.ndarray_size(x))
         func = run_infer_type(func)

         x_data = np.random.uniform(size=shape).astype("float32")
@@ -374,7 +374,7 @@ def end_index(index, odim, idim):
                     l_sl = slice(l_start, l_end)
                     np_out[i, j, k, l] = np_op(np_data[i, j, k_sl, l_sl])

-    opfunc = relay.contrib.adaptive_avg_pool2d if pool_type == "avg" else relay.contrib.adaptive_max_pool2d
+    opfunc = relay.nn.adaptive_avg_pool2d if pool_type == "avg" else relay.nn.adaptive_max_pool2d
     x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
     y = opfunc(x, out_size, layout)
     func = relay.Function([x], y)

From 9db632fb75e4c7e505316baed7523117c7c4bf90 Mon Sep 17 00:00:00 2001
From: Jon Soifer
Date: Thu, 27 Feb 2020 13:32:16 -0800
Subject: [PATCH 52/73] [Runtime] Fix TVM_DLL_EXPORT_TYPED_FUNC to work on Windows (#4955)

* [Runtime] Fixed TVM_DLL_EXPORT_TYPED_FUNC to work on Windows

* fix style

Co-authored-by: Jon Soifer
---
 include/tvm/runtime/packed_func.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 1373f5f9cca1..7e37b2aade90 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -919,7 +919,12 @@ class TVMRetValue : public TVMPODValue_ {
       int* type_code,                                            \
       int num_args,                                              \
       TVMValue* out_value,                                       \
-      int* out_type_code) {                                      \
+      int* out_type_code);                                       \
+  int ExportName(TVMValue* args,                                 \
+                 int* type_code,                                 \
+                 int num_args,                                   \
+                 TVMValue* out_value,                            \
+                 int* out_type_code) {                           \
     try {                                                        \
       ::tvm::runtime::TVMRetValue rv;                            \
       Function(::tvm::runtime::TVMArgs(                          \

From 1dbdcfb5e0542dbe43a468ad31a71a674a13e1fa Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 27 Feb 2020 14:27:37 -0800
Subject: [PATCH 53/73] [DOCS] Sphinx -- Introduce alias detection. (#4954)

* [DOCS] Sphinx -- Introduce alias detection.

Background: some of our namespaces import functions from another
namespace. For example tvm.te imports most of the operators from
tvm.tir. Previously we manually excluded these aliases from the doc.
However, that means we cannot link them by the alias name.
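To make the situation concrete, here is a small illustrative snippet
(module layout as on mainline; the assert is only for exposition):

    from tvm import te, tir
    # tvm.te re-exports many tir operators, so both names bind one object
    assert te.exp is tir.exp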
This PR adds a Sphinx callback plugin to detect such aliases, and
creates a rubric block at the bottom of each alias's docstring:
`Alias of the original class`. It is done in a way so that we can
refer to the generated docs.

We also fixed a few docs errors.

* Fix most of the issues
---
 docs/api/python/index.rst                  |   1 -
 docs/api/python/relay/op.rst               |  39 +-------
 docs/api/python/te.rst                     |   6 --
 docs/api/python/tvm.rst                    |  76 ---------------
 docs/conf.py                               | 104 ++++++++++++++-----
 python/tvm/ir/base.py                      |  12 +--
 python/tvm/relay/op/transform.py           |  13 +--
 python/tvm/relay/op/vision/__init__.py     |   2 -
 python/tvm/relay/op/vision/multibox.py     |   1 -
 python/tvm/relay/op/vision/nms.py          |   1 -
 python/tvm/target/generic_func.py          |   2 -
 tutorials/language/intrin_math.py          |   4 +-
 vta/tutorials/frontend/deploy_detection.py |   2 +-
 13 files changed, 99 insertions(+), 164 deletions(-)
 delete mode 100644 docs/api/python/tvm.rst

diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst
index 796bba8c591c..f62a4b848650 100644
--- a/docs/api/python/index.rst
+++ b/docs/api/python/index.rst
@@ -21,7 +21,6 @@ Python API
.. toctree::
   :maxdepth: 2

-   tvm
   runtime
   ndarray
   error
diff --git a/docs/api/python/relay/op.rst b/docs/api/python/relay/op.rst
index 3d8460adcce7..898f103e89bf 100644
--- a/docs/api/python/relay/op.rst
+++ b/docs/api/python/relay/op.rst
@@ -18,38 +18,7 @@ tvm.relay.op
------------
.. automodule:: tvm.relay.op
-   :members:
-
-.. autofunction:: tvm.relay.op.Op
-
-.. autofunction:: tvm.relay.op.OpPattern
-
-.. autofunction:: tvm.relay.op.get
-
-.. autofunction:: tvm.relay.op.register
-
-.. autofunction:: tvm.relay.op.register_schedule
-
-.. autofunction:: tvm.relay.op.register_pattern
-
-.. autofunction:: tvm.relay.op.register_compute
-
-.. autofunction:: tvm.relay.op.register_gradient
-
-.. autofunction:: tvm.relay.op.register_alter_op_layout
-
-.. autofunction:: tvm.relay.op.schedule_injective
-
-.. autofunction:: tvm.relay.op.debug
-
-.. automodule:: tvm.relay.op.reduce
-   :members:
-
-.. automodule:: tvm.relay.op.tensor
-   :members:
-
-.. automodule:: tvm.relay.op.transform
-   :members:
-
-.. automodule:: tvm.relay.op.nn
-   :members:
+   :members:
+   :imported-members:
+   :exclude-members: Tuple
+   :autosummary:
diff --git a/docs/api/python/te.rst b/docs/api/python/te.rst
index 1f70c4d384bb..363dae675d84 100644
--- a/docs/api/python/te.rst
+++ b/docs/api/python/te.rst
@@ -22,10 +22,4 @@ tvm.te
.. automodule:: tvm.te
   :members:
   :imported-members:
-   :exclude-members:
-      any, all, min_value, max_value, trace,
-      exp, erf, tanh, sigmoid, log, cos, sin, atan, sqrt, rsqrt, floor, ceil,
-      trunc, abs, round, nearbyint, isnan, power, popcount, fmod, if_then_else,
-      div, indexdiv, indexmod, truncdiv, truncmod, floordiv, floormod,
-      comm_reducer, min, max, sum
   :autosummary:
diff --git a/docs/api/python/tvm.rst b/docs/api/python/tvm.rst
deleted file mode 100644
index 56f36130b4b4..000000000000
--- a/docs/api/python/tvm.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-..   http://www.apache.org/licenses/LICENSE-2.0
-
-.. 
Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -tvm ---- -The user facing API for computation declaration. - -.. autosummary:: - - tvm.var - tvm.size_var - tvm.const - tvm.convert - tvm.placeholder - tvm.compute - tvm.scan - tvm.extern - tvm.decl_buffer - tvm.reduce_axis - tvm.thread_axis - tvm.comm_reducer - tvm.sum - tvm.div - tvm.indexdiv - tvm.indexmod - tvm.truncdiv - tvm.truncmod - tvm.floordiv - tvm.floormod - tvm.min - tvm.max - tvm.tag_scope - tvm.exp - tvm.intrin - tvm.call_pure_extern - -.. autofunction:: tvm.var -.. autofunction:: tvm.size_var -.. autofunction:: tvm.const -.. autofunction:: tvm.convert -.. autofunction:: tvm.placeholder -.. autofunction:: tvm.compute -.. autofunction:: tvm.scan -.. autofunction:: tvm.extern -.. autofunction:: tvm.decl_buffer -.. autofunction:: tvm.reduce_axis -.. autofunction:: tvm.thread_axis -.. autofunction:: tvm.comm_reducer -.. autofunction:: tvm.sum -.. autofunction:: tvm.div -.. autofunction:: tvm.indexdiv -.. autofunction:: tvm.indexmod -.. autofunction:: tvm.truncdiv -.. autofunction:: tvm.truncmod -.. autofunction:: tvm.floordiv -.. autofunction:: tvm.floormod -.. autofunction:: tvm.min -.. autofunction:: tvm.max -.. autofunction:: tvm.tag_scope -.. autofunction:: tvm.exp -.. autofunction:: tvm.intrin -.. autofunction:: tvm.call_pure_extern diff --git a/docs/conf.py b/docs/conf.py index d882f75d83a7..95fd1c3c0467 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,6 +30,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. 
import sys +import inspect import os, subprocess import shlex import recommonmark @@ -183,19 +184,6 @@ author, 'manual'), ] -# hook for doxygen -def run_doxygen(folder): - """Run the doxygen make command in the designated folder.""" - try: - #retcode = subprocess.call("cd %s; make doc" % folder, shell=True) - retcode = subprocess.call("rm -rf _build/html/doxygen", shell=True) - retcode = subprocess.call("mkdir -p _build/html", shell=True) - retcode = subprocess.call("cp -rf doxygen/html _build/html/doxygen", shell=True) - if retcode < 0: - sys.stderr.write("doxygen terminated by signal %s" % (-retcode)) - except OSError as e: - sys.stderr.write("doxygen execution failed: %s" % e) - intersphinx_mapping = { 'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None), 'numpy': ('https://docs.scipy.org/doc/numpy/', None), @@ -224,18 +212,6 @@ def generate_doxygen_xml(app): """Run the doxygen make commands if we're on the ReadTheDocs server""" run_doxygen('..') -def setup(app): - # Add hook for building doxygen xml when needed - # no c++ API for now - app.connect("builder-inited", generate_doxygen_xml) - app.add_stylesheet('css/tvm_theme.css') - app.add_config_value('recommonmark_config', { - 'url_resolver': lambda url: github_doc_root + url, - 'auto_doc_ref': True - }, True) - app.add_transform(AutoStructify) - - sphinx_gallery_conf = { 'backreferences_dir': 'gen_modules/backreferences', 'doc_module': ('tvm', 'numpy'), @@ -255,3 +231,81 @@ def setup(app): autodoc_default_options = { 'member-order': 'bysource', } + +# hook for doxygen +def run_doxygen(folder): + """Run the doxygen make command in the designated folder.""" + try: + #retcode = subprocess.call("cd %s; make doc" % folder, shell=True) + retcode = subprocess.call("rm -rf _build/html/doxygen", shell=True) + retcode = subprocess.call("mkdir -p _build/html", shell=True) + retcode = subprocess.call("cp -rf doxygen/html _build/html/doxygen", shell=True) + if retcode < 0: + sys.stderr.write("doxygen terminated by signal %s" % (-retcode)) + except OSError as e: + sys.stderr.write("doxygen execution failed: %s" % e) + +# Maps the original namespace to list of potential modules +# that we can import alias from. +tvm_alias_check_map = { + "tvm.te": ["tvm.tir"], + "tvm.tir": ["tvm.ir", "tvm.runtime"], +} + +def update_alias_docstring(name, obj, lines): + """Update the docstring of alias functions. + + This function checks if the obj is an alias of another documented object + in a different module. + + If it is an alias, then it will append the alias information to the docstring. + + Parameters + ---------- + name : str + The full name of the object in the doc. + + obj : object + The original object. + + lines : list + The docstring lines, need to be modified inplace. + """ + arr = name.rsplit(".", 1) + if len(arr) != 2: + return + target_mod, target_name = arr + + if target_mod not in tvm_alias_check_map: + return + if not hasattr(obj, "__module__"): + return + obj_mod = obj.__module__ + + for amod in tvm_alias_check_map[target_mod]: + if not obj_mod.startswith(amod): + continue + + if hasattr(sys.modules[amod], target_name): + obj_type = ":py:func" if callable(obj) else ":py:class" + lines.append( + ".. 
rubric:: Alias of %s:`%s.%s`" % (obj_type, amod, target_name)) + + +def process_docstring(app, what, name, obj, options, lines): + """Sphinx callback to process docstring""" + if callable(obj) or inspect.isclass(obj): + update_alias_docstring(name, obj, lines) + + +def setup(app): + # Add hook for building doxygen xml when needed + # no c++ API for now + app.connect("builder-inited", generate_doxygen_xml) + app.connect('autodoc-process-docstring', process_docstring) + app.add_stylesheet('css/tvm_theme.css') + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + 'auto_doc_ref': True + }, True) + app.add_transform(AutoStructify) diff --git a/python/tvm/ir/base.py b/python/tvm/ir/base.py index 661a64a08bba..944daa13ec76 100644 --- a/python/tvm/ir/base.py +++ b/python/tvm/ir/base.py @@ -39,16 +39,16 @@ def astext(self, show_meta_data=True, annotate=None): Optionally annotate function to provide additional information in the comment block. - .. note:: - - The meta data section is necessary to fully parse the text format. - However, it can contain dumps that are big (e.g constant weights), - so it can be helpful to skip printing the meta data section. - Returns ------- text : str The text format of the expression. + + Notes + ----- + The meta data section is necessary to fully parse the text format. + However, it can contain dumps that are big (e.g constant weights), + so it can be helpful to skip printing the meta data section. """ return _ffi_api.AsText(self, show_meta_data, annotate) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 15c48df14827..0955978f81a0 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -419,12 +419,6 @@ def tile(data, reps): reps : tuple of int The number of times repeating the tensor data. - .. note:: - Each dim size of reps must be a positive integer. If reps has length d, - the result will have dimension of max(d, data.ndim); If data.ndim < d, - data is promoted to be d-dimensional by prepending new axes. - If data.ndim >= d, reps is promoted to a.ndim by pre-pending 1's to it. - Returns ------- ret : relay.Expr @@ -442,6 +436,13 @@ def tile(data, reps): relay.tile(x, reps=(2,)) = [[1., 2., 1., 2.], [3., 4., 3., 4.]] + + Notes + ----- + Each dim size of reps must be a positive integer. If reps has length d, + the result will have dimension of max(d, data.ndim); If data.ndim < d, + data is promoted to be d-dimensional by prepending new axes. + If data.ndim >= d, reps is promoted to a.ndim by pre-pending 1's to it. """ return _make.tile(data, reps) diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py index da06ca65fbae..55e6bf9d5fd9 100644 --- a/python/tvm/relay/op/vision/__init__.py +++ b/python/tvm/relay/op/vision/__init__.py @@ -16,8 +16,6 @@ # under the License. # pylint: disable=wildcard-import """Vision network related operators.""" -from __future__ import absolute_import as _abs - from .multibox import * from .nms import * from .rcnn import * diff --git a/python/tvm/relay/op/vision/multibox.py b/python/tvm/relay/op/vision/multibox.py index 065a3e7c7f0c..55fb01c5eaef 100644 --- a/python/tvm/relay/op/vision/multibox.py +++ b/python/tvm/relay/op/vision/multibox.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. """Multibox operations.""" -from __future__ import absolute_import as _abs from . 
import _make
from ...expr import TupleWrapper
diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py
index d19dde306aca..cba08bfba824 100644
--- a/python/tvm/relay/op/vision/nms.py
+++ b/python/tvm/relay/op/vision/nms.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 """Non-maximum suppression operations."""
-from __future__ import absolute_import as _abs
 from . import _make
 from ...expr import TupleWrapper
diff --git a/python/tvm/target/generic_func.py b/python/tvm/target/generic_func.py
index bfcd2dd56b4f..1936ff1511be 100644
--- a/python/tvm/target/generic_func.py
+++ b/python/tvm/target/generic_func.py
@@ -116,7 +116,6 @@ def override_native_generic_func(func_name):
     .. code-block:: python

 import tvm
-from tvm import te
 # wrap function as target generic
 @tvm.target.override_native_generic_func("my_func")
 def my_func(a):
@@ -211,7 +210,6 @@ def generic_func(fdefault):
     .. code-block:: python

 import tvm
-from tvm import te
 # wrap function as target generic
 @tvm.target.generic_func
 def my_func(a):
diff --git a/tutorials/language/intrin_math.py b/tutorials/language/intrin_math.py
index eebab3f6c3c3..146263dab5d7 100644
--- a/tutorials/language/intrin_math.py
+++ b/tutorials/language/intrin_math.py
@@ -66,7 +66,7 @@
 # TVM intrinsic provides the user a mechanism to achieve this, and this
 # is the recommended way to solve the problem.
 # The following code uses te.exp instead, which creates an intrinsic call
-# :any:`te.exp` to do the exponential.
+# :py:func:`tvm.te.exp` to do the exponential.
 #
 n = te.var("n")
 A = te.placeholder((n,), name='A')
@@ -88,7 +88,7 @@
 ######################################################################
 # Intrinsic Lowering Rule
 # -----------------------
-# When :any:`te.exp` is called, TVM creates an intrinsic Call Expr.
+# When :py:func:`tvm.te.exp` is called, TVM creates an intrinsic Call Expr.
 # TVM uses transformation rules to transform the intrinsic
 # call to device specific extern calls.
 #
diff --git a/vta/tutorials/frontend/deploy_detection.py b/vta/tutorials/frontend/deploy_detection.py
index 09d8465f3da5..0d1dbddcb1ec 100644
--- a/vta/tutorials/frontend/deploy_detection.py
+++ b/vta/tutorials/frontend/deploy_detection.py
@@ -16,7 +16,7 @@
 # under the License.
 """
 Deploy Pretrained Vision Detection Model from Darknet on VTA
-================================================
+============================================================
 **Author**: `Hua Jiang `_

 This tutorial provides an end-to-end demo, on how to run Darknet YoloV3-tiny

From 236db6cafa51085f2eebdb8e297198d6a438fbd0 Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Thu, 27 Feb 2020 18:57:07 -0800
Subject: [PATCH 54/73] fix doc warning (#4959)

---
 docs/langref/relay_op.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
index e1e25a95485d..1be2eb51504b 100644
--- a/docs/langref/relay_op.rst
+++ b/docs/langref/relay_op.rst
@@ -196,15 +196,15 @@
This level supports backpropagation of broadcast operators. It is temporary.
   tvm.relay.collapse_sum_like
   tvm.relay.slice_like
   tvm.relay.shape_of
-   tvm.relay.contrib.ndarray_size
+   tvm.relay.ndarray_size
   tvm.relay.layout_transform
   tvm.relay.device_copy
   tvm.relay.annotation.on_device
   tvm.relay.reverse_reshape
   tvm.relay.sequence_mask
   tvm.relay.nn.batch_matmul
-   tvm.relay.contrib.adaptive_max_pool2d
-   tvm.relay.contrib.adaptive_avg_pool2d
+   tvm.relay.nn.adaptive_max_pool2d
+   tvm.relay.nn.adaptive_avg_pool2d
   tvm.relay.one_hot


@@ -355,15 +355,15 @@ Level 10 Definitions
.. autofunction:: tvm.relay.collapse_sum_like
.. autofunction:: tvm.relay.slice_like
.. autofunction:: tvm.relay.shape_of
-.. autofunction:: tvm.relay.contrib.ndarray_size
+.. autofunction:: tvm.relay.ndarray_size
.. autofunction:: tvm.relay.layout_transform
.. autofunction:: tvm.relay.device_copy
.. autofunction:: tvm.relay.annotation.on_device
.. autofunction:: tvm.relay.reverse_reshape
.. autofunction:: tvm.relay.sequence_mask
.. autofunction:: tvm.relay.nn.batch_matmul
-.. autofunction:: tvm.relay.contrib.adaptive_max_pool2d
-.. autofunction:: tvm.relay.contrib.adaptive_avg_pool2d
+.. autofunction:: tvm.relay.nn.adaptive_max_pool2d
+.. autofunction:: tvm.relay.nn.adaptive_avg_pool2d
.. autofunction:: tvm.relay.one_hot


From a6fae5ed64d1e8bf0cb0d0e01cb27d7aa537e0e1 Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Thu, 27 Feb 2020 18:57:18 -0800
Subject: [PATCH 55/73] [CI] Add pre-check script to check sphinx doc build. (#4956)

Introduce the check stage to the unittest stage for now so we
don't have to rebuild CI images. As we make additional CPU images
that include Sphinx, consider moving it to an earlier stage.
---
 Jenkinsfile                           |  1 +
 tests/scripts/task_sphinx_precheck.sh | 48 +++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100755 tests/scripts/task_sphinx_precheck.sh

diff --git a/Jenkinsfile b/Jenkinsfile
index bb57abb32095..bedb452960a9 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -256,6 +256,7 @@ stage('Unit Test') {
        init_git()
        unpack_lib('gpu', tvm_multilib)
        timeout(time: max_time, unit: 'MINUTES') {
+          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh"
          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest.sh"
          sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration.sh"
        }
diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh
new file mode 100755
index 000000000000..cf45c8c57b36
--- /dev/null
+++ b/tests/scripts/task_sphinx_precheck.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Pre-check whether the sphinx doc build would fail.
+set -e
+set -u
+set -o pipefail
+
+cleanup()
+{
+    rm -rf /tmp/$$.*
+}
+trap cleanup 0
+
+# cleanup cache
+rm -rf docs/tutorials
+rm -rf docs/vta/tutorials
+find . 
-type f -path "*.pyc" | xargs rm -f
+
+echo "PreCheck sphinx doc generation WARNINGS.."
+cd docs
+TVM_TUTORIAL_EXEC_PATTERN=none make html 2>/tmp/$$.log.txt
+
+grep -v -E "__mro__|RemovedInSphinx|UserWarning|FutureWarning" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true
+echo "---------Sphinx Log----------"
+cat /tmp/$$.logclean.txt
+echo "-----------------------------"
+if grep --quiet -E "WARN" < /tmp/$$.logclean.txt; then
+    echo "WARNINGs found in the log, please fix them."
+    echo "You can reproduce locally by running ./tests/scripts/task_sphinx_precheck.sh"
+    exit 1
+fi
+echo "No WARNINGS to be fixed."

From 7ccb4363eefa76ae355ee263aa5527d43fb699fb Mon Sep 17 00:00:00 2001
From: masahi
Date: Fri, 28 Feb 2020 11:59:15 +0900
Subject: [PATCH 56/73] [Relay, Torch] Clean up and refactor PyTorch frontend (#4944)

* The initial import of refactored implementation, all tests passed

* enable mobilenet v2 test

* minor cleanup

* reorg

* fix lint

* use input names that come with torch IR

* fix typo

* introduce parse_operators

* fix lint

* add _ prefix
---
 python/tvm/relay/frontend/pytorch.py          | 597 +++++++++---------
 tests/python/frontend/pytorch/test_forward.py |  25 +-
 tutorials/frontend/from_pytorch.py            |  14 +-
 3 files changed, 324 insertions(+), 312 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index edd6ad84ae3e..fd66e3c1f367 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -18,6 +18,9 @@
 # pylint: disable=consider-iterating-dictionary, invalid-name, unused-argument, unused-variable, broad-except
 # pylint: disable=import-outside-toplevel, simplifiable-if-expression, unnecessary-comprehension
 """PT: PyTorch frontend."""
+import itertools
+from packaging import version
+
 import numpy as np

 import tvm
@@ -396,9 +399,11 @@ def _impl(inputs, input_types):

 def _size():
     def _impl(inputs, input_types):
-        axis = int(inputs[1])
         shape = _infer_shape(inputs[0])
-        return shape[axis]
+        if len(inputs) > 1:
+            axis = int(inputs[1])
+            return shape[axis]
+        return shape
     return _impl

 def _numtotensor():
@@ -484,10 +489,19 @@ def _impl(inputs, attrs, params):
 def _mean():
     def _impl(inputs, input_types):
         data = inputs[0]
-        axis = _infer_shape(inputs[1])
-        keepdims = int(inputs[2])
-        exclude = int(inputs[3])
+        if inputs[1]:
+            axis = _infer_shape(inputs[1])
+        else:
+            axis = None
+        if len(inputs) > 2 and inputs[2]:
+            keepdims = int(inputs[2])
+        else:
+            keepdims = False
+        if len(inputs) > 3 and inputs[3]:
+            exclude = int(inputs[3])
+        else:
+            exclude = False

         return _op.mean(data, axis, keepdims, exclude)
     return _impl
@@ -651,7 +665,7 @@ def _convert_elemwise_input(data, input_type):
     if isinstance(data, torch.Tensor):
         return _expr.const(data.item(), dtype=_convert_data_type(input_type))
     elif not isinstance(data, _expr.Expr):
-        return _expr.const(int(data), dtype=_convert_data_type(input_type))
+        return _expr.const(data, dtype=_convert_data_type(input_type))
     else:
         return data
@@ -718,293 +732,270 @@ def _convert_elemwise_input(data, input_type):
     "aten::sqrt"                            : _sqrt()
 }

+def _run_jit_passes(graph):
+    """ The inline pass is necessary to unwrap prim::CallMethod """
+    import torch
+    if version.parse(torch.__version__) >= version.parse("1.4.0"):
+        torch._C._jit_pass_inline(graph)
+
-# Internal graph for parsing
-class Graph(object):
-    """ A helper class for parsing PyTorch model to Relay graph."""
-
-    def __init__(self, script_module, input_shapes):
-        self._script_module = script_module
-        self._graph = script_module.graph.copy()
+def 
_is_int_seq(seq): + return len(seq) > 0 and all([isinstance(i, int) for i in seq]) - # TODO: Temporary fix to remove prim::CallMethod node introduced in PT 1.4 - import torch - from packaging import version - if version.parse(torch.__version__) >= version.parse("1.4.0"): - torch._C._jit_pass_inline(self._graph) - - self._inputs_r = {} - self._params = {} - self._param_tensors = {} - self._consts = {} - self._ops = {} - self._op_inputs_r = {} - self._op_inputs_types = {} - self._input_shapes = input_shapes if input_shapes else {} - self._parsed_node_names = {} - - def from_pytorch(self): - """ Construct relay nodes from PyTorch graph - - Currently only supports traced PyTorch format which means no control flow. - User must perform torch.jit.trace on a model and pass this in. - Future support should include support scripted models (torch.jit.script) which - preserves control flow. - - Returns - ------- - mod : tvm.relay.Module - The module that optimizations will be performed on. - - params : dict of str to tvm.runtime - Dict of converted parameters stored in tvm.runtime format - """ - # Check for missing ops - missing_operators = self._parse_import_prerequisites() - - if missing_operators: - raise tvm.error.OpNotImplemented( \ - "The following operators are not implemented: {}".format(missing_operators)) - - # Translate PyTorch graph to by decorating Graph with state dict and inputs into each op - self._parse_inputs() - self._parse_params() - self._parse_ops() - - outputs = [] - nid = 0 - - for op_name, op_node in self._ops.items(): - if op_node.kind() == "prim::ListConstruct": - if any(inp.debugName() in self._parsed_node_names.keys() \ - for inp in op_node.inputs()): - list_constr = [] - for i in op_node.inputs(): - if i.debugName() in self._parsed_node_names.keys(): - list_constr.append( \ - outputs[self._parsed_node_names[i.debugName()]]) - elif i.node().kind() == "prim::Constant": - list_constr.append(int(self._consts[i.debugName()])) - elif i.debugName() in self._inputs_r.keys(): - list_constr.append(int(self._inputs_r[i.debugName()])) - - # Unwrap for tensors - if len(list_constr) == 1: - list_constr = list_constr[0] - - outputs.append(list_constr) - self._parsed_node_names[op_name] = nid - nid = nid+1 - elif op_node.kind() != "prim::Constant": - for i in op_node.inputs(): - if i.debugName() in self._parsed_node_names.keys(): - for cnt in range(0, len(self._op_inputs_r[op_name])): - if isinstance(self._op_inputs_r[op_name][cnt], str): - if "call/var" in self._op_inputs_r[op_name][cnt]: - self._op_inputs_r[op_name][cnt] = \ - outputs[self._parsed_node_names[i.debugName()]] - break - - call = _convert_map[op_node.kind()](self._op_inputs_r[op_name], - self._op_inputs_types[op_name]) - - outputs.append(call) - self._parsed_node_names[op_name] = nid - nid = nid+1 - - func = tvm.relay.Function(_analysis.free_vars(outputs[-1]), outputs[-1]) - - param = {k: tvm.nd.array(v) for k, v in self._param_tensors.items()} - - return _module.IRModule.from_expr(func), param - - def _parse_inputs(self): - """ Map inputs to parser and inputs to graph. 
""" - # Get names and objects of inputs for IR - ir_inputs = [i for i in self._graph.inputs()] - - # Create corresponding shape and add to input - for input_name, ir_input in zip(self._input_shapes, ir_inputs[1:]): - input_shape = self._input_shapes[input_name] - ir_input.setDebugName(input_name) - - ir_dtype = _convert_data_type(ir_input.type().scalarType().lower()) - self._inputs_r[input_name] = _expr.var(input_name, - shape=self._input_shapes[input_name], - dtype=ir_dtype) - - # Add self (first input of a PyTorch graph) to inputs, the value doesn't matter here - input_name = ir_inputs[0].debugName() - self._inputs_r[input_name] = "self" - - def _parse_params(self): - """ Map state dictionary values to corresponding prim::GetAttr op node. """ - # Grab weights, biases, etc. from graph - state_dict = self._script_module.state_dict() - param_names = [] - for key, value in state_dict.items(): - param_str = str(key) - param_name = param_str.split(".")[-1] - param_names.append(param_name) - - # Get names of all inputs - input_names = [i for i in self._inputs_r.keys()] - - # Iterate through graph for getAttr nodes and match full state_dict name to nodes - node_weight_map = {} - for node in self._graph.nodes(): - if node.kind() == "prim::GetAttr": - - attribute_names = node.attributeNames() - assert len(attribute_names) == 1 - node_getattr_name = node.s(attribute_names[0]) - node_arg = node.input().debugName() - - if node.outputsSize() == 1: - node_name = node.output().debugName() - else: - node_name = [output.debugName() for output in node.outputs()][0] - - if node_arg in input_names: - node_weight_map[node_name] = node_getattr_name - else: - previous_map = node_weight_map[node_arg[:]] - node_weight_map[node_name] = previous_map+"."+node_getattr_name - - if node_getattr_name in param_names: - - value = state_dict[node_weight_map[node_name]] - tensor = tvm.nd.array(value.cpu().numpy()) - shape = tensor.shape - self._param_tensors[node_name] = tensor - - self._params[node_name] = _expr.var(node_name, - shape=shape, - dtype=_convert_data_type(str(value.dtype))) - - def _parse_ops(self): - """ Iterate through nodes and decorate graph with constants, operators, - and the inputs to each operator. 
""" - # Traverse nodes and add to graph - for node in self._graph.nodes(): - - if node.outputsSize() == 1: - node_name = node.output().debugName() - else: - node_name = [output.debugName() for output in node.outputs()][0] - - if node.kind() == "prim::Constant": - if node.hasAttributes(): - attribute_names = node.attributeNames() - attr_name = attribute_names[0] - ty = node.output().type().kind() - - if ty in ["IntType", "BoolType"]: - self._consts[node_name] = node.i(attr_name) - elif ty in ["FloatType", "LongType"]: - self._consts[node_name] = node.f(attr_name) - elif ty in ["TensorType", "CompleteTensorType"]: - self._consts[node_name] = node.output().toIValue() - else: - self._consts[node_name] = "0" - else: - self._consts[node_name] = "0" - elif node.kind() == "prim::ListConstruct": - list_shape = [] - for input_node in node.inputs(): - if input_node.debugName() in self._inputs_r.keys(): - c = self._inputs_r[input_node.debugName()] - assert isinstance(c, int) - list_shape.append(c) - elif input_node.debugName() in self._consts.keys(): - c = self._consts[input_node.debugName()] - assert isinstance(c, int) - list_shape.append(c) - self._inputs_r[node_name] = _expr.var(node_name, shape=list_shape) - - if node.kind() != "prim::GetAttr": - self._add_op(node_name, node) - - # Graph Helper Functions - - def _add_op(self, node_id, op_node): - """ Add an operator and its operators inputs to the graph and insert placeholders - where an input is a call node. - - Parameters - ---------- - node_id : string - The ID of the op node - - op_node : PyTorch Node object - The full Node object for the op node - - """ - self._ops[(node_id)] = op_node - input_list_r = [] - input_list_types = [] - for input_value in op_node.inputs(): - - inode_id = input_value.debugName() - inode = input_value.node() - - if inode_id in self._inputs_r.keys(): - input_list_r.append(self._inputs_r[inode_id]) - elif inode_id in self._params.keys(): - input_list_r.append(self._params[inode_id]) - elif inode.kind() == "prim::Constant": - input_list_r.append(self._consts[inode_id]) + +def _get_tensor_and_var(torch_tensor, name): + tensor = tvm.nd.array(torch_tensor.cpu().numpy()) + var = _expr.var(name, shape=tensor.shape) + return tensor, var + + +def _get_output_name(node): + assert node.outputsSize() == 1 + return node.output().debugName() + + +def _get_output_names(node): + return [output.debugName() for output in node.outputs()] + + +def _get_input_names(node_or_graph): + return [inp.debugName() for inp in node_or_graph.inputs()] + + +def _get_op_inputs(op_node, outputs, output_index_map): + input_names = [output_index_map[name] + for name in _get_input_names(op_node)] + return [outputs[name] for name in input_names] + + +def _update_outputs_from_pairs(name_output_pairs, outputs, output_index_map): + for output_name, output in name_output_pairs: + output_index_map[output_name] = len(outputs) + outputs.append(output) + + +def _report_missing_conversion(op_names): + """ Check if all ops in an input graph are supported by TVM """ + known_ops = ["prim::Constant", "prim::GetAttr", + "prim::ListConstruct", "prim::ListUnpack", + "prim::TupleConstruct", "prim::TupleUnpack"] + known_ops += list(_convert_map.keys()) + + missing = [op_name for op_name in op_names + if op_name not in known_ops] + + if missing: + msg = "The following operators are not implemented: {}".format(missing) + raise NotImplementedError(msg) + + +def _getattr_attr_name(node): + attribute_names = node.attributeNames() + assert len(attribute_names) == 1 + attr_name = 
node.s(attribute_names[0]) + return attr_name + + +def _getattr_full_name(getattrs): + return ".".join([_getattr_attr_name(node) for node in getattrs]) + + +def _get_input_types(op_node): + """ Returns a torch type for each input nodes """ + input_list_types = [] + for input_node in op_node.inputs(): + in_ty = input_node.type() + input_node_kind = in_ty.kind() + if input_node_kind == 'TensorType': + if in_ty.scalarType() is None: + input_list_types.append(None) else: - input_list_r.append("call/var."+inode_id) - - # If the inputs of a ListConstruct op is a call or var, remove it from inputs - if op_node.kind() == "prim::ListConstruct": - if node_id in self._inputs_r.keys(): - self._inputs_r.pop(node_id) - - try: - input_value_kind = input_value.type().kind() - if input_value_kind in ["TensorType", "CompleteTensorType"]: - if input_value.type().scalarType() is None: - input_list_types.append("float") - else: - input_list_types.append(input_value.type().scalarType().lower()) - elif input_value_kind == "ListType": - input_list_types.append(str(input_value.type().getElementType()).lower()) - elif input_value_kind in ["IntType", "FloatType", "BoolType", "StringType", - "OptionalType"]: - input_list_types.append(str(input_value.type()).lower()) - else: - input_list_types.append("UnsupportedType") - print("UnsupportedType "+str(input_value.type())+" and "+str(input_value_kind)) - except Exception as e: - print("Internal PyTorch error. Failed to grab type.") - - if op_node.kind() in ["aten::ones", "aten::zeros"]: - node_type = op_node.output().type().scalarType() - input_list_types[0] = node_type.lower() - - self._op_inputs_r[node_id] = input_list_r - self._op_inputs_types[node_id] = input_list_types - - def _parse_import_prerequisites(self): - """ Calculate the named preconditions from PyTorch graph. - - Returns - ------- - missing_operators : set object - Set of operator names which don't have their mapping in TVM - i.e. 
which are not supported - - """ - missing_operators = set() - for node in self._graph.nodes(): - if not node.kind() in ["prim::Constant", "prim::ListConstruct", "prim::GetAttr"] \ - and not node.kind() in _convert_map: - missing_operators.add(node.kind()) - - return missing_operators + input_list_types.append(in_ty.scalarType().lower()) + elif input_node_kind == 'ListType': + input_list_types.append(str(in_ty.getElementType()).lower()) + elif input_node_kind in ['IntType', 'FloatType', 'BoolType', + 'StringType', 'OptionalType']: + input_list_types.append(str(in_ty).lower()) + else: + input_list_types.append('UnsupportedType') + + if op_node.kind() in ['aten::ones', 'aten::zeros']: + node_type = op_node.output().type() + scalar_type = node_type.scalarType() + if scalar_type: + input_list_types[0] = scalar_type.lower() + + return input_list_types + + +def _get_constant(node): + """ Retrieve a constant associated with this prim::Constant node """ + attribute_names = node.attributeNames() + num_attributes = len(attribute_names) + + if num_attributes == 1: + attr_name = attribute_names[0] + ty = node.output().type().kind() + + if ty in ["IntType", "BoolType"]: + return node.i(attr_name) + elif ty in ["FloatType", "LongType"]: + return node.f(attr_name) + elif ty in ["TensorType", "CompleteTensorType"]: + tensor = node.t(attr_name) + if len(tensor.shape) == 0: # tensor(0.1) + return float(tensor) + return tensor + elif ty == "DeviceObjType": + return node.s(attr_name) + elif ty == "FunctionType": + return None + else: + raise NotImplementedError("Unsupported type: %s" % ty) + else: + assert num_attributes == 0 + return None + + +def _get_operator_nodes(nodes): + """ Returns torch IR nodes that need conversion to Relay """ + ops = {} + # Traverse nodes and add to graph + for node in nodes: + if node.outputsSize() > 1: + node_name = "_".join(_get_output_names(node)) + else: + node_name = _get_output_name(node) + + if node.kind() != "prim::GetAttr": + ops[node_name] = node + + return ops + + +def parse_inputs(graph_inputs, input_shapes): + """ Return Relay vars from torch input vars """ + ir_inputs = list(graph_inputs) + input_vars = {} + + for input_name, ir_input in zip(input_shapes, ir_inputs[1:]): + input_vars[input_name] = _expr.var(input_name, + shape=input_shapes[input_name]) + return input_vars + + +def get_use_chains(root_node, terminate=lambda _: False): + """ + Track a chain of users of this node forward, returning a list of chains + See get_attr_chains below for its usage + """ + def concat_lists(lists): + return itertools.chain.from_iterable(lists) + + def inner(current, accum): + users = [] + for output in current.outputs(): + users += [use.user for use in output.uses()] + + if not users or terminate(users): + return [accum] + + return concat_lists([inner(nxt, accum + [nxt]) for nxt in users]) + + return inner(root_node, [root_node]) + + +def get_attr_chains(root_getattr_node): + """ Returns chains of attribute access starting from root_getattr_node + + For example, given attribute "block", as in "self.block" when "self" points + to the top level torch.nn.Module, it returns lists of attribute "chains", + e.g. ['block', '2'], ['block', '1'], ['block', '0', '_packed_params'] + + These sets of attributes form full attribute accessors. For example, + "self.block.1", "self.block.2" will return the second and third submodule, + and "self.block.0._packed_params" will return the parameters of the first + submodule. 
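+
+    A rough picture, for a hypothetical module:
+
+        self.block.0._packed_params   # one prim::GetAttr node per segment
+
+    Each chain is later joined with "." by _getattr_full_name and looked
+    up in the state_dict to recover the underlying tensor.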
+ """ + def terminate(users): + next_attrs = [user for user in users if user.kind() == "prim::GetAttr"] + return len(next_attrs) == 0 + + return get_use_chains(root_getattr_node, terminate) + + +def parse_params(graph, state_dict): + """ + Return Relay vars and TVM NDArrays for input parameters + A chain of prim::GetAttr nodes is processed one at a time + """ + getattr_nodes = graph.findAllNodes("prim::GetAttr", recurse=True) + params = {} + param_tensors = {} + seen = set() + + for node in getattr_nodes: + if _get_output_name(node) in seen: + continue + + for getattrs in get_attr_chains(node): + seen.update(map(_get_output_name, getattrs)) + + full_attr = _getattr_full_name(getattrs) + full_attr_node_name = _get_output_name(getattrs[-1]) + + if full_attr in state_dict: + torch_tensor = state_dict[full_attr] + tensor, var = _get_tensor_and_var(torch_tensor, + full_attr_node_name) + param_tensors[full_attr_node_name] = tensor + params[full_attr_node_name] = var + + return params, param_tensors + + +def parse_operators(operators, outputs, output_index_map, ret_name): + """ Convert each Torch IR operators to Relay equivalent """ + for node_name, op_node in operators.items(): + operator = op_node.kind() + inputs = _get_op_inputs(op_node, outputs, output_index_map) + + if operator == "prim::Constant": + output_index_map[node_name] = len(outputs) + outputs.append(_get_constant(op_node)) + elif operator == 'prim::ListConstruct' and _is_int_seq(inputs): + output_index_map[node_name] = len(outputs) + outputs.append(_expr.var(node_name, shape=inputs)) + elif operator in ['prim::ListConstruct', 'prim::TupleConstruct']: + output_index_map[node_name] = len(outputs) + outputs.append(inputs) + elif operator in ["prim::ListUnpack", 'prim::TupleUnpack']: + assert len(inputs) == 1 + unpacked_names = _get_output_names(op_node) + _update_outputs_from_pairs(zip(unpacked_names, inputs[0]), + outputs, output_index_map) + else: + output_index_map[node_name] = len(outputs) + relay_op = _convert_map[operator] + outputs.append(relay_op(inputs, _get_input_types(op_node))) + + return outputs[output_index_map[ret_name]] + + +def get_all_op_names(graph): + """ Return all operator names in the input graph """ + nodes = list(graph.nodes()) + return set(node.kind() for node in nodes) + + +def get_graph_input_names(script_module): + """ Use this function to set the keys for input_shapes""" + # It seems variable names could change the first time a copy is made + # Use the copy of the graph here to prevent troubles later + ir_inputs = _get_input_names(script_module.graph.copy()) + return ir_inputs[1:] # remove self at the 0th arg + def from_pytorch(script_module, input_shapes): """ Load PyTorch model in the form of a scripted PyTorch model and convert into relay. @@ -1016,17 +1007,35 @@ def from_pytorch(script_module, input_shapes): TorchScripted PyTorch graph Note: We currently only support traces (ie: torch.jit.trace(model, input)) - shape : Dictionary of input dimensions + input_shapes : Dictionary of input dimensions Graph level input shape dictionary + The keys should be the same one returned by get_graph_input_names(...) above Returns ------- mod : tvm.relay.Module The module that optimizations will be performed on. 
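    A minimal usage sketch (the model and input shape are illustrative; the
    call pattern mirrors the updated tutorial and tests in this series):

        import torch
        import torchvision
        from tvm import relay
        from tvm.relay.frontend.pytorch import get_graph_input_names

        model = torchvision.models.resnet18(pretrained=True).eval()
        inp = torch.rand(1, 3, 224, 224)
        trace = torch.jit.trace(model, inp).eval()

        input_name = get_graph_input_names(trace)[0]  # only one input
        mod, params = relay.frontend.from_pytorch(
            trace, {input_name: list(inp.shape)})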
- params : dict of str to tvm.runtime - Dict of converted parameters stored in tvm.runtime format + params : dict of str to tvm.runtime.NDArray + Dict of converted parameters stored in tvm.runtime.ndarray format """ - g = Graph(script_module, input_shapes) - mod, params = g.from_pytorch() - return mod, params + graph = script_module.graph.copy() + _run_jit_passes(graph) + op_names = get_all_op_names(graph) + _report_missing_conversion(op_names) + + params = script_module.state_dict() + input_vars = parse_inputs(graph.inputs(), input_shapes) + param_vars, tensors = parse_params(graph, params) + + input_vars.update(param_vars) + outputs = list(input_vars.values()) + output_index_map = dict(zip(input_vars.keys(), range(len(outputs)))) + ret_name = _get_input_names(graph.return_node())[0] + + body = parse_operators(_get_operator_nodes(graph.nodes()), outputs, + output_index_map, ret_name) + func = tvm.relay.Function(_analysis.free_vars(body), body) + tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()} + + return _module.IRModule.from_expr(func), tvm_params diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index ba1d7bbe67bc..831389b7ebf5 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -31,6 +31,8 @@ from tvm import relay from tvm.contrib import graph_runtime from tvm.relay.testing.config import ctx_list +from tvm.relay.frontend.pytorch import get_graph_input_names + sys.setrecursionlimit(10000) @@ -94,6 +96,7 @@ def load_model(model_name): if hasattr(torchvision.models, model_name): return load_torchvision(model_name) try: + import pretrainedmodels if hasattr(pretrainedmodels, model_name): return load_pretrainedmodels(model_name) except ModuleNotFoundError: @@ -167,16 +170,15 @@ def verify_model(model_name, input_data=[]): baseline_outputs = tuple(out.cpu().numpy() for out in baseline_outputs) else: baseline_outputs = (baseline_outputs.float().cpu().numpy(),) - output_shapes = [out.shape for out in baseline_outputs] - dtype = "float32" - input_name = "input0" - input_shapes = {input_name: list(baseline_input.shape)} trace = torch.jit.trace(baseline_model, baseline_input).float().eval() + if torch.cuda.is_available(): trace = trace.cuda() else: trace = trace.cpu() + input_name = get_graph_input_names(trace)[0] # only one input + input_shapes = {input_name: list(baseline_input.shape)} mod, params = relay.frontend.from_pytorch(trace, input_shapes) compiled_input = {input_name: tvm.nd.array(baseline_input.cpu().numpy())} @@ -276,7 +278,7 @@ def forward(self, *args): class Multiply2(Module): def forward(self, *args): - return args[0] * 1 + return args[0] * 1.0 class Multiply3(Module): def forward(self, *args): @@ -507,7 +509,7 @@ def test_forward_size(): class Size1(Module): def forward(self, *args): - return args[0].size(0) * args[0] + return float(args[0].size(0)) * args[0] with torch.no_grad(): input_data = torch.rand(input_shape).float() @@ -708,6 +710,10 @@ def test_mnasnet0_5(): torch.set_grad_enabled(False) verify_model("mnasnet0_5") +def test_mobilenet_v2(): + torch.set_grad_enabled(False) + verify_model("mobilenet_v2") + """ #TODO: Fix VGG and AlexNet issues (probably due to pooling) def test_alexnet(): @@ -721,13 +727,9 @@ def test_vgg11(): def test_vgg11_bn(): torch.set_grad_enabled(False) verify_model("vgg11_bn") - -#TODO: Need to update schedule in tophub file after PR #4787 updated workloads -def test_mobilenet_v2(): - torch.set_grad_enabled(False) - 
verify_model("mobilenet_v2") """ + if __name__ == "__main__": # Single operator tests test_forward_add() @@ -767,3 +769,4 @@ def test_mobilenet_v2(): test_inception_v3() test_googlenet() test_mnasnet0_5() + test_mobilenet_v2() diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index c280c259c1fe..503f64a4e7d9 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -41,14 +41,13 @@ be unstable. """ -# tvm, relay import tvm from tvm import relay -# numpy, packaging import numpy as np -from packaging import version + from tvm.contrib.download import download_testdata +from tvm.relay.frontend.pytorch import get_graph_input_names # PyTorch imports import torch @@ -91,7 +90,8 @@ # Import the graph to Relay # ------------------------- # Convert PyTorch graph to Relay graph. -shape_dict = {'img': img.shape} +input_name = get_graph_input_names(scripted_model)[0] # only one input +shape_dict = {input_name: img.shape} mod, params = relay.frontend.from_pytorch(scripted_model, shape_dict) @@ -116,12 +116,12 @@ dtype = 'float32' m = graph_runtime.create(graph, lib, ctx) # Set inputs -m.set_input('img', tvm.nd.array(img.astype(dtype))) +m.set_input(input_name, tvm.nd.array(img.astype(dtype))) m.set_input(**params) # Execute m.run() # Get outputs -tvm_output = m.get_output(0, tvm.nd.empty(((1, 1000)), 'float32')) +tvm_output = m.get_output(0) ##################################################################### # Look up synset name @@ -163,4 +163,4 @@ torch_class_key = class_id_to_key[top1_torch] print('Relay top-1 id: {}, class name: {}'.format(top1_tvm, key_to_classname[tvm_class_key])) -print('Torch top-1 id: {}, class name: {}'.format(top1_torch, key_to_classname[torch_class_key])) \ No newline at end of file +print('Torch top-1 id: {}, class name: {}'.format(top1_torch, key_to_classname[torch_class_key])) From a449d8b1fe8ca34bbc9eaca2e8ecfe76324118de Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 28 Feb 2020 13:20:04 -0800 Subject: [PATCH 57/73] [DOCS] Fix sphinx precheck (#4967) * [DOCS] Fix sphinx precheck * ignore keras warnings * Remove more warnings --- docs/langref/relay_adt.rst | 41 ++++++++-------- docs/langref/relay_expr.rst | 68 +++++++++++++-------------- docs/langref/relay_type.rst | 16 +++---- tests/scripts/task_sphinx_precheck.sh | 2 +- 4 files changed, 64 insertions(+), 63 deletions(-) diff --git a/docs/langref/relay_adt.rst b/docs/langref/relay_adt.rst index e487684063ac..a53c7515c62a 100644 --- a/docs/langref/relay_adt.rst +++ b/docs/langref/relay_adt.rst @@ -1,3 +1,4 @@ + .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information @@ -63,7 +64,7 @@ Hence, it is often easy to reason about ADTs. Below is a simple example of defining an ADT and using it in a function via a match expression: -.. code-block:: python +.. code-block:: # Defines an ADT named "Numbers" data Numbers { @@ -94,7 +95,7 @@ meaning that two ADTs with structurally identical constructors will nevertheless be distinct data types from the point of view of the typechecker. -.. code-block:: python +.. code-block:: # structurally identical constructors to Numbers data Numbers2 { @@ -117,7 +118,7 @@ can be polymorphic and take type parameters. For example, one of the standard ADTs commonly used in functional programming languages is the optional type, defined here: -.. code-block:: python +.. 
code-block:: # a is a type parameter data Optional { @@ -141,7 +142,7 @@ imply, an ADT instance is thus given a type that contains the concrete type arguments for that instance, ensuring the information is kept around. Let the below example illustrate: -.. code-block:: python +.. code-block:: # the signature for option indicates the type argument def @inc_scalar(%opt : Optional[Tensor[(), int32]]) -> Tensor[(), int32] { @@ -198,7 +199,7 @@ Many commonly used ADTs involve recursion; some of these are given in `Common ADT Uses`_. As an example here, we will examine the list ADT, ubiquitous in functional languages: -.. code-block:: python +.. code-block:: data List { Nil : () -> List @@ -216,7 +217,7 @@ end of the list is reached, which can be indicated with a :code:`Nil` Lists represented in this manner can easily be recursively processed. For example, the following function sums a list of integers: -.. code-block:: python +.. code-block:: def @list_sum(%l : List[Tensor[(), int32]]) -> Tensor[(), int32] { match(%l) { @@ -250,7 +251,7 @@ and the second has a :code:`Cons` constructor pattern that uses variable pattern The below example uses a wildcard pattern to ignore one of the arguments to :code:`Cons`: -.. code-block:: python +.. code-block:: def @first(%l : List[a]) -> Optional[a] { match(%l) { @@ -262,7 +263,7 @@ The below example uses a wildcard pattern to ignore one of the arguments to :cod Here, a constructor pattern is nested inside another constructor pattern to avoid nested match expressions for a list option. A top-level wildcard pattern is also used to handle all cases that do not match the first clause: -.. code-block:: python +.. code-block:: def @second_opt(%ll : Optional[List[a]]) -> Optional[a] { match(%ll) { @@ -281,7 +282,7 @@ Note that a match expression checks its patterns in the order the cases are list that matches the input value is the one that is evaluated. Here, a top-level variable pattern binds the whole input value: -.. code-block:: python +.. code-block:: def @match_order_beware(%l : List[a]) -> List[a] { match(%l) { @@ -291,7 +292,7 @@ input value: case Nil() { Nil() } } } - + Common ADT Uses =============== @@ -312,7 +313,7 @@ list comprehensions and certain library functions in Python. Below are very comm through lists, which are included in Relay's Prelude. (These have all been extensively characterized in the functional programming literature, and we do not attempt to reproduce that work in this document.) -.. code-block:: python +.. code-block:: # Map: for [h1, h2, ..., hn] returns [f(h1), f(h2), ..., f(hn)] def @map(%f : fn(a) -> b, %l : List[a]) -> List[b] { @@ -341,7 +342,7 @@ in the functional programming literature, and we do not attempt to reproduce tha Using these iteration constructs, many common operations over lists can be expressed compactly. For example, the following map doubles all members of a list: -.. code-block:: python +.. code-block:: # directly written def @double(%l : List[Tensor[(), int32]]) -> List[Tensor[(), int32]] { @@ -356,7 +357,7 @@ For example, the following map doubles all members of a list: The following right fold concatenates two lists: -.. code-block:: python +.. code-block:: # directly written def @concat(%l1 : List[a], %l2 : List[a]) -> List[a] { @@ -371,7 +372,7 @@ The following right fold concatenates two lists: The following left fold flattens a list of lists (using concatenation): -.. code-block:: python +.. 
code-block:: # directly written def @flatten(%ll : List[List[a]]) -> List[a] { @@ -401,13 +402,13 @@ First let us suppose that we have a function corresponding to a trained recurren cell, which takes in a past state and an input value and returns a new state and output value. In Relay, this would have the following signature: -.. code-block:: python +.. code-block:: @cell : fn(state_type, in_type) -> (state_type, out_type) We might consider a ReLU cell as a simple concrete example, with a trained version below: -.. code-block:: python +.. code-block:: def @linear(%x, %w, %b) { %w*%x + %b } @@ -429,7 +430,7 @@ We might consider a ReLU cell as a simple concrete example, with a trained versi Following Olah's example, we can encode a sequence (list) of inputs with the following left fold: -.. code-block:: python +.. code-block:: def @encode(%cell, %input : List[in_type], %init : state_type) -> state_type { # not using the output @@ -439,7 +440,7 @@ Following Olah's example, we can encode a sequence (list) of inputs with the fol Using an *unfold* iterator (from Haskell's standard library), the same cell could be used to make a generator network (which takes a single input and produces a sequence of outputs): -.. code-block:: python +.. code-block:: # included in Relay's Prelude def @unfoldr(%f : fn(b) -> Optional[(a, b)], %z : b) -> List[a] { @@ -468,7 +469,7 @@ a generator network (which takes a single input and produces a sequence of outpu An accumulating map (a fold that simultaneously updates an accumulator value and a list of outputs) can be used to write a general RNN (with an output for every input): -.. code-block:: python +.. code-block:: def @map_accumr(%f : fn(a, b) -> (a, c), %acc : a, %l : List[b]) -> (a, List[c]) { match(%l) { @@ -500,7 +501,7 @@ Olah also gives an example of a bidirectional neural network, in which two sets cells (which may have different weights) process the input in both directions and produce a single set of outputs. The following is a Relay implementation of that example: -.. code-block:: python +.. code-block:: # creates a list of tuples from two lists # included in Relay's Prelude diff --git a/docs/langref/relay_expr.rst b/docs/langref/relay_expr.rst index 1fd39bc90a3d..66bfe43a04d6 100644 --- a/docs/langref/relay_expr.rst +++ b/docs/langref/relay_expr.rst @@ -92,7 +92,7 @@ references to :code:`%a` in the inner scope refer to the later definition, while references to :code:`%a` in the outer scope continue to refer to the first one. -.. code-block:: python +.. code-block:: let %a = 1; let %b = 2 * %a; // %b = 2 @@ -129,14 +129,14 @@ A definition minimally consists of the keyword :code:`fn`, an empty set of parameters, and a body expression (:py:class:`~tvm.relay.expr.Expr`) contained by curly braces. -.. code-block:: python +.. code-block:: fn() { body } A definition may contain any number of parameters. For example, a simple function that invokes the :code:`add` operator: -.. code-block:: python +.. code-block:: fn(%x, %y) { add(%x, %y) } @@ -147,7 +147,7 @@ One may also annotate explicit types on functions. For example, we can restrict the above function to only work on certain types: -.. code-block:: python +.. code-block:: fn(%x : Tensor[(10, 10), float32], %y : Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] { @@ -155,7 +155,7 @@ on certain types: } The above function only takes arguments of type :code:`Tensor[(10, 10), float32]` and returns a value of -type :code:`Tensor[(10, 10), float32]`. 
A function parameter is just a local +type :code:`Tensor[(10, 10), float32]`. A function parameter is just a local variable (:py:class:`~tvm.relay.expr.LocalVar`) optionally annotated with a type, written as :code:`%x : T`. When the type information is omitted, Relay attempts to infer the most general type @@ -166,7 +166,7 @@ parameters and return type based on the function body and call sites. A recursive function expression can be defined using a :code:`let` binding, as here: -.. code-block:: python +.. code-block:: let %fact = fn(%x : Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] { if (%x == Constant(0, (10, 10), float32)) { @@ -189,7 +189,7 @@ For example, in the below example, the final result will be a tensor of zero values because the closure for :code:`%f` stores the value of :code:`%x` at the pointer where :code:`%f` was defined. -.. code-block:: python +.. code-block:: let %g = fn() { let %x = Constant(0, (10, 10), float32); @@ -216,13 +216,13 @@ given at call sites. Type parameters are classified by *kind* and can only appear in parts of the type signature where their kind is appropriate (e.g., type parameters of kind :code:`Shape` can only appear where a shape -would be expected in a tensor type); for a full discussion, +would be expected in a tensor type); for a full discussion, see :ref:`the documentation on type parameters `. For example, one can define a polymorphic identity function for any Relay type as follows: -.. code-block:: python +.. code-block:: fn(%x : t) -> t { %x @@ -231,7 +231,7 @@ any Relay type as follows: The below definition is also polymorphic, but restricts its arguments to tensor types: -.. code-block:: python +.. code-block:: fn(%x : Tensor[s, bt]) { %x @@ -244,7 +244,7 @@ Notice that the return type is omitted and will be inferred. A function may also be subject to one or more type relations, such as in the following: -.. code-block:: python +.. code-block:: fn(%x, %y) where Broadcast { add(%x, %y) } @@ -347,14 +347,14 @@ or global functions) and Relay operators. The syntax of calls follows that used in C-like languages, demonstrated in the example below: -.. code-block:: python +.. code-block:: let %c = 1; let %f = fn(%x : Tensor[(), float32], %y : Tensor[(), float32]) { %x + %y + %c }; %f(10, 11) When a closure is called (see `Closures`_), -the closure's body is evaluated in the stored environment +the closure's body is evaluated in the stored environment (i.e., using the stored values for free variables) with local variable bindings added for each argument; the final value obtained by evaluating the body is the call's return value. @@ -362,7 +362,7 @@ Thus, in the above example, the call evaluates to 22. In the case of operators, the implementation is opaque to Relay, so the result is left up to the registered TVM implementation. -*Note: type parameters are not yet supported in the text format.* +*Note: type parameters are not yet supported in the text format.* A type-polymorphic function can also include type arguments at a call site. The type arguments are substituted for type parameters when @@ -370,7 +370,7 @@ type checking. If a function is type-polymorphic and type arguments are not given, type inference will attempt to infer type arguments if possible. The following code gives examples of explicit and inferred type arguments: -.. code-block:: python +.. 
code-block:: // %f : fn(a, b) -> c let %x1 = %f(True, False); @@ -380,7 +380,7 @@ The following code gives examples of explicit and inferred type arguments: Note that all type relations in the function type must hold at each call site. Specifically, this means that the relation will be checked -against the specific types of the arguments at a given call site. This +against the specific types of the arguments at a given call site. This is also a form of polymorphism, since there may be multiple valid assignments of argument types and a return type so long as the relation is satisfied. @@ -390,7 +390,7 @@ and has the :code:`Broadcast` relation, then there are many different shapes that the arguments in the below call could have that would satisfy the type annotation: -.. code-block:: python +.. code-block:: let %x : Tensor[(100, 100, 100), float32] = %f(%a, %b); %x @@ -416,7 +416,7 @@ but have syntactic sugar in the text format to enter their definitions into the a global function definition includes a global identifier and is allowed to recursively refer to that identifier in the body, as in the following example: -.. code-block:: python +.. code-block:: def @ackermann(%m : Tensor[(), int32], %n : Tensor[(), int32]) -> Tensor[(), int32] { if (%m == 0) { @@ -455,11 +455,11 @@ Tuples Construction ~~~~~~~~~~~~ -The tuple node builds a finite (that is, of statically known size) sequence of heterogeneous data. +The tuple node builds a finite (that is, of statically known size) sequence of heterogeneous data. These tuples match Python's closely, and their fixed length allows for efficient projection of their members. -.. code-block:: python +.. code-block:: fn(%a : Tensor[(10, 10), float32], %b : float32, %c : Tensor[(100, 100), float32]) { let %tup = (%a, %b); // type: (Tensor[(10, 10), float32], float32) @@ -476,7 +476,7 @@ particular member of the tuple. Projections are 0-indexed. For example, the below projection evaluates to :code:`%b`: -.. code-block:: python +.. code-block:: (%a, %b, %c).1 @@ -494,10 +494,10 @@ that may reference the bound identifier. If a type annotation on the bound variable is omitted, Relay attempts to infer the most general type permitted for the variable. -The bound variable in a :code:`let` expression is only in scope +The bound variable in a :code:`let` expression is only in scope in its body, except when the variable defines a function expression. When a :code:`let` expression creates a function, the variable is also -in scope in its value to allow for recursively defined functions +in scope in its value to allow for recursively defined functions (see the previous subsection). The value of a :code:`let` binding is the value of the final expression @@ -505,7 +505,7 @@ after evaluating the bindings it depends on. For example, in the following example the entire expression evaluates to a tensor of shape :code:`(10, 10)` where all elements are 2: -.. code-block:: python +.. code-block:: let %x : Tensor[(10, 10), float32] = Constant(1, (10, 10), float32); %x + %x @@ -518,7 +518,7 @@ For example, the first and second :code:`let` bindings below may be evaluated in either order because neither has a dataflow dependency on the other: -.. code-block:: python +.. code-block:: let %x = %a + %b; let %y = %c + %d; @@ -549,7 +549,7 @@ of this nuance). In Relay's text format, a graph binding can be written as below (note the lack of a :code:`let` keyword and a semicolon): -.. code-block:: python +.. 
code-block:: %1 = %a + %b %2 = %1 + %1 @@ -561,7 +561,7 @@ Python front-end by setting *Python variables* equal to the corresponding Relay using the variables repeatedly, as below (a C++ program using the corresponding API bindings could accomplish the same thing): -.. code-block:: python +.. code-block:: sum1 = relay.add(a, b) sum2 = relay.add(sum1, sum1) @@ -581,7 +581,7 @@ Relay has a simple if-then-else expression that allows programs to branch on a single value of type :code:`bool`, i.e., a zero-rank tensor of booleans (:code:`Tensor[(), bool]`). -.. code-block:: python +.. code-block:: if (%t == %u) { %t @@ -626,7 +626,7 @@ executed; the clause expression is evaluated and returned. For example, suppose we have an ADT for natural numbers: -.. code-block:: python +.. code-block:: data Nat { Z : () -> Nat # zero @@ -635,7 +635,7 @@ For example, suppose we have an ADT for natural numbers: Then the following function subtracts one from a passed nat: -.. code-block:: python +.. code-block:: fn(%v: Nat[]) -> Nat[] { match(%v) { @@ -647,7 +647,7 @@ Then the following function subtracts one from a passed nat: The following function subtracts two from its argument if it is at least two and returns the argument otherwise, using a nested constructor pattern: -.. code-block:: python +.. code-block:: fn(%v : Nat[]) -> Nat[] { match(%v) { @@ -661,7 +661,7 @@ As aforementioned, the ordering of match clauses is relevant. In the below example, the first clause will always match so those below it can never run: -.. code-block:: python +.. code-block:: fn(%v : Nat[]) -> Nat[] { match(%v) { @@ -677,7 +677,7 @@ See :py:class:`~tvm.relay.adt.Match` for its definition and documentation. TempExprs ========= -Program transformations (passes) in Relay may require inserting temporary +Program transformations (passes) in Relay may require inserting temporary state into the program AST to guide further transformations. The :code:`TempExpr` node is provided as a utility to developers for this purpose; nodes inheriting from :code:`TempExpr` cannot appear directly in user-provided @@ -685,7 +685,7 @@ code but may be inserted in a pass. Any :code:`TempExpr` created in a pass should ideally be eliminated before the pass is complete, as a :code:`TempExpr` only stores internal state and has no semantics of its own. -For an example of :code:`TempExpr` being used in a pass, +For an example of :code:`TempExpr` being used in a pass, see :code:`src/relay/pass/alter_op_layout.cc`, which uses :code:`TempExpr` nodes to store information about operator layouts as the pass tries to rearrange operator calls. diff --git a/docs/langref/relay_type.rst b/docs/langref/relay_type.rst index ce00dff755c9..0fc19b7301b7 100644 --- a/docs/langref/relay_type.rst +++ b/docs/langref/relay_type.rst @@ -80,7 +80,7 @@ running a program. For example, here is a simple concrete tensor type corresponding to a 10-by-10 tensor of 32-bit floats: -.. code-block:: python +.. code-block:: Tensor[(10, 10), float32] @@ -101,7 +101,7 @@ For example, in the below code, :code:`%t` is of type :code:`(Tensor[(), bool], Tensor[(10, 10), float32])` and :code:`%c` is of type :code:`Tensor[(10, 10), float32]`. -.. code-block:: python +.. code-block:: let %t = (False, Constant(1, (10, 10), float32)); let %c = %t.1; @@ -116,7 +116,7 @@ Type Parameter Type parameters represent placeholder types used for polymorphism in functions. 
Type parameters are specified according to *kind*, corresponding to the types -those parameters are allowed to replace: +those parameters are allowed to replace: - :code:`Type`, corresponding to top-level Relay types like tensor types, tuple types, and function types - :code:`BaseType`, corresponding to the base type of a tensor (e.g., :code:`float32`, :code:`bool`) @@ -135,7 +135,7 @@ Like normal parameters, concrete arguments must be given for type parameters at For example, :code:`s` below is a type parameter of kind :code:`Shape` and it will be substituted with :code:`(10, 10)` at the call site below: -.. code-block:: python +.. code-block:: def @plus(%t1 : Tensor[s, float32], %t2 : Tensor[s, float32]) { add(%t1, %t2) @@ -212,7 +212,7 @@ and the return type. For example, we can define the relation for :code:`flatten` If we have a relation like :code:`Broadcast` it becomes possible to type operators like :code:`add`: -.. code-block:: python +.. code-block:: add : fn(t1, t2) -> t3 where Broadcast @@ -359,7 +359,7 @@ This subsection uses the simple list ADT (included as a default ADT in Relay) to illustrate the constructs described in the previous sections. Its definition is as follows: -.. code-block:: python +.. code-block:: data List { Nil : () -> List @@ -377,7 +377,7 @@ variable :code:`List` in the constructor definition. Below two instances of lists with their types given, using type calls: -.. code-block:: python +.. code-block:: Cons(1, Cons(2, Nil())) # List[Tensor[(), int32]] Cons((1, 1), Cons((2, 2), Nil())) # List[(Tensor[(), int32], Tensor[(), int32])] @@ -390,7 +390,7 @@ be specified.) Here are two lists that are rejected by the type system because the type parameters do not match: -.. code-block:: python +.. code-block:: # attempting to put an integer on a list of int * int tuples Cons(1, Cons((1, 1), Nil())) diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh index cf45c8c57b36..1f4288632b9d 100755 --- a/tests/scripts/task_sphinx_precheck.sh +++ b/tests/scripts/task_sphinx_precheck.sh @@ -36,7 +36,7 @@ echo "PreCheck sphinx doc generation WARNINGS.." 
cd docs TVM_TUTORIAL_EXEC_PATTERN=none make html 2>/tmp/$$.log.txt -grep -v -E "__mro__|RemovedInSphinx|UserWarning|FutureWarning" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true +grep -v -E "__mro__|RemovedInSphinx|UserWarning|FutureWarning|Keras" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true echo "---------Sphinx Log----------" cat /tmp/$$.logclean.txt echo "-----------------------------" From 2355caa8afdc8e6a3638c9514f57686737cbd724 Mon Sep 17 00:00:00 2001 From: Ina Dobreva <55383260+inadob@users.noreply.github.com> Date: Sat, 29 Feb 2020 23:30:16 +0200 Subject: [PATCH 58/73] [Frontend][TFLite] Add parser support for l2_normalization (#4966) * [Frontend][TFLite] Add parser support for l2_normalization * TF doesn't provide uint8 support * TFL does the normalization only if it's over the last axis * TFL uses only the default value for expilon * Change error message --- python/tvm/relay/frontend/tflite.py | 47 ++++++++++++++++++++ tests/python/frontend/tflite/test_forward.py | 20 +++++++++ 2 files changed, 67 insertions(+) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 3a17083d60ac..5d26d9807aea 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -122,6 +122,7 @@ def __init__(self, model, subgraph, exp_tab): 'LOGICAL_OR': self.convert_logical_or, 'DETECTION_POSTPROCESS': self.convert_detection_postprocess, 'SQUARE': self.convert_square, + 'L2_NORMALIZATION': self.convert_l2_normalization, } def check_unsupported_ops(self): @@ -405,6 +406,52 @@ def convert_resize_nearest_neighbor(self, op): """Convert TFLite RESIZE_NEAREST_NEIGHBOR""" return self._convert_resize("nearest_neighbor", op) + def convert_l2_normalization(self, op): + """Convert TFLite L2_NORMALIZATION """ + try: + from tflite.Operator import Operator + from tflite.BuiltinOptions import BuiltinOptions + from tflite.L2NormOptions import L2NormOptions + from tflite.ActivationFunctionType import ActivationFunctionType + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 1, "input tensors length should be 1" + input_tensor = input_tensors[0] + in_expr = self.get_expr(input_tensor.tensor_idx) + + output_tensors = self.get_output_tensors(op) + assert len(output_tensors) == 1, "output tensors length should be 1" + output_tensor = output_tensors[0] + + assert op.BuiltinOptionsType() == BuiltinOptions.L2NormOptions + op_options = op.BuiltinOptions() + l2_norm_options = L2NormOptions() + l2_norm_options.Init(op_options.Bytes, op_options.Pos) + fused_activation_fn = l2_norm_options.FusedActivationFunction() + + # TFLite supports normalization only over the last dim + input_tensor_rank = len(input_tensor.tensor.ShapeAsNumpy()) + + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFLite quantized L2_NORMALIZATION operator is not supported yet.') + # TFL uses only the default epsilon value + out = _op.nn.l2_normalize(in_expr, eps=1e-12, axis=[input_tensor_rank - 1]) + + # if we have fused activation fn + if fused_activation_fn != ActivationFunctionType.NONE: + if not output_tensor.qnn_params: + out = self.convert_fused_activation_function(out, fused_activation_fn) + else: + raise tvm.error.OpNotImplemented( + 'TFLite quantized L2_NORMALIZATION operator\ + with fused activation function is not supported yet.') + + return out + def convert_logistic(self, op): """Convert TFLite LOGISTIC""" try: diff 
--git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 4a16325e3e40..ced24250c68d 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -33,6 +33,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops +from tensorflow.python.ops import nn_impl from tensorflow.python.ops import variables try: from tensorflow import lite as interpreter_wrapper @@ -1263,6 +1264,24 @@ def test_forward_unpack(): _test_unpack(np.array(np.random.uniform(0, 5, (3, 6)), dtype=np.int32), axis=-2, num_unpacks=3) _test_unpack(np.array(np.random.uniform(0, 5, (2, 3, 4)), dtype=np.int32), axis=-3, num_unpacks=2) +####################################################################### +# L2 normalization +# ---------------- + +def _test_l2_normalization(data, axis, fused_activation_function=None): + """ One iteration of L2_NORMALIZATION """ + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) + out = nn_impl.l2_normalize(in_data, axis) + out = with_fused_activation_function(out, fused_activation_function) + compare_tflite_with_tvm(data, 'Placeholder:0', [in_data], [out]) + +def test_forward_l2_normalization(): + """ L2_NORMALIZATION """ + data = np.random.uniform(size=(3, 6, 4)).astype('float32') + _test_l2_normalization(data, axis=2) + _test_l2_normalization(data, axis=2, fused_activation_function="RELU") + ####################################################################### # Logistic # -------- @@ -1649,6 +1668,7 @@ def test_forward_mediapipe_hand_landmark(): test_forward_relu() test_forward_prelu() test_forward_fully_connected() + test_forward_l2_normalization() # Elemwise test_all_elemwise() From 474c70d7cb0ed4455aad9c04c2e6dd02f635c1c5 Mon Sep 17 00:00:00 2001 From: jmorrill Date: Sat, 29 Feb 2020 13:52:22 -0800 Subject: [PATCH 59/73] Added CopyFromBytes and CopyToBytes convenience methods to NDArray. Fixed typos. (#4970) * Added CopyFromBytes and CopyToBytes convenience methods. Fixed typos. * Removed unneed argument check * Use TVMArrayCopyFrom/ToBytes methods * Moved CopyFrom/ToBytes to ndarray.cc * CopyToBytes impl was using CopyFromBytes. Fixed * changed inline to TVM_DLL * Used impl from TVMArrayCopyTo/FromBytes into NDArray CopyTo/FromBytes * Move implementation of all CopyFrom/ToBytes into a common impls * make arg const * simplify method impl --- include/tvm/runtime/ndarray.h | 26 ++++++++++++--- src/runtime/ndarray.cc | 60 +++++++++++++++++++++++------------ 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 090cacff5c3a..2441ab659b84 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -68,19 +68,37 @@ class NDArray : public ObjectRef { /*! * \brief Copy data content from another array. * \param other The source array to be copied from. - * \note The copy may happen asynchrously if it involves a GPU context. + * \note The copy may happen asynchronously if it involves a GPU context. * TVMSynchronize is necessary. */ inline void CopyFrom(const DLTensor* other); inline void CopyFrom(const NDArray& other); + /*! + * \brief Copy data content from a byte buffer. + * \param data The source bytes to be copied from. + * \param nbytes The size of the buffer in bytes + * Must be equal to the size of the NDArray. 
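   A hedged usage sketch (editorial; the shape, dtype, and device below are
   illustrative, and NDArray::Empty is the existing allocation helper):

       std::vector<float> host(4 * 4, 1.0f);
       tvm::runtime::NDArray arr = tvm::runtime::NDArray::Empty(
           {4, 4}, DLDataType{kDLFloat, 32, 1}, DLContext{kDLCPU, 0});
       arr.CopyFromBytes(host.data(), host.size() * sizeof(float));  // host to array
       arr.CopyToBytes(host.data(), host.size() * sizeof(float));    // array to host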
+ * \note The copy may happen asynchronously if it involves a GPU context. + * TVMSynchronize is necessary. + */ + TVM_DLL void CopyFromBytes(const void* data, size_t nbytes); /*! * \brief Copy data content into another array. * \param other The source array to be copied from. - * \note The copy may happen asynchrously if it involves a GPU context. + * \note The copy may happen asynchronously if it involves a GPU context. * TVMSynchronize is necessary. */ inline void CopyTo(DLTensor* other) const; inline void CopyTo(const NDArray& other) const; + /*! + * \brief Copy data content into another array. + * \param data The source bytes to be copied from. + * \param nbytes The size of the data buffer. + * Must be equal to the size of the NDArray. + * \note The copy may happen asynchronously if it involves a GPU context. + * TVMSynchronize is necessary. + */ + TVM_DLL void CopyToBytes(void* data, size_t nbytes) const; /*! * \brief Copy the data to another context. * \param ctx The target context. @@ -182,7 +200,7 @@ class NDArray : public ObjectRef { /*! * \brief Save a DLTensor to stream - * \param strm The outpu stream + * \param strm The output stream * \param tensor The tensor to be saved. */ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); @@ -205,7 +223,7 @@ class NDArray::ContainerBase { DLTensor dl_tensor; /*! - * \brief addtional context, reserved for recycling + * \brief additional context, reserved for recycling * \note We can attach additional content here * which the current container depend on * (e.g. reference to original memory when creating views). diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index 91002c951c04..ff2f34ee6e4e 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -60,6 +60,32 @@ inline size_t GetDataAlignment(const DLTensor& arr) { return align; } +void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { + TVMContext cpu_ctx; + cpu_ctx.device_type = kDLCPU; + cpu_ctx.device_id = 0; + size_t arr_size = GetDataSize(*handle); + CHECK_EQ(arr_size, nbytes) + << "ArrayCopyFromBytes: size mismatch"; + DeviceAPI::Get(handle->ctx)->CopyDataFromTo( + data, 0, + handle->data, static_cast(handle->byte_offset), + nbytes, cpu_ctx, handle->ctx, handle->dtype, nullptr); +} + +void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { + TVMContext cpu_ctx; + cpu_ctx.device_type = kDLCPU; + cpu_ctx.device_id = 0; + size_t arr_size = GetDataSize(*handle); + CHECK_EQ(arr_size, nbytes) + << "ArrayCopyToBytes: size mismatch"; + DeviceAPI::Get(handle->ctx)->CopyDataFromTo( + handle->data, static_cast(handle->byte_offset), + data, 0, + nbytes, handle->ctx, cpu_ctx, handle->dtype, nullptr); +} + struct NDArray::Internal { // Default deleter for the container static void DefaultDeleter(Object* ptr_obj) { @@ -185,6 +211,18 @@ NDArray NDArray::FromDLPack(DLManagedTensor* tensor) { return NDArray(GetObjectPtr(data)); } +void NDArray::CopyToBytes(void* data, size_t nbytes) const { + CHECK(data != nullptr); + CHECK(data_ != nullptr); + ArrayCopyToBytes(&get_mutable()->dl_tensor, data, nbytes); +} + +void NDArray::CopyFromBytes(const void* data, size_t nbytes) { + CHECK(data != nullptr); + CHECK(data_ != nullptr); + ArrayCopyFromBytes(&get_mutable()->dl_tensor, data, nbytes); +} + void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stream) { @@ -286,16 +324,7 @@ int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { API_BEGIN(); - TVMContext cpu_ctx; - 
cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - size_t arr_size = GetDataSize(*handle); - CHECK_EQ(arr_size, nbytes) - << "TVMArrayCopyFromBytes: size mismatch"; - DeviceAPI::Get(handle->ctx)->CopyDataFromTo( - data, 0, - handle->data, static_cast(handle->byte_offset), - nbytes, cpu_ctx, handle->ctx, handle->dtype, nullptr); + ArrayCopyFromBytes(handle, data, nbytes); API_END(); } @@ -303,15 +332,6 @@ int TVMArrayCopyToBytes(TVMArrayHandle handle, void* data, size_t nbytes) { API_BEGIN(); - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - size_t arr_size = GetDataSize(*handle); - CHECK_EQ(arr_size, nbytes) - << "TVMArrayCopyToBytes: size mismatch"; - DeviceAPI::Get(handle->ctx)->CopyDataFromTo( - handle->data, static_cast(handle->byte_offset), - data, 0, - nbytes, handle->ctx, cpu_ctx, handle->dtype, nullptr); + ArrayCopyToBytes(handle, data, nbytes); API_END(); } From 92a2427845106644c99efdddc918c293c0bf4765 Mon Sep 17 00:00:00 2001 From: masahi Date: Sun, 1 Mar 2020 09:51:49 +0900 Subject: [PATCH 60/73] [Torch] Upsampling op support and enable registering a user defined op conversion map (#4961) * add custom conversion map * add roi align test using custom convert map * refactor test * add support for upsampling op and test on segmentation models * remove redundant no_grad * add upsampling test case * make the default custom map None, instead of empty dict * updated tests, remove packaging and drop PT 1.2 support * add better support for aten::to and tests * add a note on dilation in x86 --- python/tvm/relay/frontend/pytorch.py | 80 +++- tests/python/frontend/pytorch/test_forward.py | 345 +++++++++++------- tutorials/frontend/from_pytorch.py | 2 +- 3 files changed, 285 insertions(+), 142 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index fd66e3c1f367..b256faa5d6f9 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -19,7 +19,6 @@ # pylint: disable=import-outside-toplevel, simplifiable-if-expression, unnecessary-comprehension """PT: PyTorch frontend.""" import itertools -from packaging import version import numpy as np @@ -31,6 +30,7 @@ from .. 
import op as _op from .common import get_relay_op from .common import infer_shape as _infer_shape +from .common import infer_value as _infer_value __all__ = ["from_pytorch"] @@ -614,6 +614,61 @@ def _impl(inputs, input_types): return _op.tensor.sqrt(data) return _impl +def _floor(): + def _impl(inputs, input_types): + data = inputs[0] + return _op.floor(data) + return _impl + +def _to(): + def _impl(inputs, input_types): + data = inputs[0] + if inputs[3] in ["cpu", "cuda"]: + return data + # special handling for aten::to(data, 6, _, _, _) case + # 6 means dtype = float + # this happens when converting upsampling with scale factor + cast_func = { + 6: float, + 3: int, + } + cast_func_expr = { + 6: lambda x: _op.cast(x, "float32"), + 3: lambda x: _op.cast(x, "int32"), + } + if inputs[1] in cast_func and not isinstance(data, _expr.Expr): + return cast_func[inputs[1]](data) + elif inputs[1] in cast_func and isinstance(data, _expr.Expr): + return cast_func_expr[inputs[1]](data) + return data + + return _impl + +def _upsample(method): + def _impl(inputs, input_types): + if isinstance(inputs[1], _expr.Var): + out_size = _infer_shape(inputs[1]) + elif isinstance(inputs[1], list): + infer_res = [_infer_value(size, {}) for size in inputs[1]] + out_size = [np.asscalar(res.asnumpy().astype(np.int)) + for res in infer_res] + + data = inputs[0] + + if len(inputs) > 2: + align_corners = inputs[2] + else: + align_corners = False + + if align_corners: + coord_trans = "align_corners" + else: + coord_trans = "half_pixel" + + return _op.image.resize(data, out_size, "NCHW", method, coord_trans) + + return _impl + # Helper functions for operator implementation def _convert_data_type(input_type): @@ -686,7 +741,7 @@ def _convert_elemwise_input(data, input_type): "aten::div_" : _elemwise("divide"), "aten::ones" : _ones(), "aten::zeros" : _zeros(), - "aten::to" : _identity(), + "aten::to" : _to(), "aten::unsqueeze" : _unsqueeze(), "aten::cat" : _concatenate(), "aten::slice" : _slice(), @@ -729,15 +784,18 @@ def _convert_elemwise_input(data, input_type): "aten::permute" : _transpose(), "aten::sum" : _reduce("sum"), "aten::prod" : _reduce("prod"), - "aten::sqrt" : _sqrt() + "aten::sqrt" : _sqrt(), + 'aten::floor' : _floor(), + "aten::detach" : _identity(), + "aten::upsample_bilinear2d" : _upsample("bilinear"), + "aten::upsample_nearest2d" : _upsample("nearest_neighbor"), } def _run_jit_passes(graph): """ The inline pass is necessary to unwrap prim::CallMethod """ import torch - if version.parse(torch.__version__) >= version.parse("1.4.0"): - torch._C._jit_pass_inline(graph) + torch._C._jit_pass_inline(graph) def _is_int_seq(seq): @@ -985,8 +1043,7 @@ def parse_operators(operators, outputs, output_index_map, ret_name): def get_all_op_names(graph): """ Return all operator names in the input graph """ - nodes = list(graph.nodes()) - return set(node.kind() for node in nodes) + return set(node.kind() for node in graph.nodes()) def get_graph_input_names(script_module): @@ -997,7 +1054,7 @@ def get_graph_input_names(script_module): return ir_inputs[1:] # remove self at the 0th arg -def from_pytorch(script_module, input_shapes): +def from_pytorch(script_module, input_shapes, custom_convert_map=None): """ Load PyTorch model in the form of a scripted PyTorch model and convert into relay. The companion parameters will be handled automatically. @@ -1011,6 +1068,9 @@ def from_pytorch(script_module, input_shapes): Graph level input shape dictionary The keys should be the same one returned by get_graph_input_names(...) 
above + custom_convert_map: Dictionary of str to Relay op + A custom op conversion map in the same format as _convert_map above + Returns ------- mod : tvm.relay.Module @@ -1021,6 +1081,10 @@ def from_pytorch(script_module, input_shapes): """ graph = script_module.graph.copy() _run_jit_passes(graph) + + if custom_convert_map: + _convert_map.update(custom_convert_map) + op_names = get_all_op_names(graph) _report_missing_conversion(op_names) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 831389b7ebf5..c2ff94de546f 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -17,15 +17,12 @@ # pylint: disable=import-self, invalid-name, unused-argument """Unit tests for various models and operators""" from time import time -import os import sys -from tempfile import TemporaryDirectory from scipy.stats import t as tdistr import numpy as np import torch from torch.nn import Module import tvm -from tvm import te import torchvision from tvm import relay @@ -36,22 +33,6 @@ sys.setrecursionlimit(10000) -def _vectorize(ten): - return ten.reshape(-1) - -def atol(tru, est): - def _atol_elt(tru, est): - return abs(tru - est) - tru = _vectorize(tru) - est = _vectorize(est) - return max([_atol_elt(x, y) for x, y in zip(tru, est)]) - -def rtol(tru, est): - def _rtol_elt(tru, est): - return abs(tru - est) / min(abs(tru), abs(est)) - tru = _vectorize(tru) - est = _vectorize(est) - return max([_rtol_elt(x, y) for x, y in zip(tru, est)]) def assert_shapes_match(tru, est): if tru.shape != est.shape: @@ -77,7 +58,7 @@ def load_torchvision(model_name): input_data[:, channel] /= std[channel] model = getattr(torchvision.models, model_name)(pretrained=True) model = model.float().eval() - return model, input_data + return model, [input_data] def load_pretrainedmodels(model_name): """Given a model name, returns a pretrainedmodels.pytorch model in eval @@ -89,7 +70,7 @@ def load_pretrainedmodels(model_name): for channel in range(3): input_data[:, channel] -= model.mean[channel] input_data[:, channel] /= model.std[channel] - return model, input_data + return model, [input_data] def load_model(model_name): """Given a model name, returns a model as well as an example input.""" @@ -116,7 +97,7 @@ def measure_latency(model, input_shapes, output_shapes, thresh, dryruns=40): latencies = [] count = 0 while True: - if isinstance(model, torch.nn.Module): + if isinstance(model, Module): input_data = [torch.rand(shape).float() for shape in input_shapes] if torch.cuda.is_available(): input_data = list(map(lambda x: x.cuda(), input_data)) @@ -153,23 +134,34 @@ def measure_latency(model, input_shapes, output_shapes, thresh, dryruns=40): if err < thresh: return est -def verify_model(model_name, input_data=[]): +def verify_model(model_name, input_data=[], + custom_convert_map={}, + ctx_list=ctx_list()): """Assert that the output of a compiled model matches with that of its baseline.""" - if len(input_data) == 0: + if isinstance(model_name, str): baseline_model, baseline_input = load_model(model_name) - else: + elif isinstance(input_data, list): baseline_model = model_name baseline_input = input_data + elif isinstance(input_data, torch.Tensor) or len(input_data.shape) == 0: + baseline_model = model_name + baseline_input = [input_data] + else: + assert False, "Unexpected input format" + if torch.cuda.is_available(): baseline_model = baseline_model.cuda() - baseline_input = baseline_input.cuda() + baseline_input = 
[inp.cuda() for inp in baseline_input] + with torch.no_grad(): - baseline_outputs = baseline_model(baseline_input) + baseline_outputs = baseline_model(*baseline_input) + if isinstance(baseline_outputs, tuple): baseline_outputs = tuple(out.cpu().numpy() for out in baseline_outputs) else: baseline_outputs = (baseline_outputs.float().cpu().numpy(),) + trace = torch.jit.trace(baseline_model, baseline_input).float().eval() if torch.cuda.is_available(): @@ -177,17 +169,21 @@ def verify_model(model_name, input_data=[]): else: trace = trace.cpu() - input_name = get_graph_input_names(trace)[0] # only one input - input_shapes = {input_name: list(baseline_input.shape)} - mod, params = relay.frontend.from_pytorch(trace, input_shapes) - compiled_input = {input_name: tvm.nd.array(baseline_input.cpu().numpy())} + input_names = get_graph_input_names(trace) + input_shapes = dict(zip(input_names, + [inp.shape for inp in baseline_input])) + mod, params = relay.frontend.from_pytorch(trace, input_shapes, + custom_convert_map) + compiled_input = dict(zip(input_names, + [inp.cpu().numpy() for inp in baseline_input])) with relay.build_config(opt_level=3): - for target, ctx in ctx_list(): + for target, ctx in ctx_list: relay_graph, relay_lib, relay_params = relay.build(mod, target=target, params=params) relay_model = graph_runtime.create(relay_graph, relay_lib, ctx) relay_model.set_input(**relay_params) - relay_model.set_input(**compiled_input) + for name, inp in compiled_input.items(): + relay_model.set_input(name, inp) relay_model.run() for i, baseline_output in enumerate(baseline_outputs): @@ -228,12 +224,11 @@ def forward(self, *args): ones = ones.cuda() return args[0] + ones - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Add1().float().eval(), input_data=input_data) - verify_model(Add2().float().eval(), input_data=input_data) - verify_model(Add3().float().eval(), input_data=input_data) - verify_model(Add4().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Add1().float().eval(), input_data=input_data) + verify_model(Add2().float().eval(), input_data=input_data) + verify_model(Add3().float().eval(), input_data=input_data) + verify_model(Add4().float().eval(), input_data=input_data) def test_forward_subtract(): torch.set_grad_enabled(False) @@ -261,12 +256,11 @@ def forward(self, *args): ones = ones.cuda() return args[0] - ones - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Subtract1().float().eval(), input_data=input_data) - verify_model(Subtract2().float().eval(), input_data=input_data) - verify_model(Subtract3().float().eval(), input_data=input_data) - verify_model(Subtract4().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Subtract1().float().eval(), input_data=input_data) + verify_model(Subtract2().float().eval(), input_data=input_data) + verify_model(Subtract3().float().eval(), input_data=input_data) + verify_model(Subtract4().float().eval(), input_data=input_data) def test_forward_multiply(): torch.set_grad_enabled(False) @@ -294,12 +288,11 @@ def forward(self, *args): ones = ones.cuda() return args[0] * ones - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Multiply1().float().eval(), input_data=input_data) - verify_model(Multiply2().float().eval(), input_data=input_data) - verify_model(Multiply3().float().eval(), input_data=input_data) - verify_model(Multiply4().float().eval(), 
input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Multiply1().float().eval(), input_data=input_data) + verify_model(Multiply2().float().eval(), input_data=input_data) + verify_model(Multiply3().float().eval(), input_data=input_data) + verify_model(Multiply4().float().eval(), input_data=input_data) def test_forward_unsqueeze(): torch.set_grad_enabled(False) @@ -327,10 +320,9 @@ def forward(self, *args): c = (args[0][:, :, 2] + 5) * 13 return torch.cat([t.unsqueeze(2) for t in [a, b, c]], 2) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Concatenate1().float().eval(), input_data=input_data) - verify_model(Concatenate2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Concatenate1().float().eval(), input_data=input_data) + verify_model(Concatenate2().float().eval(), input_data=input_data) def test_forward_relu(): torch.set_grad_enabled(False) @@ -340,9 +332,8 @@ class ReLU1(Module): def forward(self, *args): return torch.nn.ReLU()(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(ReLU1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(ReLU1().float().eval(), input_data=input_data) def test_forward_adaptiveavgpool(): torch.set_grad_enabled(False) @@ -356,10 +347,9 @@ class AdaptiveAvgPool2D2(Module): def forward(self, *args): return torch.nn.AdaptiveAvgPool2d([10, 10])(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(AdaptiveAvgPool2D1().float().eval(), input_data=input_data) - verify_model(AdaptiveAvgPool2D2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(AdaptiveAvgPool2D1().float().eval(), input_data=input_data) + verify_model(AdaptiveAvgPool2D2().float().eval(), input_data=input_data) def test_forward_maxpool(): torch.set_grad_enabled(False) @@ -373,10 +363,9 @@ class MaxPool2D2(Module): def forward(self, *args): return torch.nn.MaxPool2d(kernel_size=[10, 10])(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(MaxPool2D1().float().eval(), input_data=input_data) - verify_model(MaxPool2D2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(MaxPool2D1().float().eval(), input_data=input_data) + verify_model(MaxPool2D2().float().eval(), input_data=input_data) def test_forward_avgpool(): torch.set_grad_enabled(False) @@ -386,9 +375,8 @@ class AvgPool2D1(Module): def forward(self, *args): return torch.nn.AvgPool2d(kernel_size=[10, 10])(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(AvgPool2D1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(AvgPool2D1().float().eval(), input_data=input_data) def test_forward_hardtanh(): torch.set_grad_enabled(False) @@ -398,9 +386,8 @@ class HardTanh1(Module): def forward(self, *args): return torch.nn.Hardtanh()(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(HardTanh1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(HardTanh1().float().eval(), input_data=input_data) def test_forward_conv(): torch.set_grad_enabled(False) @@ -433,11 +420,10 @@ def __init__(self): def forward(self, *args): return self.softmax(self.conv(args[0])) - with torch.no_grad(): 
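# A sketch of the user-defined conversion map this patch enables; the op and
# converter here are illustrative (the real test in this series exercises
# torchvision's roi_align through the same hook), and trace/input_shapes are
# assumed to be prepared as in verify_model above.
from tvm import relay
from tvm.relay import op as _op

def _erf(inputs, input_types):  # hypothetical converter; signature per _convert_map
    return _op.erf(inputs[0])

custom_map = {"aten::erf": _erf}
mod, params = relay.frontend.from_pytorch(trace, input_shapes,
                                          custom_convert_map=custom_map)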
- input_data = torch.rand(input_shape).float() - verify_model(Conv2D1().float().eval(), input_data=input_data) - verify_model(Conv2D2().float().eval(), input_data=input_data) - verify_model(Conv2D3().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Conv2D1().float().eval(), input_data=input_data) + verify_model(Conv2D2().float().eval(), input_data=input_data) + verify_model(Conv2D3().float().eval(), input_data=input_data) def test_forward_threshold(): torch.set_grad_enabled(False) @@ -447,9 +433,8 @@ class Threshold1(Module): def forward(self, *args): return torch.nn.Threshold(0, 0)(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Threshold1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Threshold1().float().eval(), input_data=input_data) def test_forward_contiguous(): torch.set_grad_enabled(False) @@ -459,9 +444,8 @@ class Contiguous1(Module): def forward(self, *args): return args[0].contiguous() - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Contiguous1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Contiguous1().float().eval(), input_data=input_data) def test_forward_batchnorm(): torch.set_grad_enabled(False) @@ -481,10 +465,9 @@ def __init__(self): def forward(self, *args): return self.batch_norm(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(BatchNorm1().float().eval(), input_data=input_data) - verify_model(BatchNorm2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(BatchNorm1().float().eval(), input_data=input_data) + verify_model(BatchNorm2().float().eval(), input_data=input_data) def test_forward_transpose(): torch.set_grad_enabled(False) @@ -498,10 +481,9 @@ class Transpose2(Module): def forward(self, *args): return args[0].transpose(-2, -1) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Transpose1().float().eval(), input_data=input_data) - verify_model(Transpose2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Transpose1().float().eval(), input_data=input_data) + verify_model(Transpose2().float().eval(), input_data=input_data) def test_forward_size(): torch.set_grad_enabled(False) @@ -511,9 +493,8 @@ class Size1(Module): def forward(self, *args): return float(args[0].size(0)) * args[0] - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Size1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Size1().float().eval(), input_data=input_data) def test_forward_view(): torch.set_grad_enabled(False) @@ -527,10 +508,9 @@ class View2(Module): def forward(self, *args): return args[0].view(args[0].shape[0], -1) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(View1().float().eval(), input_data=input_data) - verify_model(View2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(View1().float().eval(), input_data=input_data) + verify_model(View2().float().eval(), input_data=input_data) def test_forward_select(): torch.set_grad_enabled(False) @@ -540,9 +520,8 @@ class Select1(Module): def forward(self, *args): return args[0].select(1, 1) - with torch.no_grad(): - input_data = 
torch.rand(input_shape).float() - verify_model(Select1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Select1().float().eval(), input_data=input_data) def test_forward_clone(): torch.set_grad_enabled(False) @@ -552,9 +531,8 @@ class Clone1(Module): def forward(self, *args): return args[0].clone() - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Clone1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Clone1().float().eval(), input_data=input_data) def test_forward_logsoftmax(): torch.set_grad_enabled(False) @@ -564,9 +542,8 @@ class LogSoftmax1(Module): def forward(self, *args): return torch.nn.LogSoftmax(dim=1)(args[0][0, 0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(LogSoftmax1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(LogSoftmax1().float().eval(), input_data=input_data) def test_forward_sigmoid(): torch.set_grad_enabled(False) @@ -576,9 +553,8 @@ class Sigmoid1(Module): def forward(self, *args): return torch.nn.Sigmoid()(args[0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Sigmoid1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Sigmoid1().float().eval(), input_data=input_data) def test_forward_dense(): torch.set_grad_enabled(False) @@ -598,10 +574,9 @@ def __init__(self): def forward(self, *args): return self.linear(args[0][0, 0]) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Dense1().float().eval(), input_data=input_data) - verify_model(Dense2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Dense1().float().eval(), input_data=input_data) + verify_model(Dense2().float().eval(), input_data=input_data) def test_forward_dropout(): torch.set_grad_enabled(False) @@ -611,9 +586,8 @@ class Dropout1(Module): def forward(self, *args): return torch.nn.functional.dropout(args[0][0, 0], 0.5, False) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Dropout1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Dropout1().float().eval(), input_data=input_data) def test_forward_slice(): torch.set_grad_enabled(False) @@ -627,10 +601,9 @@ class Slice2(Module): def forward(self, *args): return args[0][0, :, :, :] - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Slice1().float().eval(), input_data=input_data) - verify_model(Slice2().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Slice1().float().eval(), input_data=input_data) + verify_model(Slice2().float().eval(), input_data=input_data) def test_forward_mean(): torch.set_grad_enabled(False) @@ -640,9 +613,8 @@ class Mean1(Module): def forward(self, *args): return args[0].mean(2) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Mean1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Mean1().float().eval(), input_data=input_data) def test_forward_expand(): torch.set_grad_enabled(False) @@ -652,9 +624,8 @@ class Expand1(Module): def forward(self, *args): return args[0].expand((3, -1, -1, -1)) - with torch.no_grad(): - input_data = 
torch.rand(input_shape).float() - verify_model(Expand1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Expand1().float().eval(), input_data=input_data) def test_forward_pow(): torch.set_grad_enabled(False) @@ -664,9 +635,8 @@ class Pow1(Module): def forward(self, *args): return args[0] ** 2 - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Pow1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Pow1().float().eval(), input_data=input_data) def test_forward_chunk(): torch.set_grad_enabled(False) @@ -677,9 +647,61 @@ def forward(self, *args): chunks = args[0].chunk(7, 2) return torch.cat(chunks, 2) - with torch.no_grad(): - input_data = torch.rand(input_shape).float() - verify_model(Chunk1().float().eval(), input_data=input_data) + input_data = torch.rand(input_shape).float() + verify_model(Chunk1().float().eval(), input_data=input_data) + +def test_upsample(): + class Upsample(Module): + def __init__(self, size=None, scale=None, + mode="nearest", align_corners=None): + super().__init__() + self.size = size + self.scale = scale + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + return torch.nn.functional.interpolate(x, size=self.size, + scale_factor=self.scale, + mode=self.mode, + align_corners=self.align_corners) + inp = torch.rand((1, 3, 32, 32)) + verify_model(Upsample(size=(64, 64), mode="nearest"), inp) + verify_model(Upsample(scale=2, mode="nearest"), inp) + verify_model(Upsample(size=(50, 50), mode="nearest"), inp) + verify_model(Upsample(size=(64, 64), mode="bilinear", align_corners=True), inp) + verify_model(Upsample(scale=2, mode="bilinear", align_corners=True), inp) + verify_model(Upsample(size=(50, 50), mode="bilinear", align_corners=True), inp) + +def test_to(): + """ test for aten::to(...) 
""" + class ToCPU(Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.to("cpu") + + class ToFloat(Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.float() + + class ToInt(Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.int() + + verify_model(ToCPU().eval(), torch.rand((1, 3, 32, 32))) + verify_model(ToFloat().eval(), torch.zeros((1, 3, 32, 32), dtype=torch.int)) + verify_model(ToFloat().eval(), torch.tensor(2, dtype=torch.int)) + verify_model(ToInt().eval(), torch.zeros((1, 3, 32, 32))) + verify_model(ToInt().eval(), torch.tensor(2.0)) + # Model tests def test_resnet18(): @@ -730,6 +752,57 @@ def test_vgg11_bn(): """ +def test_custom_conversion_map(): + def get_roi_align(): + pool_size = 5 + n_channels = 2 * (pool_size ** 2) + x = torch.rand(2, n_channels, 10, 10) + rois = torch.tensor([[0, 0, 0, 9, 9], # format is (xyxy) + [0, 0, 5, 4, 9], + [0, 5, 5, 9, 9], + [1, 0, 0, 9, 9]], dtype=torch.float) + roi_align = torchvision.ops.RoIAlign(pool_size, spatial_scale=1, + sampling_ratio=-1) + return roi_align.eval(), [x, rois] + + def convert_roi_align(): + def _impl(inputs, input_types): + spatial_scale = inputs[2] + pooled_size = (inputs[3], inputs[4]) + sampling_ratio = inputs[5] + return relay.op.vision.roi_align(inputs[0], inputs[1], + pooled_size, spatial_scale, + sampling_ratio) + return _impl + + custom_map = {'torchvision::roi_align': convert_roi_align()} + model, inputs = get_roi_align() + + verify_model(model, inputs, custom_map) + + +def test_segmentaton_models(): + class SegmentationModelWrapper(Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, inp): + out = self.model(inp) + return out["out"] + + fcn = torchvision.models.segmentation.fcn_resnet101(pretrained=True) + deeplab = torchvision.models.segmentation.deeplabv3_resnet101(pretrained=True) + + inp = [torch.rand((1, 3, 300, 300), dtype=torch.float)] + + for model in [fcn, deeplab]: + # depthwise + dilated covolution not supported on x86 + # see https://github.com/apache/incubator-tvm/issues/4962 + verify_model(SegmentationModelWrapper(model.eval()), inp, + ctx_list=[("cuda", tvm.gpu(0))]) + + if __name__ == "__main__": # Single operator tests test_forward_add() @@ -760,6 +833,8 @@ def test_vgg11_bn(): test_forward_expand() test_forward_pow() test_forward_chunk() + test_upsample() + test_to() # Model tests test_resnet18() @@ -770,3 +845,7 @@ def test_vgg11_bn(): test_googlenet() test_mnasnet0_5() test_mobilenet_v2() + + test_custom_conversion_map() + + test_segmentaton_models() diff --git a/tutorials/frontend/from_pytorch.py b/tutorials/frontend/from_pytorch.py index 503f64a4e7d9..1c568ceb3ef5 100644 --- a/tutorials/frontend/from_pytorch.py +++ b/tutorials/frontend/from_pytorch.py @@ -37,7 +37,7 @@ PyTorch versions should be backwards compatible but should be used with the proper TorchVision version. -Currently, TVM supports PyTorch 1.4, 1.3, and 1.2. Other versions may +Currently, TVM supports PyTorch 1.4 and 1.3. Other versions may be unstable. 
""" From 900d99cd9fa7367756b1d6b0682ac032759a8e39 Mon Sep 17 00:00:00 2001 From: zhengdi Date: Mon, 2 Mar 2020 00:33:59 +0800 Subject: [PATCH 61/73] [TOPI] fix docs errors (#4973) --- topi/python/topi/arm_cpu/injective.py | 2 +- topi/python/topi/cuda/injective.py | 2 +- topi/python/topi/cuda/softmax.py | 2 +- topi/python/topi/generic/injective.py | 2 +- topi/python/topi/hls/injective.py | 2 +- topi/python/topi/opengl/injective.py | 2 +- topi/python/topi/opengl/softmax.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/topi/python/topi/arm_cpu/injective.py b/topi/python/topi/arm_cpu/injective.py index 696b70895825..966520088bc7 100644 --- a/topi/python/topi/arm_cpu/injective.py +++ b/topi/python/topi/arm_cpu/injective.py @@ -78,7 +78,7 @@ def schedule_concatenate(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of concatenate in the format of an array of tensors. Returns diff --git a/topi/python/topi/cuda/injective.py b/topi/python/topi/cuda/injective.py index 303fe5f7cc77..bd3e01dd6faa 100644 --- a/topi/python/topi/cuda/injective.py +++ b/topi/python/topi/cuda/injective.py @@ -72,7 +72,7 @@ def schedule_injective(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of injective in the format of an array of tensors. Returns diff --git a/topi/python/topi/cuda/softmax.py b/topi/python/topi/cuda/softmax.py index ded3ff9cfff8..54d5bfbae121 100644 --- a/topi/python/topi/cuda/softmax.py +++ b/topi/python/topi/cuda/softmax.py @@ -26,7 +26,7 @@ def schedule_softmax(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of softmax in the format of an array of tensors. Returns diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 50de7988be10..fa6aee4864ec 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -45,7 +45,7 @@ def schedule_injective(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of injective in the format of an array of tensors. Returns diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py index 6d0c6f4928ec..4c1fdf44067d 100644 --- a/topi/python/topi/hls/injective.py +++ b/topi/python/topi/hls/injective.py @@ -45,7 +45,7 @@ def schedule_injective(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of injective in the format of an array of tensors. Returns diff --git a/topi/python/topi/opengl/injective.py b/topi/python/topi/opengl/injective.py index 3d45247413d2..a5944f7eedb2 100644 --- a/topi/python/topi/opengl/injective.py +++ b/topi/python/topi/opengl/injective.py @@ -42,7 +42,7 @@ def schedule_injective(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of injective in the format of an array of tensors. 
Returns diff --git a/topi/python/topi/opengl/softmax.py b/topi/python/topi/opengl/softmax.py index e725134494fc..7b15a5373a3b 100644 --- a/topi/python/topi/opengl/softmax.py +++ b/topi/python/topi/opengl/softmax.py @@ -24,7 +24,7 @@ def schedule_softmax(outs): Parameters ---------- outs: Array of Tensor - The computation graph description of reduce in the format + The computation graph description of softmax in the format of an array of tensors. Returns From 51af454ad7f97a49b19bd02830edcdff9379c58f Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Sun, 1 Mar 2020 13:57:24 -0800 Subject: [PATCH 62/73] [Relay][FastMath] Relay pass to use fast exp/tanh (#4873) * [Relay][FastMath] Relay pass to use fast exp/tanh * Adding required_pass to the tests. * FastMath test changes. --- include/tvm/relay/transform.h | 7 ++ python/tvm/relay/transform.py | 16 ++++- src/relay/backend/build_module.cc | 3 + src/relay/op/tensor/unary.cc | 22 +++++++ src/relay/pass/fast_math.cc | 79 +++++++++++++++++++++++ src/relay/pass/pattern_util.h | 10 +++ tests/python/relay/test_pass_fast_math.py | 52 +++++++++++++++ topi/include/topi/elemwise.h | 7 +- topi/python/topi/math.py | 16 +++++ topi/src/topi.cc | 5 +- 10 files changed, 211 insertions(+), 6 deletions(-) create mode 100644 src/relay/pass/fast_math.cc create mode 100644 tests/python/relay/test_pass_fast_math.py diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 8d886aa09ea2..28628006105b 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -163,6 +163,13 @@ TVM_DLL Pass PartialEval(); */ TVM_DLL Pass SimplifyInference(); +/*! + * \brief Replaces non linear activation functions with their fast but approximate counterparts. + * + * \return The Pass. + */ +TVM_DLL Pass FastMath(); + /*! * \brief Infer the type of an expression. * diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index 45535afc486c..f773835d5c29 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -57,7 +57,8 @@ def build_config(opt_level=2, "CanonicalizeCast": 3, "EliminateCommonSubexpr": 3, "CombineParallelConv2D": 4, - "CombineParallelDense": 4 + "CombineParallelDense": 4, + "FastMath": 4 } fallback_device : int, str, or tvmContext, optional @@ -175,11 +176,22 @@ def SimplifyInference(): Returns ------- ret: tvm.relay.Pass - The registered to perform operator simplification. + The registered pass to perform operator simplification. """ return _transform.SimplifyInference() +def FastMath(): + """ Converts the expensive non linear functions to their fast but approximate counterparts. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass to perform fast math operations. + """ + return _transform.FastMath() + + def CanonicalizeOps(): """Canonicalize special operators to basic operators. This can simplify followed analysis, e.g. expanding bias_add to diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index ff64d4a3acbb..0c0a8b8cbfa8 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -305,6 +305,9 @@ class RelayBuildModule : public runtime::ModuleNode { if (targets.size() == 1) { pass_seqs.push_back(transform::AlterOpLayout()); } + + // Fast math optimizations. + pass_seqs.push_back(transform::FastMath()); pass_seqs.push_back(transform::FoldConstant()); // Create a sequential pass and perform optimizations. 
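A quick sketch of the intended usage (this mirrors test_pass_fast_math.py added later in this patch; the one-operator module below is only an illustration):

    import tvm
    from tvm import relay

    x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], relay.exp(x)))
    # Applying the registered pass directly rewrites exp into fast_exp.
    fast_mod = relay.transform.FastMath()(mod)
    assert "fast_exp" in fast_mod.astext()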
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index 2c7345865095..1169fa801398 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -95,6 +95,17 @@ RELAY_REGISTER_UNARY_OP("exp") .set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::exp)); +RELAY_REGISTER_UNARY_OP("fast_exp") +.describe(R"code(Returns the fast approximate exponential of the input array, computed element-wise. + +.. math:: + y = fast\_exp(x) + +)code" TVM_ADD_FILELINE) +.set_support_level(1) +.set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::fast_exp)); + + RELAY_REGISTER_UNARY_OP("erf") .describe(R"code(Returns the error function value for input array, computed element-wise. @@ -250,6 +261,17 @@ RELAY_REGISTER_UNARY_OP("tanh") .set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::tanh)); +RELAY_REGISTER_UNARY_OP("fast_tanh") +.describe(R"code(Returns the fast approximate hyperbolic tangent of the input array, computed element-wise. + +.. math:: + Y = sinh(X) / cosh(X) + +)code" TVM_ADD_FILELINE) +.set_support_level(1) +.set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::fast_tanh)); + + RELAY_REGISTER_UNARY_OP("negative") .describe(R"code(Returns the numeric negative of input array, computed element-wise. diff --git a/src/relay/pass/fast_math.cc b/src/relay/pass/fast_math.cc new file mode 100644 index 000000000000..898f760fdb50 --- /dev/null +++ b/src/relay/pass/fast_math.cc @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file fast_math.cc + * \brief Replaces non linear activation functions with their fast but approximate counterparts. + */ +#include +#include +#include +#include +#include +#include "pattern_util.h" + +namespace tvm { +namespace relay { + +class FastMathMutator : public ExprMutator { + public: + FastMathMutator() + : exp_op_(Op::Get("exp")), + tanh_op_(Op::Get("tanh")) {} + + Expr VisitExpr_(const CallNode* n) { + auto new_n = ExprMutator::VisitExpr_(n); + if (n->op == exp_op_) { + return FastExp(new_n.as()->args[0]); + } else if (n->op == tanh_op_) { + return FastTanh(new_n.as()->args[0]); + } + return new_n; + } + + private: + // Cache the following ops. They will be used in the passes repeatedly for + // operator equivalence checking so that the registry lookup overhead can be + // reduced.
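+  // (Only exp and tanh are rewritten for now; e.g. a call exp(%x) in the IR
+  // becomes fast_exp(%x) after this mutator runs.)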
+ const Op& exp_op_; + const Op& tanh_op_; +}; + +Expr FastMath(const Expr& e) { + return FastMathMutator().Mutate(e); +} + +namespace transform { + +Pass FastMath() { + runtime::TypedPackedFunc pass_func = + [=](Function f, IRModule m, PassContext pc) { + return Downcast(FastMath(f)); + }; + return CreateFunctionPass(pass_func, 4, "FastMath", + {tir::StringImmNode::make("InferType")}); +} + +TVM_REGISTER_GLOBAL("relay._transform.FastMath") +.set_body_typed(FastMath); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index f7d8f9c4665e..85750f5e2601 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -316,6 +316,16 @@ inline Expr Exp(Expr e) { return CallNode::make(op, {e}); } +inline Expr FastExp(Expr e) { + static const Op& op = Op::Get("fast_exp"); + return CallNode::make(op, {e}); +} + +inline Expr FastTanh(Expr e) { + static const Op& op = Op::Get("fast_tanh"); + return CallNode::make(op, {e}); +} + inline Expr Log(Expr e) { static const Op& op = Op::Get("log"); return CallNode::make(op, {e}); diff --git a/tests/python/relay/test_pass_fast_math.py b/tests/python/relay/test_pass_fast_math.py new file mode 100644 index 000000000000..e75316f1e04b --- /dev/null +++ b/tests/python/relay/test_pass_fast_math.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm.ir import IRModule +from tvm import relay +from tvm.relay.transform import FastMath + +def test_exp(): + x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32") + y = relay.exp(x) + func = relay.Function([x], y) + mod = tvm.IRModule.from_expr(func) + + fast_mod = FastMath()(mod) + assert "fast_exp" in fast_mod.astext() + + # Check that FastMath option works for relay.build. + with relay.build_config(opt_level=3, required_pass=['FastMath']): + fast_mod = relay.optimize(mod, target='llvm', params=None) + assert "fast_exp" in fast_mod[0].astext() + +def test_tanh(): + x = relay.var("x", shape=(1, 16, 16, 16), dtype="float32") + y = relay.tanh(x) + func = relay.Function([x], y) + mod = tvm.IRModule.from_expr(func) + + fast_mod = FastMath()(mod) + assert "fast_tanh" in fast_mod.astext() + + # Check that FastMath option works for relay.build. 
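+    # (FastMath is registered at opt level 4, so with opt_level=3 it runs only
+    # because it is listed in required_pass.)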
+ with relay.build_config(opt_level=3, required_pass=['FastMath']): + fast_mod = relay.optimize(mod, target='llvm', params=None) + assert "fast_tanh" in fast_mod[0].astext() + +if __name__ == "__main__": + test_exp() + test_tanh() diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h index e35e3e424d6e..3c0822f2b00e 100644 --- a/topi/include/topi/elemwise.h +++ b/topi/include/topi/elemwise.h @@ -58,6 +58,7 @@ TOPI_DECLARE_UNARY_OP(cos); TOPI_DECLARE_UNARY_OP(sin); TOPI_DECLARE_UNARY_OP(atan); TOPI_DECLARE_UNARY_OP(isnan); +TOPI_DECLARE_UNARY_OP(tanh); /* * \brief Fast_tanh_float implementation from Eigen @@ -113,9 +114,9 @@ inline Tensor fast_tanh_float(const Tensor& in, * * \return A Tensor whose op member is tanh */ -inline Tensor tanh(const Tensor& x, - std::string name = "T_tanh", - std::string tag = kElementWise) { +inline Tensor fast_tanh(const Tensor& x, + std::string name = "T_fast_tanh", + std::string tag = kElementWise) { if (x->dtype == DataType::Float(32)) { // invoke fast_tanh_float implementation return fast_tanh_float(x, name, tag); diff --git a/topi/python/topi/math.py b/topi/python/topi/math.py index 5b6b9ab8da75..4a63c4535289 100644 --- a/topi/python/topi/math.py +++ b/topi/python/topi/math.py @@ -467,3 +467,19 @@ def fast_exp(x): The result. """ return cpp.fast_exp(x, x.dtype, tag.ELEMWISE) + + +def fast_tanh(x): + """Take the hyperbolic tangent of input x using the fast_tanh implementation + + Parameters + ---------- + x : tvm.Tensor + Input argument. + + Returns + ------- + y : tvm.Tensor + The result. + """ + return cpp.fast_tanh(x, x.dtype, tag.ELEMWISE) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 79e223c30975..75517b818f45 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -188,7 +188,10 @@ TVM_REGISTER_GLOBAL("topi.tanh") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = tanh(args[0]); }); - +TVM_REGISTER_GLOBAL("topi.fast_tanh") +.set_body([](TVMArgs args, TVMRetValue *rv) { *rv = fast_tanh(args[0]); }); TVM_REGISTER_GLOBAL("topi.atan") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = atan(args[0]); From 1c8e5b9364aba85e9bb474c0b17fed7cf463ac37 Mon Sep 17 00:00:00 2001 From: Samuel Date: Mon, 2 Mar 2020 08:46:46 +0530 Subject: [PATCH 63/73] [TFLITE]FLOOR_MOD & FLOOR_DIV support (#4971) * TFLite Floor_div & floor_mod parsing code * Review comment updated --- python/tvm/relay/frontend/tflite.py | 16 ++++++++++++++++ tests/python/frontend/tflite/test_forward.py | 19 +++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 5d26d9807aea..bc51c9138d6b 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -123,6 +123,8 @@ def __init__(self, model, subgraph, exp_tab): 'DETECTION_POSTPROCESS': self.convert_detection_postprocess, 'SQUARE': self.convert_square, 'L2_NORMALIZATION': self.convert_l2_normalization, + 'FLOOR_DIV': self.convert_floor_div, + 'FLOOR_MOD': self.convert_floor_mod, } def check_unsupported_ops(self): @@ -1579,6 +1581,20 @@ def convert_pad(self, op): out = _op.nn.pad(in_expr, pad_width=paddings, pad_value=pad_value) return out + def convert_floor_div(self, op): + """Convert TFLite FLOOR_DIV""" + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized FLOOR DIV operator is not supported yet.') + return self._convert_elemwise(_op.floor_divide, op) + + def convert_floor_mod(self, op): + """Convert TFLite FLOOR_MOD""" + if self.is_quantized(op): + raise
tvm.error.OpNotImplemented( + 'TFlite quantized FLOOR MOD operator is not supported yet.') + return self._convert_elemwise(_op.floor_mod, op) + def convert_mirror_pad(self, op): """Convert TFLite MIRROR_PAD""" try: diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index ced24250c68d..147839398d37 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -943,6 +943,22 @@ def _test_squared_difference(data): """ One iteration of squared difference """ return _test_elemwise(math_ops.squared_difference, data) +####################################################################### +# Floor_divide +# ------------ + +def _test_floor_divide(data): + """ One iteration of floor_div""" + return _test_elemwise(math_ops.floordiv, data) + +####################################################################### +# Floor_mod +# --------- + +def _test_floor_mod(data): + """ One iteration of floor_mod""" + return _test_elemwise(math_ops.floormod, data) + def _test_forward_elemwise(testop): """ Elewise""" testop([np.arange(6.0, dtype=np.float32).reshape((2, 1, 1, 3)), @@ -991,6 +1007,9 @@ def test_all_elemwise(): _test_forward_elemwise(_test_less_equal) _test_forward_elemwise(_test_equal) _test_forward_elemwise(_test_not_equal) + if package_version.parse(tf.VERSION) >= package_version.parse('1.14.0'): + _test_forward_elemwise(_test_floor_divide) + _test_forward_elemwise(_test_floor_mod) ####################################################################### # Logical operators From 892dc91a1db02472c43c5bae0c6c5a733de59db0 Mon Sep 17 00:00:00 2001 From: Ethan-Yan27 Date: Mon, 2 Mar 2020 14:00:08 +0800 Subject: [PATCH 64/73] [Doc]refine the example description of max/min/sum/tag_scope (#4974) --- python/tvm/te/tag.py | 2 +- python/tvm/tir/op.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/te/tag.py b/python/tvm/te/tag.py index 1022875ce3dd..487e69687225 100644 --- a/python/tvm/te/tag.py +++ b/python/tvm/te/tag.py @@ -86,6 +86,6 @@ def tag_scope(tag): # or use tag_scope as decorator @tvm.te.tag_scope(tag="conv") def compute_relu(data): - return te.compute(data.shape, lambda *i: tvm.select(data(*i) < 0, 0.0, data(*i))) + return te.compute(data.shape, lambda *i: tvm.tir.Select(data(*i) < 0, 0.0, data(*i))) """ return TagScope(tag) diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index 4a52787262bb..a8aef8f85495 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -950,6 +950,7 @@ def reducer(expr, axis, where=None, *args): # there are two way to use this {0} reducer: # mode 1, accept (expr, axis, where) to produce an Reduce Expr + # tvm.{0} represents tvm.te.{0} or tvm.tir.{0}. 
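+      # e.g. mode 1 below reduces A over the axis k, giving B[i] = {0} of A[i, k] over all k.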
B = te.compute((m,), lambda i: tvm.{0}(A[i, k], axis=k), name="B") # mode 2, simply use it with multiple Exprs: From 0fb48360e63589a695263dd1886209a338f8f747 Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Sun, 1 Mar 2020 22:23:46 -0800 Subject: [PATCH 65/73] [Relay][Pass] Add inline pass (#4927) * add inline pass * IsInline -> IsMarkedInlined * fix comment --- include/tvm/relay/expr.h | 9 + include/tvm/relay/transform.h | 8 + python/tvm/relay/transform.py | 13 + src/relay/ir/expr.cc | 6 + src/relay/pass/call_graph.cc | 9 +- src/relay/pass/call_graph.h | 12 +- src/relay/pass/inline.cc | 229 +++++ tests/python/relay/test_pass_inline.py | 837 +++++++++++++++++++++++++ 8 files changed, 1118 insertions(+), 5 deletions(-) create mode 100644 src/relay/pass/inline.cc create mode 100644 tests/python/relay/test_pass_inline.py diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index 1dcf957426c0..f627d187fdc2 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -221,6 +221,13 @@ class FunctionNode : public BaseFuncNode { */ bool IsPrimitive() const; + /*! + * \brief Check whether the function is marked as inline. + * + * \return Whether the function should be inlined or not. + */ + bool IsMarkedInline() const; + /*! * \brief Check whether the function should use the TVM default compiler to build, or * use other compilers. @@ -563,6 +570,8 @@ constexpr const char* kExternalSymbol = "ExternalSymbol"; constexpr const char* kSkipOptimization = "SkipOptimization"; /*! \brief Treat the function as a composite operator. */ constexpr const char* kComposite = "Composite"; +/*! \brief Mark the function to be inlined. */ +constexpr const char* kInline = "Inline"; } // namespace attr } // namespace relay diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 28628006105b..2837c1ff7f25 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -324,6 +324,14 @@ TVM_DLL Pass PrintIR(bool show_meta_data = true); */ TVM_DLL Pass PartitionGraph(); +/*! + * \brief Inline the global functions marked as `inline` in a given Relay + * IRModule. + * + * \return The pass. + */ +TVM_DLL Pass Inline(); + } // namespace transform /*! diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index f773835d5c29..c54e4c875aa6 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -552,6 +552,19 @@ def PartitionGraph(): return _transform.PartitionGraph() +def Inline(): + """Perform inlining on the given Relay IR module. The global functions that + are marked as `inline` should always be inlined. A cost model will be + needed in the future to decide if it is profitable to inline the function. + + Returns + ------- + ret: tvm.relay.Pass + The registered pass that performs inlining for a Relay IR module.
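+
+    Examples
+    --------
+    A minimal sketch (`fn` is assumed to be a relay.Function, `gvar` a
+    GlobalVar, and `mod` an IRModule; marking follows the tests added in
+    this patch):
+
+    .. code-block:: python
+
+        fn = fn.set_attribute("Inline", tvm.tir.IntImm("int32", 1))
+        mod[gvar] = fn
+        mod = relay.transform.Inline()(mod)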
+ """ + return _transform.Inline() + + def gradient(expr, mod=None, mode='higher_order'): """ Transform the input function, diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc index 0292a6c2bb05..b87877a828b6 100644 --- a/src/relay/ir/expr.cc +++ b/src/relay/ir/expr.cc @@ -145,6 +145,12 @@ bool FunctionNode::IsPrimitive() const { return pval && pval->value != 0; } +bool FunctionNode::IsMarkedInline() const { + ObjectRef res = FunctionGetAttr(GetRef(this), attr::kInline); + const tir::IntImmNode* pval = res.as(); + return pval && pval->value != 0; +} + Function FunctionNode::SetParams(const tvm::Map& parameters) const { return FunctionSetAttr(GetRef(this), attr::kParams, parameters); } diff --git a/src/relay/pass/call_graph.cc b/src/relay/pass/call_graph.cc index 6b82801776dd..9c612eb552fc 100644 --- a/src/relay/pass/call_graph.cc +++ b/src/relay/pass/call_graph.cc @@ -84,6 +84,13 @@ CallGraphEntry* CallGraphNode::operator[](const GlobalVar& gv) { return cit->second.get(); } +BaseFunc CallGraphNode::GetGlobalFunction(const GlobalVar& var) const { + CHECK(module->ContainGlobalVar(var->name_hint)) + << "GlobalVar " << var->name_hint + << " not found in the current ir module"; + return module->Lookup(var); +} + // Query the existence of a GlobalVar in the call graph. It creates an entry if // there is no such node available. CallGraphEntry* CallGraphNode::LookupGlobalVar(const GlobalVar& gv) { @@ -306,7 +313,7 @@ TVM_REGISTER_GLOBAL("relay._analysis.PrintCallGraph") TVM_REGISTER_GLOBAL("relay._analysis.GetModule") .set_body_typed([](CallGraph call_graph) { - return call_graph->GetModule(); + return call_graph->module; }); TVM_REGISTER_GLOBAL("relay._analysis.PrintCallGraphGlobalVar") diff --git a/src/relay/pass/call_graph.h b/src/relay/pass/call_graph.h index 340ee30bc5d2..684e11a7600f 100644 --- a/src/relay/pass/call_graph.h +++ b/src/relay/pass/call_graph.h @@ -124,10 +124,14 @@ class CallGraphNode : public Object { return (*this)[module->GetGlobalVar(gvar_name)]; } - /*! \brief Return the IR module. */ - IRModule GetModule() const { - return module; - } + /*! + * \brief Get the global function corresponding to the variable. + * + * \param var The global variable. + * + * \return The found global function. + */ + BaseFunc GetGlobalFunction(const GlobalVar& var) const; /*! * \brief Get the entries/root nodes of CallGraphNode. diff --git a/src/relay/pass/inline.cc b/src/relay/pass/inline.cc new file mode 100644 index 000000000000..6c8caeede59a --- /dev/null +++ b/src/relay/pass/inline.cc @@ -0,0 +1,229 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/pass/inline.cc + * \brief Global function inliner. It contains the following steps: + * + * - Preprocessing: eligibility checking. 
Only inline the functions that can + * be inlined. We currently only use simple rules to make the decision. No + * profitability analysis is available for now. + * + * - Inline: replace the call with a function or the function body depending on + * the attribute of the callee function. For example, we return the function + * node when it doesn't use default compiler, i.e. llvm. This is because these + * functions are packed to be offloaded to external codegen. + * + * - Postprocessing: remove the replaced functions that have no reference. + */ + +#include +#include +#include +#include +#include +#include + +#include "call_graph.h" + +using namespace tvm::runtime; + +namespace tvm { +namespace relay { + +class Inliner : ExprMutator { + public: + explicit Inliner(CallGraphEntry* cur_node, CallGraphNode* call_graph) + : cur_node_(cur_node), call_graph_(call_graph) {} + + Expr VisitExpr_(const CallNode* call_node) final { + Expr op = call_node->op; + const auto* gvn = op.as(); + + if (gvn) { + GlobalVar gv = GetRef(gvn); + auto* cg_node = (*call_graph_)[gv->name_hint]; + if (CanInline(cg_node)) { + tvm::Array call_args; + for (auto arg : call_node->args) { + auto new_arg = VisitExpr(arg); + call_args.push_back(new_arg); + } + cur_node_->RemoveCallTo(gv); + return MakeNewExpr(gv, call_args, GetRef(call_node)); + } + } + return ExprMutator::VisitExpr_(call_node); + } + + Expr VisitExpr_(const GlobalVarNode* gvn) final { + GlobalVar gv = GetRef(gvn); + auto* cg_node = (*call_graph_)[gv->name_hint]; + if (CanInline(cg_node)) { + cur_node_->RemoveCallTo(gv); + return MakeNewExpr(gv, {}, GetRef(gvn)); + } + return ExprMutator::VisitExpr_(gvn); + } + + Function Inline(const Function& func) { + return FunctionNode::make(func->params, + VisitExpr(func->body), + func->ret_type, + func->type_params, + func->attrs); + } + + private: + bool CanInline(const CallGraphEntry* cg_node) { + // The node must be a leaf node and it cannot be recursive. + if (!cg_node->empty() || cg_node->IsRecursive()) return false; + + auto base_func = call_graph_->GetGlobalFunction(cg_node->GetGlobalVar()); + auto func = Downcast(base_func); + // The body of a global function must be defined. + if (!func->body.defined()) return false; + + // The function must be annotated with the inline attribute. + if (!func->IsMarkedInline()) return false; + + // The function is not able to be inlined if any callee under the CallGraph + // of this function cannot be inlined. + for (const auto& it : *cg_node) { + if (!CanInline(it.second)) { + return false; + } + } + + return true; + } + + // Make a new Relay expression to replace the callee. + Expr MakeNewExpr(const GlobalVar& global, + const Array& args, + const Expr& callee) { + CHECK(callee->IsInstance() || + callee->IsInstance()); + auto base_func = call_graph_->GetGlobalFunction(global); + const auto* fn = base_func.as(); + CHECK(fn) << "Expected to work on a Relay function."; + + auto func = FunctionNode::make(fn->params, + fn->body, + fn->ret_type, + fn->type_params, + fn->attrs); + // Inline the function body into the caller if this function uses default + // compiler, i.e. no external codegen is needed. + if (func->UseDefaultCompiler()) { + CHECK_EQ(func->params.size(), args.size()) + << "Mismatch found in the number of parameters and call args"; + // Bind the parameters with call args.
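+      // For example, a callee fn(%p0, %p1) invoked as @g(%a, %b) is replaced
+      // by fn's body with %p0 bound to %a and %p1 bound to %b.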
+ Map bind_map; + for (size_t i = 0; i < args.size(); i++) { + bind_map.Set(fn->params[i], args[i]); + } + if (const auto* gvn = callee.as()) { + auto ret_type = gvn->checked_type(); + // Cannot replace TensorType/TensorTupleType with FuncType. Therefore, + // we simply inline the function as a closure instead of directly using + // its body when the global var returns FuncType. + return ret_type->IsInstance() ? std::move(func) + : func->body; + } else { + CHECK(callee->IsInstance()); + return Bind(func->body, bind_map); + } + } else if (const auto* call_node = callee.as()) { + return CallNode::make(func, args, call_node->attrs, call_node->type_args); + } else { + return std::move(func); + } + } + + /*! + * \brief The current call graph entry that is being handled. Each entry + * contains a global function. + */ + CallGraphEntry* cur_node_; + /*! \brief The call graph that is used for global function lookup. */ + const CallGraphNode* call_graph_; +}; + +IRModule Inline(const IRModule& module) { + CallGraph cg(module); + auto topo = cg->TopologicalOrder(); + // Get the reverse topological order of the global functions. + std::reverse(topo.begin(), topo.end()); + // Cache the functions that are originally entries. These functions will + // remain in the module after inlining. + std::unordered_set original_entry; + + for (auto* it : topo) { + if (it->GetRefCount() == 0) original_entry.emplace(it); + // Skip the leaf calls and the recursive calls that don't call other + // functions. + if (it->empty() || (it->IsRecursive() && it->size() == 1)) continue; + auto base_func = module->Lookup(it->GetNameHint()); + if (const auto* fn = base_func.as()) { + auto func = GetRef(fn); + auto new_func = Inliner(it, cg.operator->()).Inline(func); + // TODO(zhiics) Maybe move this to CallGraph, but updating function from + // CallGraph arbitrarily may lead to incorrect CallGraph. + cg->module->Update(it->GetGlobalVar(), new_func); + } + } + + // Clean up the functions that are inlined and have no reference. + for (auto* cgn : topo) { + // Skip recursive functions and entry functions even if they are marked as + // `inline`. + if (cgn->IsRecursive() || original_entry.count(cgn)) continue; + auto base_func = cg->GetGlobalFunction(cgn->GetGlobalVar()); + if (const auto* fn = base_func.as()) { + auto func = GetRef(fn); + if (func->IsMarkedInline()) { + CHECK_EQ(cgn->GetRefCount(), 0U) + << cgn->GetNameHint() << " is marked as inline but not inlined."; + cgn->CleanCallGraphEntries(); + cg->RemoveGlobalVarFromModule(cgn, /*update_call_graph*/ true); + } + } + } + + return cg->module; +} + +namespace transform { + +Pass Inline() { + runtime::TypedPackedFunc pass_func = + [=](IRModule m, PassContext pc) { + return relay::Inline(m); + }; + return CreateModulePass(pass_func, 1, "InlineGlobals", {}); +} + +TVM_REGISTER_GLOBAL("relay._transform.Inline") +.set_body_typed(Inline); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_pass_inline.py b/tests/python/relay/test_pass_inline.py new file mode 100644 index 000000000000..7a0954b4887b --- /dev/null +++ b/tests/python/relay/test_pass_inline.py @@ -0,0 +1,837 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, missing-docstring, too-many-statements +import tvm +from tvm import relay + + +def get_recursive_count_loop(): + mod = tvm.IRModule({}) + sum_up = relay.GlobalVar('sum_up') + i = relay.var('i', shape=[], dtype='int32') + sb = relay.ScopeBuilder() + with sb.if_scope(relay.equal(i, relay.const(0, dtype='int32'))): + sb.ret(i) + with sb.else_scope(): + one_less = relay.subtract(i, relay.const(1, dtype='int32')) + rec_call = relay.Call(sum_up, [one_less]) + sb.ret(relay.add(rec_call, i)) + func = relay.Function([i], + sb.get(), + ret_type=relay.TensorType([], 'int32')) + func = func.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + mod[sum_up] = func + iarg = relay.var('i', shape=[], dtype='int32') + mod["main"] = relay.Function([iarg], sum_up(iarg)) + return mod, sum_up + + +def test_call_chain_inline_leaf(): + """Test when only leaf call is inlined. + + The call graph is like the following: + main + / \ + g1 g2 + / + g11(inline) + """ + + def get_mod(): + mod = tvm.IRModule({}) + x11 = relay.var("x11", shape=(3, 5)) + g11 = relay.GlobalVar("g11") + fn11 = relay.Function([x11], x11) + fn11 = fn11.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + mod[g11] = fn11 + + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1 + g11(x1)) + fn1 = relay.Function([x1, y1], sb.get()) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + def expected(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1 + x1) + fn1 = relay.Function([x1, y1], sb.get()) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_call_chain_inline_multiple_levels(): + """Test when calls are inlined across multiple levels.
+ + The call graph is like the following: + main + / \ + g1(inline) g2 + / + g11(inline) + + """ + + def get_mod(): + mod = tvm.IRModule({}) + x11 = relay.var("x11", shape=(3, 5)) + g11 = relay.GlobalVar("g11") + fn11 = relay.Function([x11], x11) + fn11 = fn11.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + mod[g11] = fn11 + + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1 + g11(x1)) + fn1 = relay.Function([x1, y1], sb.get()) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + def expected(): + mod = tvm.IRModule({}) + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = p0 + p1 + p0 + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_call_chain_inline_multiple_levels_extern_compiler(): + """Test when calls are inlined across multiple levels with an external compiler.
+ + The call graph is like the following: + main + / \ + g1(inline) g2 + / + g11(inline, external compiler) + + """ + + def get_mod(): + mod = tvm.IRModule({}) + x11 = relay.var("x11", shape=(3, 5)) + g11 = relay.GlobalVar("g11") + fn11 = relay.Function([x11], x11) + fn11 = fn11.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn11 = fn11.set_attribute("Compiler", tvm.tir.StringImm("a")) + mod[g11] = fn11 + + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1 + g11(x1)) + fn1 = relay.Function([x1, y1], sb.get()) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + def expected(): + mod = tvm.IRModule({}) + x11 = relay.var("x11", shape=(3, 5)) + fn11 = relay.Function([x11], x11) + fn11 = fn11.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn11 = fn11.set_attribute("Compiler", tvm.tir.StringImm("a")) + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = p0 + p1 + fn11(p0) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_recursive_call_with_global(): + def get_mod(): + mod = tvm.IRModule({}) + + x = relay.var('x', shape=[], dtype='int32') + fn0 = relay.Function([x], x) + fn0 = fn0.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + gx = relay.GlobalVar("gx") + mod[gx] = fn0 + + sum_up = relay.GlobalVar('sum_up') + i = relay.var('i', shape=[], dtype='int32') + sb = relay.ScopeBuilder() + with sb.if_scope(relay.equal(i, relay.const(0, dtype="int32"))): + sb.ret(i) + with sb.else_scope(): + one_less = relay.subtract(i, relay.const(1, dtype="int32")) + global_call = gx(i) + rec_call = relay.Call(sum_up, [one_less]) + global_call + sb.ret(relay.add(rec_call, i)) + func = relay.Function([i], + sb.get(), + ret_type=relay.TensorType([], "int32")) + func = func.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + mod[sum_up] = func + iarg = relay.var("i", shape=[], dtype='int32') + mod["main"] = relay.Function([iarg], sum_up(iarg)) + return mod + + def expected(): + mod = tvm.IRModule({}) + + sum_up = relay.GlobalVar('sum_up') + i = relay.var('i', shape=[], dtype='int32') + sb = relay.ScopeBuilder() + with sb.if_scope(relay.equal(i, relay.const(0, dtype='int32'))): + sb.ret(i) + with sb.else_scope(): + one_less = relay.subtract(i, relay.const(1, dtype='int32')) + rec_call = relay.Call(sum_up, [one_less]) + i + sb.ret(relay.add(rec_call, i)) + func = relay.Function([i], + sb.get(), + ret_type=relay.TensorType([], 'int32')) + func = func.set_attribute("Inline", 
tvm.tir.IntImm("int32", 1)) + mod[sum_up] = func + iarg = relay.var('i', shape=[], dtype='int32') + mod["main"] = relay.Function([iarg], sum_up(iarg)) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_recursive_called(): + mod, sum_up = get_recursive_count_loop() + iarg = relay.var('i', shape=[], dtype='int32') + mod["main"] = relay.Function([iarg], sum_up(iarg)) + ref_mod = mod + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, ref_mod) + + +def test_recursive_not_called(): + def get_mod(): + mod, sum_up = get_recursive_count_loop() + x = relay.var("x", shape=(2, 2)) + y = relay.var("y", shape=(2, 2)) + x1 = relay.var("x1", shape=(2, 2)) + fn1 = relay.Function([x1], x1) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + mod["main"] = relay.Function([x, y], x + y + g1(x)) + return mod + + def expected(): + mod, sum_up = get_recursive_count_loop() + x = relay.var("x", shape=(2, 2)) + y = relay.var("y", shape=(2, 2)) + mod["main"] = relay.Function([x, y], x + y + x) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + ref_mod = expected() + assert relay.analysis.alpha_equal(mod, ref_mod) + + +def test_recursive_not_called_extern_compiler(): + def get_mod(): + mod, sum_up = get_recursive_count_loop() + x = relay.var("x", shape=(2, 2)) + y = relay.var("y", shape=(2, 2)) + x1 = relay.var("x1", shape=(2, 2)) + fn1 = relay.Function([x1], x1) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn1 = fn1.set_attribute("Compiler", tvm.tir.StringImm("a")) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + mod["main"] = relay.Function([x, y], x + y + g1(x)) + return mod + + def expected(): + mod, sum_up = get_recursive_count_loop() + x = relay.var("x", shape=(2, 2)) + y = relay.var("y", shape=(2, 2)) + x1 = relay.var("x1", shape=(2, 2)) + fn1 = relay.Function([x1], x1) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn1 = fn1.set_attribute("Compiler", tvm.tir.StringImm("a")) + mod["main"] = relay.Function([x, y], x + y + fn1(x)) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + ref_mod = expected() + assert relay.analysis.alpha_equal(mod, ref_mod) + + +def test_globalvar_as_call_arg(): + def get_mod(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1) + fn1 = relay.Function([x1, y1], sb.get()) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + def expected(): + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = p0 + p1 + call_fn2 = p2 - p3 + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + mod = get_mod() + mod = 
relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_globalvar_as_call_arg_extern_compiler(): + def get_mod(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1) + fn1 = relay.Function([x1, y1], sb.get()) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn1 = fn1.set_attribute("Compiler", tvm.tir.StringImm("a")) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn2 = fn2.set_attribute("Compiler", tvm.tir.StringImm("b")) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + def expected(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1) + fn1 = relay.Function([x1, y1], sb.get()) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn1 = fn1.set_attribute("Compiler", tvm.tir.StringImm("a")) + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn2 = fn2.set_attribute("Compiler", tvm.tir.StringImm("b")) + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = relay.Call(fn1, [p0, p1]) + call_fn2 = relay.Call(fn2, [p2, p3]) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_inline_globalvar_without_args(): + def get_mod(): + mod = tvm.IRModule({}) + fn1 = relay.Function([], relay.const(1)) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn2 = relay.Function([], relay.const(2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g1 = relay.GlobalVar('g1') + g2 = relay.GlobalVar('g2') + mod[g1] = fn1 + mod[g2] = fn2 + p = relay.var('p', 'bool') + mod['main'] = relay.Function([p], relay.Call(relay.If(p, g1, g2), [])) + return mod + + def expected(): + mod = tvm.IRModule({}) + fn1 = relay.Function([], relay.const(1)) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn2 = relay.Function([], relay.const(2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + p = relay.var('p', 'bool') + mod['main'] = relay.Function([p], relay.Call( + relay.If(p, fn1, fn2), [])) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_inline_globalvar_without_args_extern_compiler(): + def get_mod(): + mod = tvm.IRModule({}) + fn1 = relay.Function([], relay.const(1)) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn1 = fn1.set_attribute("Compiler", tvm.tir.StringImm("a")) + fn2 = relay.Function([], relay.const(2)) + fn2 = fn2.set_attribute("Inline", 
tvm.tir.IntImm("int32", 1)) + fn2 = fn2.set_attribute("Compiler", tvm.tir.StringImm("b")) + g1 = relay.GlobalVar('g1') + g2 = relay.GlobalVar('g2') + mod[g1] = fn1 + mod[g2] = fn2 + p = relay.var('p', 'bool') + mod['main'] = relay.Function([p], relay.Call(relay.If(p, g1, g2), [])) + return mod + + def expected(): + mod = tvm.IRModule({}) + fn1 = relay.Function([], relay.const(1)) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn1 = fn1.set_attribute("Compiler", tvm.tir.StringImm("a")) + fn2 = relay.Function([], relay.const(2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn2 = fn2.set_attribute("Compiler", tvm.tir.StringImm("b")) + p = relay.var('p', 'bool') + mod['main'] = relay.Function([p], relay.Call( + relay.If(p, fn1, fn2), [])) + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_globalvar_called_by_multiple_functions(): + """Test when only leaf call is inlined. + + The call graph is like the following: + main g0 + / \ / + g1 g2(inline) + """ + + def get_mod(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1) + fn1 = relay.Function([x1, y1], sb.get()) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + sb1 = relay.ScopeBuilder() + sb1.ret(x2 - y2) + fn2 = relay.Function([x2, y2], sb1.get()) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + x0 = relay.var("x0", shape=(3, 5)) + y0 = relay.var("y0", shape=(3, 5)) + z0 = relay.var("z0", shape=(3, 5)) + fn0 = relay.Function([x0, y0, z0], g2(x0, y0) + z0) + g0 = relay.GlobalVar("g0") + mod[g0] = fn0 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn1 = g1(p0, p1) + call_fn2 = g2(p2, p3) + mod["main"] = relay.Function([p0, p1, p2, p3], call_fn1 * call_fn2) + return mod + + def expected(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + sb = relay.ScopeBuilder() + sb.ret(x1 + y1) + fn1 = relay.Function([x1, y1], sb.get()) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + p0 = relay.var("p0", shape=(3, 5)) + p1 = relay.var("p1", shape=(3, 5)) + p2 = relay.var("p2", shape=(3, 5)) + p3 = relay.var("p3", shape=(3, 5)) + + call_fn2 = p2 - p3 + mod["main"] = relay.Function([p0, p1, p2, p3], g1(p0, p1) * call_fn2) + + x0 = relay.var("x0", shape=(3, 5)) + y0 = relay.var("y0", shape=(3, 5)) + z0 = relay.var("z0", shape=(3, 5)) + + fn0 = relay.Function([x0, y0, z0], x0 - y0 + z0) + g0 = relay.GlobalVar("g0") + mod[g0] = fn0 + + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_entry_with_inline(): + """Test entry function with inline + + The call graph is like the following: + g1(inline) g2(inline) + """ + + def get_mod(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + fn1 = relay.Function([x1, y1], x1 + y1) + fn1 = fn1.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + fn2 = relay.Function([x2, y2], x2 - y2) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = 
relay.GlobalVar("g2") + mod[g2] = fn2 + + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, get_mod()) + + +def test_callee_not_inline(): + """Test entry function with inline + + The call graph is like the following: + main + | + g2(inline) + | + g1 + """ + + def get_mod(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + fn1 = relay.Function([x1, y1], x1 + y1) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + fn2 = relay.Function([x2, y2], x2 - g1(x2, y2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, get_mod()) + + +def test_callee_not_inline_leaf_inline(): + """Test entry function with inline + + The call graph is like the following: + main + | + g2(inline) + | + g1 + | + g0(inline) + """ + + def get_mod(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(3, 5)) + y0 = relay.var("y0", shape=(3, 5)) + fn0 = relay.Function([x0, y0], x0 * y0) + fn0 = fn0.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g0 = relay.GlobalVar("g0") + mod[g0] = fn0 + + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + fn1 = relay.Function([x1, y1], x1 + g0(x1, y1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + fn2 = relay.Function([x2, y2], x2 - g1(x2, y2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + return mod + + def expected(): + mod = tvm.IRModule({}) + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + fn1 = relay.Function([x1, y1], x1 + x1 * y1) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + fn2 = relay.Function([x2, y2], x2 - g1(x2, y2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +def test_callee_not_inline_leaf_inline_extern_compiler(): + """Test entry function with inline + + The call graph is like the following: + main + | + g2(inline) + | + g1 + | + g0(inline, external compiler) + """ + + def get_mod(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(3, 5)) + y0 = relay.var("y0", shape=(3, 5)) + fn0 = relay.Function([x0, y0], x0 * y0) + fn0 = fn0.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn0 = fn0.set_attribute("Compiler", tvm.tir.StringImm("aa")) + g0 = relay.GlobalVar("g0") + mod[g0] = fn0 + + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + fn1 = relay.Function([x1, y1], x1 + g0(x1, y1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + fn2 = relay.Function([x2, y2], x2 - g1(x2, y2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + return mod + + def expected(): + mod = tvm.IRModule({}) + x0 = relay.var("x0", shape=(3, 5)) + y0 = relay.var("y0", shape=(3, 5)) + fn0 = relay.Function([x0, y0], x0 * y0) + fn0 = fn0.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + fn0 = fn0.set_attribute("Compiler", 
tvm.tir.StringImm("aa")) + + x1 = relay.var("x1", shape=(3, 5)) + y1 = relay.var("y1", shape=(3, 5)) + fn1 = relay.Function([x1, y1], x1 + fn0(x1, y1)) + g1 = relay.GlobalVar("g1") + mod[g1] = fn1 + + x2 = relay.var("x2", shape=(3, 5)) + y2 = relay.var("y2", shape=(3, 5)) + fn2 = relay.Function([x2, y2], x2 - g1(x2, y2)) + fn2 = fn2.set_attribute("Inline", tvm.tir.IntImm("int32", 1)) + g2 = relay.GlobalVar("g2") + mod[g2] = fn2 + + return mod + + mod = get_mod() + mod = relay.transform.Inline()(mod) + assert relay.analysis.alpha_equal(mod, expected()) + + +if __name__ == '__main__': + pytest.main() From 8502691b5b7ca152da9eb626529070db53d479c8 Mon Sep 17 00:00:00 2001 From: maheshambule <15611578+maheshambule@users.noreply.github.com> Date: Tue, 3 Mar 2020 02:27:40 +0530 Subject: [PATCH 66/73] [Frontend] [Tensorflow] ReadVariableOp operator support (#4952) * tf frontend read variable op * pylint fix * tf frontend freezed graph pruned ops --- python/tvm/relay/frontend/tensorflow.py | 11 ++++ .../frontend/tensorflow/test_forward.py | 60 +++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 6f27d73315a1..14d2418da710 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -1500,6 +1500,12 @@ def _impl(inputs, attr, params): # compatible operators that do NOT require any conversion. _identity_list = [] +# Operators that get pruned away when the complete graph is frozen. +# These operators are not needed for inference. +_freezed_graph_pruned_op_list = ['ReadVariableOp', 'ResourceGather', 'Variable', + 'VariableV2', 'VarHandleOp', 'Assign', 'AssignVariableOp'] + + # _convert_map defines maps of name to converter functor(callable) # for 1 to 1 mapping, use Renamer if nothing but name is different # use AttrCvt if attributes need to be converted @@ -2187,6 +2193,11 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): missing_operators = self._parse_import_prerequisites(graph) if missing_operators: + freezed_ops = [op for op in missing_operators if op in _freezed_graph_pruned_op_list] + if freezed_ops: + raise Exception("Graph is not frozen. Provide a frozen graph. 
" + "Found operators {}".format(freezed_ops)) + raise NotImplementedError( \ "The following operators are not implemented: {}".format(missing_operators)) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 9cd978e2e147..42408b706111 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -22,6 +22,7 @@ """ from __future__ import print_function import numpy as np +import pytest import tensorflow as tf from tensorflow.python.framework import constant_op from tensorflow.python.framework import graph_util @@ -1061,6 +1062,62 @@ def test_forward_variable(): _test_variable(np.random.uniform(size=(32, 100)).astype('float32')) +def test_read_variable_op(): + """ Read Variable op test """ + + tf.reset_default_graph() + data = np.random.uniform(size=(32, 100)).astype('float32') + input_tensor = array_ops.placeholder(shape=data.shape, dtype=data.dtype) + + size = input_tensor.shape.dims[1] + var_data = np.random.uniform(-5, 5, size=[size, size]).astype(np.float32) + input_var = tf.Variable(var_data, name='var1', use_resource=True) + math_ops.matmul(input_tensor, input_var) + + out_name = ['MatMul:0'] + out_node = ['MatMul'] + in_name = ['Placeholder:0'] + in_node = ['Placeholder'] + in_data = [data] + + with tf.Session() as sess: + sess.run(variables.global_variables_initializer()) + + final_graph_def = sess.graph.as_graph_def(add_shapes=True) + tf_output = run_tf_graph(sess, in_data, in_name, out_name) + + shape_dict = {e: i.shape for e, i in zip(in_name, in_data)} + with pytest.raises(Exception) as exexcinfo: + mod, params = relay.frontend.from_tensorflow(final_graph_def, + layout=None, + shape=shape_dict, + outputs=None) + + assert exexcinfo.value.args[0].startswith("Graph is not frozen. Provide a frozen graph.") + + # Now convert the variables to constant and run inference on the converted graph + final_graph_def = tf.graph_util.convert_variables_to_constants( + sess, + sess.graph.as_graph_def(add_shapes=True), + out_node, + ) + + for device in ["llvm", "cuda"]: + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + continue + + tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, + target=device, out_names=out_name, + num_output=len(out_name)) + for i in range(len(tf_output)): + tvm.testing.assert_allclose( + tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5) + + sess.close() + + ####################################################################### # MatMul, BatchMatMul, BatchMatMulV2 # ---------------------------------- @@ -3038,3 +3095,6 @@ def test_forward_add_n(): test_forward_where() test_forward_matmul() test_forward_batch_matmul() + + # Internal misc. 
ops + test_read_variable_op() From 09ddc3eb98e872321dc99823fd04f3dc6d080d43 Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Tue, 3 Mar 2020 03:41:31 +0000 Subject: [PATCH 67/73] Pin xgboost dependency version to 0.90 (#4965) * Sets xgboost dependency to be 0.90, preventing segfaults during TVM python unit tests execution * This is discussed in issue #4953 --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index a135a6c41101..937d682e3c85 100644 --- a/python/setup.py +++ b/python/setup.py @@ -164,7 +164,7 @@ def get_package_data_files(): 'matplotlib'], 'extra_feature': ['tornado', 'psutil', - 'xgboost', + 'xgboost==0.90', 'mypy', 'orderedset', 'antlr4-python3-runtime']}, From 98b1759052c2dacb38b6d3e0bbdba38002bbef75 Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Tue, 3 Mar 2020 01:30:28 -0800 Subject: [PATCH 68/73] [Relay] Target annotation for external codegen (#4933) * op based external compiler annotation * Use TVM register directly * Small fix * test graph Co-authored-by: Cody Yu --- include/tvm/relay/op_attr_types.h | 14 ++ python/tvm/relay/op/__init__.py | 2 +- python/tvm/relay/op/contrib/__init__.py | 4 +- python/tvm/relay/op/contrib/contrib.py | 19 --- python/tvm/relay/op/contrib/dnnl.py | 72 ++++++++ python/tvm/relay/op/op.py | 22 +++ python/tvm/relay/transform.py | 19 +++ src/relay/pass/annotate_target.cc | 103 +++++++++++ tests/python/relay/test_annotate_target.py | 188 +++++++++++++++++++++ 9 files changed, 421 insertions(+), 22 deletions(-) delete mode 100644 python/tvm/relay/op/contrib/contrib.py create mode 100644 python/tvm/relay/op/contrib/dnnl.py create mode 100644 src/relay/pass/annotate_target.cc create mode 100644 tests/python/relay/test_annotate_target.py diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 1a2263e3f187..5b2fdd3ab4e1 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -179,6 +179,20 @@ using FTVMLegalize = runtime::TypedPackedFunc< const Array& args, const Array& arg_types)>; +/*! + * \brief Annotates an expression to indicate if an op should be compiled using + * the given compiler/target. + * + * \param attrs The attribute of the original expr. + * \param args The arguments of the original expr. + * + * \return true if this op should be registered to invoke a specific compiler + * for codegen, otherwise, false. + */ +using FTVMAnnotateTarget = runtime::TypedPackedFunc< + bool(const Attrs& attrs, // NOLINT(*) + const Array& args)>; + /*! * \brief Forward rewriting rule for a specific op. * diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 4a4823d7d23b..1a1d0d3ff7ed 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -19,7 +19,7 @@ # operator defs from .op import get, register, register_compute, register_gradient, \ register_pattern, register_alter_op_layout, register_legalize, \ - Op, OpPattern, OpStrategy, debug + Op, OpPattern, OpStrategy, debug, register_external_compiler from . import strategy # Operators diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index c6e086aecd1d..4b6acceb3a83 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -15,5 +15,5 @@ # specific language governing permissions and limitations # under the License. 
# pylint: disable=wildcard-import -"""Neural network related operators.""" -from __future__ import absolute_import as _abs +"""Contrib modules.""" +from .dnnl import *
diff --git a/python/tvm/relay/op/contrib/contrib.py b/python/tvm/relay/op/contrib/contrib.py deleted file mode 100644 index cb7e5d407e10..000000000000 --- a/python/tvm/relay/op/contrib/contrib.py +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -#pylint: disable=invalid-name, too-many-lines -"""Contrib operations.""" -from __future__ import absolute_import as _abs
diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py new file mode 100644 index 000000000000..1aa71921806d --- /dev/null +++ b/python/tvm/relay/op/contrib/dnnl.py @@ -0,0 +1,72 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""DNNL library supported operators. +There are two ways of registering a function for an op to indicate if it is +supported by DNNL. + +- The first and simplest way is to use the helper function so that +users only need to provide the operator name and a boolean value to indicate if +it is supported. For example: + + .. code-block:: python + + add = _register_external_op_helper("add") + add = _register_external_op_helper("add", True) + add = _register_external_op_helper("add", False) + +- The other way is to implement the function yourself to +check the attributes of the op and decide if it should be offloaded to DNNL. +""" +from ... import op as reg + + +def _register_external_op_helper(op_name, supported=True): + """The helper function to indicate that a given operator can be supported + by DNNL. + + Parameters + ---------- + op_name : str + The name of the operator that will be registered. + + supported : bool + Whether the operator is supported by DNNL. Defaults to True. + + Returns + ------- + f : callable + A function that returns if the operator is supported by DNNL. + """ + @reg.register(op_name, "target.dnnl") + def _func_wrapper(attrs, args): + return supported + + return _func_wrapper + + +_register_external_op_helper("nn.conv2d") +_register_external_op_helper("nn.dense") +_register_external_op_helper("nn.relu") +_register_external_op_helper("add") +_register_external_op_helper("subtract") +_register_external_op_helper("multiply") + + +@reg.register("nn.batch_norm", "target.dnnl") +def batch_norm(attrs, args): + """Check if the external DNNL codegen should be used. + FIXME(@zhiics, @comaniac): Turned off because multiple outputs are not supported yet. + """ + return False
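For reference, the second registration style mentioned in the dnnl.py docstring (inspecting attributes before offloading) would look roughly like the sketch below. It is illustrative only: the op choice and the layout condition are assumptions, and nn.conv2d is in fact already registered above through the helper, so this exact registration would conflict if added verbatim.

from tvm.relay import op as reg

@reg.register("nn.conv2d", "target.dnnl")
def _conv2d_check(attrs, args):
    # Offload only data layouts the external library is assumed to handle;
    # anything else falls back to the regular TVM compilation path.
    return attrs.data_layout == "NCHW"

Once predicates are registered, the intended pipeline, as exercised by the tests added later in this patch, is to annotate and then partition:

mod = transform.AnnotateTarget("dnnl")(mod)
mod = transform.PartitionGraph()(mod)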
+ """ + @reg.register(op_name, "target.dnnl") + def _func_wrapper(attrs, args): + return supported + + return _func_wrapper + + +_register_external_op_helper("nn.conv2d") +_register_external_op_helper("nn.dense") +_register_external_op_helper("nn.relu") +_register_external_op_helper("add") +_register_external_op_helper("subtract") +_register_external_op_helper("multiply") + + +@reg.register("nn.batch_norm", "target.dnnl") +def batch_norm(attrs, args): + """Check if the external DNNL codegen should be used. + FIXME(@zhiics, @comaniac): Turn off due to not support of multiple outputs. + """ + return False diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index 6be7d4d4f870..4cd4b2a2a465 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -453,14 +453,36 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10): get(op_name).set_attr("TShapeDataDependant", data_dependant, level) return register(op_name, "FShapeFunc", shape_func, level) + +def register_external_compiler(op_name, fexternal=None, level=10): + """Register the external compiler for an op. + + Parameters + ---------- + op_name : str + The name of the operator. + + fexternal : function (attrs: Attrs, args: List[Expr], compiler: str) + -> new_expr: Expr + The function for wrapping a call expr with compiler_begin and + compiler_end. + + level : int + The priority level + """ + return register(op_name, "FTVMExternalCompiler", fexternal, level) + + @tvm._ffi.register_func("relay.op.compiler._lower") def _lower(name, schedule, inputs, outputs): return lower(schedule, list(inputs) + list(outputs), name=name) + @tvm._ffi.register_func("relay.op.compiler._build") def _build(lowered_funcs): return build(lowered_funcs, target="llvm") + _schedule_injective = None _schedule_reduce = None diff --git a/python/tvm/relay/transform.py b/python/tvm/relay/transform.py index c54e4c875aa6..b2565f3f97eb 100644 --- a/python/tvm/relay/transform.py +++ b/python/tvm/relay/transform.py @@ -552,6 +552,25 @@ def PartitionGraph(): return _transform.PartitionGraph() + +def AnnotateTarget(target): + """Annotate ops in an experession with a provied compiler/target and then + use it for codegen. + + Parameters + ---------- + target : String + The target compiler used for codegen. + + Returns + ------- + ret : tvm.relay.Pass + The annotated pass that wrapps ops with subgraph_start and + subgraph_end. + """ + return _transform.AnnotateTarget(target) + + def Inline(): """Perform inlining on the given Relay IR module. The global functions that are marked as `inline` should be always inlined. A cost model will be diff --git a/src/relay/pass/annotate_target.cc b/src/relay/pass/annotate_target.cc new file mode 100644 index 000000000000..732206958aaf --- /dev/null +++ b/src/relay/pass/annotate_target.cc @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/pass/annotate_target.cc + * \brief Wraps a call with compiler_begin and compiler_end to indicate that + * the op of this call node will use external compiler. + */ + +#include +#include +#include +#include + +namespace tvm { +namespace relay { +namespace annotate_target { + +// A helper class to insert annotation boundaries for a program region that will +// be handled by a specific compiler. +class AnnotateTargetWrapper : public ExprMutator { + public: + explicit AnnotateTargetWrapper(const std::string& target) : target_(target) {} + + Expr VisitExpr_(const CallNode* cn) { + // TODO(@zhiics, @comaniac) Handle composite functions. + auto new_e = ExprMutator::VisitExpr_(cn); + + Call call = Downcast(new_e); + static auto fannotate = Op::GetAttr("target." + target_); + Op op = Downcast(call->op); + CHECK(op.defined()); + + if (fannotate.count(op)) { + bool external = fannotate[op](call->attrs, call->args); + if (external) { + tvm::Array compiler_begins; + for (const auto& it : call->args) { + const auto* begin_op = + runtime::Registry::Get("relay.op.annotation._make.compiler_begin"); + CHECK(begin_op); + Expr begin = (*begin_op)(it, target_); + compiler_begins.push_back(begin); + } + Expr update_call = CallNode::make(call->op, compiler_begins, call->attrs); + const auto* end_op = + runtime::Registry::Get("relay.op.annotation._make.compiler_end"); + CHECK(end_op); + Expr end = (*end_op)(update_call, target_); + return end; + } + } else { + LOG(WARNING) << op->name << " in " << target_ + << " is not registered. It will be executed on CPU."; + } + return new_e; + } + + private: + std::string target_; +}; + +Expr AnnotateTarget(const Expr& expr, const std::string& target) { + return AnnotateTargetWrapper(target).Mutate(expr); +} + +} // namespace annotate_target + +namespace transform { + +Pass AnnotateTarget(const std::string& target) { + runtime::TypedPackedFunc pass_func = + [=](Function f, IRModule m, PassContext pc) { + return Downcast(relay::annotate_target::AnnotateTarget(f, target)); + }; + auto func_pass = CreateFunctionPass(pass_func, 0, "AnnotateTargetFunc", + {tir::StringImmNode::make("InferType")}); + return transform::Sequential({func_pass, InferType()}, "AnnotateTarget"); +} + +TVM_REGISTER_GLOBAL("relay._transform.AnnotateTarget") +.set_body_typed(AnnotateTarget); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_annotate_target.py b/tests/python/relay/test_annotate_target.py new file mode 100644 index 000000000000..f4e602a3973b --- /dev/null +++ b/tests/python/relay/test_annotate_target.py @@ -0,0 +1,188 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unit tests for annotating external targets.""" +import os +import sys +import numpy as np +import pytest + +import tvm +import tvm.relay.testing +import tvm.relay.transform as transform +from tvm import relay +from tvm import runtime +from tvm.contrib import util + + +def check_result(mod, map_inputs, out_shape, result, tol=1e-5, target="llvm", + ctx=tvm.cpu(), params=None): + if sys.platform == "win32": + print("Skip test on Windows for now") + return + + def update_lib(lib): + test_dir = os.path.dirname( + os.path.realpath(os.path.expanduser(__file__))) + source_dir = os.path.join(test_dir, "..", "..", "..") + contrib_path = os.path.join(source_dir, "src", "runtime", "contrib") + + kwargs = {} + kwargs["options"] = ["-O2", "-std=c++11", "-I" + contrib_path] + tmp_path = util.tempdir() + lib_name = 'lib.so' + lib_path = tmp_path.relpath(lib_name) + lib.export_library(lib_path, fcompile=False, **kwargs) + lib = runtime.load_module(lib_path) + + return lib + + def check_vm_result(): + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + exe = relay.vm.compile(mod, target=target, params=params) + code, lib = exe.save() + lib = update_lib(lib) + exe = runtime.vm.Executable.load_exec(code, lib) + vm = runtime.vm.VirtualMachine(exe) + vm.init(ctx) + out = vm.run(**map_inputs) + tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) + + def check_graph_runtime_result(): + with relay.build_config(opt_level=3, disabled_pass=["AlterOpLayout"]): + json, lib, param = relay.build(mod, target=target, params=params) + lib = update_lib(lib) + rt_mod = tvm.contrib.graph_runtime.create(json, lib, ctx) + + for name, data in map_inputs.items(): + rt_mod.set_input(name, data) + rt_mod.set_input(**param) + rt_mod.run() + out = tvm.nd.empty(out_shape, ctx=ctx) + out = rt_mod.get_output(0, out) + + tvm.testing.assert_allclose(out.asnumpy(), result, rtol=tol, atol=tol) + + check_vm_result() + check_graph_runtime_result() + + +def test_extern_dnnl(): + def annotated(dtype, ishape, w1shape): + data = relay.var('data', shape=(ishape), dtype=dtype) + weight1 = relay.var('weight1', shape=(w1shape), dtype=dtype) + depthwise_conv2d_1 = relay.nn.conv2d(data, + weight1, + kernel_size=(3, 3), + padding=(1, 1), + groups=32) + depthwise_conv2d_2 = relay.nn.conv2d(depthwise_conv2d_1, + weight1, + kernel_size=(3, 3), + padding=(1, 1), + groups=32) + out = relay.add(depthwise_conv2d_1, depthwise_conv2d_2) + + f = relay.Function([data, weight1], out) + + mod = tvm.IRModule.from_expr(f) + return mod + + def expected(dtype, ishape, w1shape): + data = relay.var('data', shape=(ishape), dtype=dtype) + weight1 = relay.var('weight1', shape=(w1shape), dtype=dtype) + begin0 = relay.annotation.compiler_begin(data, "dnnl") + begin1 = relay.annotation.compiler_begin(weight1, "dnnl") + depthwise_conv2d_1 = relay.nn.conv2d(begin0, + begin1, + kernel_size=(3, 3), + padding=(1, 1), + groups=32) + end0 = relay.annotation.compiler_end(depthwise_conv2d_1, "dnnl") + begin2 = relay.annotation.compiler_begin(end0, "dnnl") + begin3 = relay.annotation.compiler_begin(end0, "dnnl") + begin4 = relay.annotation.compiler_begin(weight1, "dnnl") + depthwise_conv2d_2 = relay.nn.conv2d(begin3, + begin4, + kernel_size=(3, 3), + padding=(1, 1), + groups=32) + end1 = relay.annotation.compiler_end(depthwise_conv2d_2, "dnnl") + begin5 = relay.annotation.compiler_begin(end1, "dnnl") + out = relay.add(begin2, begin5) + end2 = 
relay.annotation.compiler_end(out, "dnnl") + f = relay.Function([data, weight1], end2) + mod = tvm.IRModule.from_expr(f) + return mod + + dtype = "float32" + ishape = (1, 32, 14, 14) + w1shape = (32, 1, 3, 3) + + def test_annotate(): + mod = annotated(dtype, ishape, w1shape) + mod = transform.AnnotateTarget("dnnl")(mod) + ref_mod = expected(dtype, ishape, w1shape) + assert relay.analysis.alpha_equal(mod, ref_mod) + + def test_run(): + if not tvm.get_global_func("relay.ext.dnnl", True): + print("skip because DNNL codegen is not available") + return + + ref_mod = annotated(dtype, ishape, w1shape) + mod = annotated(dtype, ishape, w1shape) + mod = transform.PartitionGraph()(mod) + + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + w1_data = np.random.uniform(0, 1, w1shape).astype(dtype) + + ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu()) + ref_res = ref_ex.evaluate()(i_data, w1_data) + + check_result(mod, {"data": i_data, "weight1": w1_data}, + (1, 32, 14, 14), ref_res.asnumpy(), tol=1e-5) + + test_annotate() + test_run() + + +def test_extern_dnnl_mobilenet(): + if not tvm.get_global_func("relay.ext.dnnl", True): + print("skip because DNNL codegen is not available") + return + + dtype = 'float32' + ishape = (1, 3, 224, 224) + mod, params = relay.testing.mobilenet.get_workload( + batch_size=1, dtype='float32') + + mod = transform.AnnotateTarget("dnnl")(mod) + mod = transform.PartitionGraph()(mod) + i_data = np.random.uniform(0, 1, ishape).astype(dtype) + + ref_mod, params = relay.testing.mobilenet.get_workload(batch_size=1, + dtype='float32') + ref_ex = relay.create_executor("graph", mod=ref_mod, ctx=tvm.cpu(0)) + ref_res = ref_ex.evaluate()(i_data, **params) + + check_result(mod, {"data": i_data}, + (1, 1000), ref_res.asnumpy(), tol=1e-5, params=params) + + +if __name__ == "__main__": + test_extern_dnnl() + test_extern_dnnl_mobilenet() From 5a0f39b5481a30a2eec49e27cbc17a722bd6ee6a Mon Sep 17 00:00:00 2001 From: pyjhzwh Date: Wed, 4 Mar 2020 04:40:37 -0500 Subject: [PATCH 69/73] [Torch] fix unordered dictionary problem for python version under 3.6 (#4982) * fix unordered dictionary problem for python version 3.5 * modify style --- python/tvm/relay/frontend/pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index b256faa5d6f9..19bccca34bd1 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -918,7 +918,7 @@ def _get_constant(node): def _get_operator_nodes(nodes): """ Returns torch IR nodes that need conversion to Relay """ - ops = {} + ops = [] # Traverse nodes and add to graph for node in nodes: if node.outputsSize() > 1: @@ -927,7 +927,7 @@ def _get_operator_nodes(nodes): node_name = _get_output_name(node) if node.kind() != "prim::GetAttr": - ops[node_name] = node + ops.append((node_name, node)) return ops @@ -1015,7 +1015,7 @@ def parse_params(graph, state_dict): def parse_operators(operators, outputs, output_index_map, ret_name): """ Convert each Torch IR operators to Relay equivalent """ - for node_name, op_node in operators.items(): + for node_name, op_node in operators: operator = op_node.kind() inputs = _get_op_inputs(op_node, outputs, output_index_map) From 585f9ce6e7bef7d0e8902b1c1e55dcb3bbe84eed Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 4 Mar 2020 10:52:02 -0800 Subject: [PATCH 70/73] Tighten split's extent (#4931) * Set split node's range to minimum of ext and split factor or split nparts, but 
only when PassDownDomain is called with allow_missing == false, i.e. by InferBound. Add a helper PassUpThreadBinding() to get a map telling whether an IterVar has at least one leaf IterVar deriving from it binding to a thread. Add two unit tests. * Enhance LoopVectorizer for vectorizing by 0. Found at least one case from topi/tests/python/test_topi_transform.py::test_tile. * Revert changes vectorize_loop.cc; when parent's ext is zero, set split's range to the factor or nparts. * Update with comments. * Refactor the ext tightening predicate. * Fix reference types. * Integrate tvm.te changes. * Trivial comment change to trigger CI. * Trivial comment correction to trigger testing. --- src/te/schedule/message_passing.cc | 76 ++++++++++++++++++- .../unittest/test_schedule_bound_inference.py | 26 +++++++ 2 files changed, 99 insertions(+), 3 deletions(-)
diff --git a/src/te/schedule/message_passing.cc b/src/te/schedule/message_passing.cc index 5b6fa861895a..a7b248285c4d 100644 --- a/src/te/schedule/message_passing.cc +++ b/src/te/schedule/message_passing.cc @@ -51,17 +51,66 @@ void Update(std::unordered_map<IterVar, Range>* p_state, } } +/*! + * \brief Upward propagation of whether an IterVar derives at least one leaf IterVar that binds + * to a thread. + * + * \param stage The stage to operate on. + * \param p_state The propagation result of each IterVar. + */ +void PassUpThreadBinding(const Stage& stage, std::unordered_map<IterVar, bool>* p_state) { + auto bound_to_thread = [&stage](const IterVar& iv) { + bool bound = false; + auto it = stage->iter_var_attrs.find(iv); + if (it != stage->iter_var_attrs.end()) { + bound = (*it).second->bind_thread.defined(); + } + return bound; + }; + + auto& state = *p_state; + // Fill p_state with leaf itervars + for (const IterVar& iv : stage->leaf_iter_vars) { + state[iv] = bound_to_thread(iv); + } + // Traverse the graph bottom-up to propagate thread binding information + for (size_t i = stage->relations.size(); i != 0; --i) { + IterVarRelation rel = stage->relations[i - 1]; + if (const SplitNode* s = rel.as<SplitNode>()) { + state[s->parent] = state[s->inner] || state[s->outer]; + } else if (const FuseNode* s = rel.as<FuseNode>()) { + state[s->inner] = state[s->fused]; + state[s->outer] = state[s->fused]; + } else if (const RebaseNode* s = rel.as<RebaseNode>()) { + state[s->parent] = state[s->rebased]; + } else if (rel.as<SingletonNode>()) { + } else { + LOG(FATAL) << "unknown relation type"; + } + } +} + void PassDownDomain(const Stage& stage, std::unordered_map<IterVar, Range>* p_state, arith::Analyzer* actx, bool allow_missing) { - auto ceil_div = [actx](PrimExpr a, PrimExpr b) { + auto ceil_div = [actx](const PrimExpr& a, const PrimExpr& b) { if (actx->CanProve(indexmod(a, b) == 0)) { return actx->Simplify(indexdiv(a, b)); } return actx->Simplify(indexdiv(a + (b - 1), b)); }; + auto minimum_or_later = [actx](const PrimExpr& a, const PrimExpr& b) { + if (actx->CanProve(a < b)) { + return actx->Simplify(a); + } + return actx->Simplify(b); + }; + + std::unordered_map<IterVar, bool> dominating_thread; + PassUpThreadBinding(stage, &dominating_thread); + auto& state = *p_state; // forward iteration over relations for (IterVarRelation rel : stage->relations) { @@ -72,14 +121,35 @@ void PassDownDomain(const Stage& stage, } CHECK(!state.count(r->inner)); const Range& range_parent = state.at(r->parent); + // Tighten iv's extent to min(parent_extent, factor_or_nparts), only if all of the + // following conditions are met: + // 1. No leaf IterVar derived from iv binds to any thread. People may use split + // to force an IterVar extent to match the number of allocated threads to fuse stages + // that require different numbers of threads. We don't want to change these extents. + // 2. allow_missing is false, i.e. PassDownDomain is called by the final InferBound, + // rather than by an early compiler phase, such as rfactor(). We don't want to tighten an + // IterVar in an early phase allowing missing IterVars, because it may bind to a thread later. + // 3. range_parent's extent is not 0. At least one Topi test has a case where a tensor has one + // zero-sized dimension. Split creates iv with a positive extent to avoid zero-extent + // IterVar. We don't touch it. + auto resolve_min_extent_for_split = [&](const IterVar& iv, const PrimExpr& factor_or_nparts) { + return dominating_thread[iv] || allow_missing || is_zero(range_parent->extent) + ? factor_or_nparts + : minimum_or_later(range_parent->extent, factor_or_nparts); + }; if (r->factor.defined()) { Update(p_state, r->inner, - Range::make_by_min_extent(0, r->factor), actx); + Range::make_by_min_extent( + 0, resolve_min_extent_for_split(r->inner, r->factor)), + actx); Update(p_state, r->outer, Range::make_by_min_extent( 0, ceil_div(range_parent->extent, r->factor)), actx); } else { - Update(p_state, r->outer, Range::make_by_min_extent(0, r->nparts), actx); + Update(p_state, r->outer, + Range::make_by_min_extent( + 0, resolve_min_extent_for_split(r->outer, r->nparts)), + actx); Update(p_state, r->inner, Range::make_by_min_extent( 0, ceil_div(range_parent->extent, r->nparts)), actx);
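Condensed, the behavior exercised by the new unit tests below comes down to the following sketch; the shapes match those tests, and the commented bind() line is a hypothetical illustration of the thread-binding case the tightening deliberately skips:

import tvm
from tvm import te

m = 8
A = te.placeholder((m,), name="A")
B = te.compute((m,), lambda i: A[i] * 2, name="B")
s = te.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=32)  # factor larger than the extent

bounds = tvm.te.schedule.InferBound(s)
# Previously bounds[xi].extent was 32; with this change it is tightened to 8.
# Binding xi to a thread would keep the extent at 32, e.g.:
# s[B].bind(xi, te.thread_axis("threadIdx.x"))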
diff --git a/tests/python/unittest/test_schedule_bound_inference.py b/tests/python/unittest/test_schedule_bound_inference.py index 484aa503e066..edae527c0183 100644 --- a/tests/python/unittest/test_schedule_bound_inference.py +++ b/tests/python/unittest/test_schedule_bound_inference.py @@ -70,6 +70,32 @@ def test_bound3(): assert(bounds[A1.op.axis[0]].extent.value==32) assert(bounds[A1.op.axis[1]].extent.value==16) +def test_bound_split_ext_less_than_factor(): + m = 8 + I = te.placeholder((m,), name='I') + EF = te.compute((m,), lambda i: I[i] * 2, name = "EF") + E = te.compute((m,), lambda i: EF[i] * 2, name = "E") + s = te.create_schedule([E.op]) + xo, xi = s[E].split(s[E].op.axis[0], factor = 32) + s[EF].compute_at(s[E], xo) + + bounds = tvm.te.schedule.InferBound(s) + assert isinstance(bounds, tvm.container.Map) + assert bounds[xi].extent.value == m + +def test_bound_split_ext_less_than_nparts(): + m = 8 + I = te.placeholder((m,), name='I') + EF = te.compute((m,), lambda i: I[i] * 2, name = "EF") + E = te.compute((m,), lambda i: EF[i] * 2, name = "E") + s = te.create_schedule([E.op]) + xo, xi = s[E].split(s[E].op.axis[0], nparts = 32) + s[EF].compute_at(s[E], xo) + + bounds = tvm.te.schedule.InferBound(s) + assert isinstance(bounds, tvm.container.Map) + assert bounds[xo].extent.value == m + def test_bound_split_divisible(): m = te.var('m') l = te.var('l')
From fc7f0783940c362bf48cd46817956381196201e2 Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Wed, 4 Mar 2020 11:24:56 -0800 Subject: [PATCH 71/73] [Torch, QNN] Add support for quantized models via QNN (#4977) * qnn support initial import * fix upsampling num input * imagenet tests added * add quantized module tests * quantized module tests working * imagenet test working * fix lint * remove top level torch import to fix ci error * disable lint warning on outside toplevel import * revert parse -> convert change * add comments to qnn translation * address comments, add sample outputs *
add more comments * refactor bias add and requantize step --- python/tvm/relay/frontend/pytorch.py | 88 ++- python/tvm/relay/frontend/qnn_torch.py | 692 ++++++++++++++++++ tests/python/frontend/pytorch/qnn_test.py | 455 ++++++++++++ tests/python/frontend/pytorch/test_forward.py | 6 + 4 files changed, 1232 insertions(+), 9 deletions(-) create mode 100644 python/tvm/relay/frontend/qnn_torch.py create mode 100644 tests/python/frontend/pytorch/qnn_test.py diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 19bccca34bd1..1bdcf0a1f525 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -19,6 +19,7 @@ # pylint: disable=import-outside-toplevel, simplifiable-if-expression, unnecessary-comprehension """PT: PyTorch frontend.""" import itertools +import logging import numpy as np @@ -32,6 +33,8 @@ from .common import infer_shape as _infer_shape from .common import infer_value as _infer_value +from . import qnn_torch + __all__ = ["from_pytorch"] # operator implementation @@ -146,6 +149,10 @@ def _impl(inputs, input_types): def _relu(): def _impl(inputs, input_types): data = inputs[0] + if input_types[0] == "quint8": + assert len(inputs) == 3, "Input quant param not found in op inputs" + input_zero_point = _expr.const(inputs[2], dtype="int32") + return qnn_torch.quantized_relu(data, input_zero_point) return _op.nn.relu(data) return _impl @@ -154,9 +161,14 @@ def _impl(inputs, input_types): data = inputs[0] output_size = _infer_shape(inputs[1]) - return _op.nn.adaptive_avg_pool2d( - data, - output_size=output_size) + def func(x): + return _op.nn.adaptive_avg_pool2d(x, output_size=output_size) + + if input_types[0] == "quint8": + return qnn_torch.quantized_adaptive_avg_2d(data, func) + + return func(data) + return _impl def _adaptive_max_2d(): @@ -503,7 +515,18 @@ def _impl(inputs, input_types): else: exclude = False - return _op.mean(data, axis, keepdims, exclude) + def func(x): + return _op.mean(x, axis, keepdims, exclude) + + if input_types[0] == "quint8": + assert len(inputs) == 6, "Input quant param not found in op inputs" + input_scale = _expr.const(inputs[4]) + input_zero_point = _expr.const(inputs[5]) + return qnn_torch.quantized_mean(data, input_scale, + input_zero_point, func) + + return func(data) + return _impl def _chunk(): @@ -665,10 +688,40 @@ def _impl(inputs, input_types): else: coord_trans = "half_pixel" - return _op.image.resize(data, out_size, "NCHW", method, coord_trans) + def func(x): + return _op.image.resize(x, out_size, "NCHW", method, coord_trans) + + if input_types[0] == "quint8": + import torch + from packaging import version + + # Torch version > 1.4 changed upsampling API + if version.parse(torch.__version__) > version.parse("1.4.0"): + num_inputs = 7 + else: + num_inputs = 5 + + assert len(inputs) == num_inputs, "Input quant param not found in op inputs" + + input_scale = _expr.const(inputs[-2]) + input_zero_point = _expr.const(inputs[-1]) + return qnn_torch.quantized_upsample(data, input_scale, + input_zero_point, func) + return func(data) return _impl + +def _expand_as(): + def _impl(inputs, input_types): + # TODO: maybe fix this + # This assumes expand_as can be removed because TVM has broadcast op + msg = "aten::expand_as(...) 
found, assume it is part of broadcast op" + logging.warning(msg) + return inputs[0] + return _impl + + # Helper functions for operator implementation def _convert_data_type(input_type): @@ -789,6 +842,7 @@ def _convert_elemwise_input(data, input_type): "aten::detach" : _identity(), "aten::upsample_bilinear2d" : _upsample("bilinear"), "aten::upsample_nearest2d" : _upsample("nearest_neighbor"), + "aten::expand_as" : _expand_as() } @@ -839,6 +893,7 @@ def _report_missing_conversion(op_names): "prim::ListConstruct", "prim::ListUnpack", "prim::TupleConstruct", "prim::TupleUnpack"] known_ops += list(_convert_map.keys()) + known_ops += list(qnn_torch.convert_map.keys()) missing = [op_name for op_name in op_names if op_name not in known_ops] @@ -991,6 +1046,7 @@ def parse_params(graph, state_dict): getattr_nodes = graph.findAllNodes("prim::GetAttr", recurse=True) params = {} param_tensors = {} + packed_param_map = {} seen = set() for node in getattr_nodes: @@ -1003,14 +1059,18 @@ def parse_params(graph, state_dict): full_attr = _getattr_full_name(getattrs) full_attr_node_name = _get_output_name(getattrs[-1]) - if full_attr in state_dict: + if full_attr.endswith("_packed_params"): # for quantized models + err_msg = "parameter %s not found in state dict" % full_attr + assert full_attr in state_dict, err_msg + packed_param_map[full_attr_node_name] = full_attr + elif full_attr in state_dict: torch_tensor = state_dict[full_attr] tensor, var = _get_tensor_and_var(torch_tensor, full_attr_node_name) param_tensors[full_attr_node_name] = tensor params[full_attr_node_name] = var - return params, param_tensors + return params, param_tensors, packed_param_map def parse_operators(operators, outputs, output_index_map, ret_name): @@ -1090,16 +1150,26 @@ def from_pytorch(script_module, input_shapes, custom_convert_map=None): params = script_module.state_dict() input_vars = parse_inputs(graph.inputs(), input_shapes) - param_vars, tensors = parse_params(graph, params) + param_vars, tensors, packed_param_map = parse_params(graph, params) + tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()} input_vars.update(param_vars) outputs = list(input_vars.values()) output_index_map = dict(zip(input_vars.keys(), range(len(outputs)))) ret_name = _get_input_names(graph.return_node())[0] + # For quantized models + if "aten::quantize_per_tensor" in op_names: + weight_quant_params = qnn_torch.get_weight_quant_params(script_module) + qnn_torch.add_input_quant_params_to_op_inputs(graph) + qnn_torch.add_quant_params_to_outputs(outputs, output_index_map, + packed_param_map, + weight_quant_params) + qnn_torch.add_quant_params(tvm_params, weight_quant_params) + _convert_map.update(qnn_torch.convert_map) + body = parse_operators(_get_operator_nodes(graph.nodes()), outputs, output_index_map, ret_name) func = tvm.relay.Function(_analysis.free_vars(body), body) - tvm_params = {k: tvm.nd.array(v) for k, v in tensors.items()} return _module.IRModule.from_expr(func), tvm_params diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py new file mode 100644 index 000000000000..0704e34b77ef --- /dev/null +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -0,0 +1,692 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, import-outside-toplevel +""" Functions to convert quantized torch models to QNN """ + +import numpy as np + +import tvm +from tvm import relay +from tvm.relay import expr as _expr +from tvm.relay import op as _op +from tvm.relay.frontend.common import infer_shape + + +class QNNParam: + """ A placeholder for weight quantization parameters """ + + def __init__(self, weight, bias, scale, zero_point, param_key): + param_prefix = param_key[:-len("._packed_params")] + self.weight_var = _expr.var(param_prefix + "_weight", + shape=weight.shape) + self.weight = weight + + if bias is not None: + self.bias_var = _expr.var(param_prefix + "_bias", + shape=bias.shape) + self.bias = bias.detach().numpy() + else: + self.bias_var = None + self.bias = None + + self.scale = _expr.const(scale) + self.zero_point = _expr.const(zero_point, dtype="int32") + + +def _unpack_quant_params(param_name, packed_params, unpack_func): + # Torch stores quantized params in a custom packed format, + # need to unpack and retrieve them as numpy arrays + qweight, bias = unpack_func(packed_params) + weight_np = qweight.dequantize().numpy() + + import torch + if qweight.qscheme() == torch.per_tensor_affine: + param = QNNParam(weight_np, bias, qweight.q_scale(), + int(qweight.q_zero_point()), param_name) + else: + scales = qweight.q_per_channel_scales().numpy() + zero_points = qweight.q_per_channel_zero_points().numpy() + # This is an assumption posed by QNN + msg = "The values of zero points should be all zero for per channel" + assert np.all(zero_points == 0), msg + param = QNNParam(weight_np, bias, scales, 0, param_name) + + return param + + +def get_weight_quant_params(script_module): + """ Retrieve and unpack weight parameters from quantized modules """ + conv_packed_params = [] + linear_packed_params = [] + + import torch + # conv and linear require different unpacking functions + # extract all conv and linear parameters separately to distinguish them + for name, m in script_module.named_modules(): + if isinstance(m, torch.jit.RecursiveScriptModule): + if "Conv" in m.original_name: + conv_packed_params.append((name, m.state_dict())) + elif m.original_name == "LinearPackedParams": + linear_packed_params.append((name, m.state_dict())) + + pairs = [(torch.ops.quantized.conv2d_unpack, conv_packed_params), + (torch.ops.quantized.linear_unpack, linear_packed_params)] + + quant_params = {} + param_name = "_packed_params" + for unpack_func, params in pairs: + for name, state_dict in params: + assert len(state_dict) == 1 + assert param_name in state_dict + key = name + "." + param_name + packed_param = state_dict[param_name] + quant_params[key] = _unpack_quant_params(key, packed_param, + unpack_func) + + return quant_params + + +def add_quant_params_to_outputs(outputs, output_index_map, + packed_param_map, quant_params): + """ + Add quant params to outputs so that they can be referenced by other + ops later. Weights are quantized here. + """ + for node_name, packed_param_name in packed_param_map.items(): + qparam = quant_params[packed_param_name] + output_index_map[node_name] = len(outputs) + qweight = relay.qnn.op.quantize(qparam.weight_var, qparam.scale, + qparam.zero_point, out_dtype="int8", + axis=0) + param_tup = (qweight, qparam.scale, qparam.zero_point, qparam.bias_var) + outputs.append(param_tup) + +
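Numerically, the relay.qnn.op.quantize call above maps the recovered float weights back onto the int8 grid using the unpacked (scale, zero_point). A NumPy sketch of the same affine mapping, with made-up values (the exact rounding mode may differ from QNN's):

import numpy as np

w = np.array([0.49, -1.0, 0.26], dtype="float32")  # dequantized weights
scale, zero_point = 0.02, 0                        # as recovered by q_scale()/q_zero_point()
q = np.clip(np.round(w / scale) + zero_point, -128, 127).astype("int8")
# q is [24, -50, 13] here; q * scale recovers w up to one rounding step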
+ """ + for node_name, packed_param_name in packed_param_map.items(): + qparam = quant_params[packed_param_name] + output_index_map[node_name] = len(outputs) + qweight = relay.qnn.op.quantize(qparam.weight_var, qparam.scale, + qparam.zero_point, out_dtype="int8", + axis=0) + param_tup = (qweight, qparam.scale, qparam.zero_point, qparam.bias_var) + outputs.append(param_tup) + + +def _get_quant_param_for_input(input_value): + """ + We want to know the input scale and zp of this input_value, since + input quant params are not explicitly passed around in torch (they + are embeded in a QTensor data structure, not visible statically). + We know that it is quantized using output scale and zp + of some previous quantized op. The purpose of this function + is to find that pair of parameters. + """ + # Indices for output scale and zp + # For example, in quantized::conv2d(%input, %1, %2, %3, %4, %5, %6, %7), + # 6th and 7th arg are output scale and zp respectively. + output_quant_param_indices = { + "aten::quantize_per_tensor": (1, 2), + "quantized::conv2d": (6, 7), + "quantized::conv2d_relu": (6, 7), + "quantized::linear": (2, 3), + "quantized::linear_relu": (2, 3), + "quantized::add_relu": (2, 3), + "quantized::add": (2, 3), + "quantized::mul_relu": (2, 3), + "quantized::mul": (2, 3), + "quantized::cat": (2, 3), + "quantized::mul_scalar": (2, 3), + "quantized::add_scalar": (2, 3) + } + + def dfs(current_node): + # trace back to find the producer of this input value + current_op = current_node.kind() + if current_op in output_quant_param_indices: + indices = output_quant_param_indices[current_op] + scale = current_node.inputsAt(indices[0]) + zp = current_node.inputsAt(indices[1]) + return scale, zp + + # Trace back eariler nodes, dfs order + # Assume quantized tensor comes earlier in the args + for arg in current_node.inputs(): + return dfs(arg.node()) + + # shouldn't happen + assert False, "No producer for %s" % (str(current_node)) + + return dfs(input_value.node()) + + +def _get_add_scalar_output_quant_param(input_scale, input_zero_point, + scalar): + """ + Determine the output scale and zp of quantized::add_scalar op + This is used for mobilenet v3 + Refer to aten/src/ATen/native/quantized/cpu/qadd.cpp + The names of variables are the same as torch impl + """ + q_min = 0 + q_max = 255 + s = input_scale + z = input_zero_point + c = scalar + c_q = round(c / s) + + if q_min > z - c_q: + s_prime = (float(q_max) - (z - c_q)) / (float(q_max) - q_min) * s + z_prime = q_min + elif q_max < z - c_q: + s_prime = (float(z - c_q) - q_min) / (float(q_max) - q_min) * s + z_prime = q_max + else: + s_prime = s + z_prime = z - c_q + + return s_prime, z_prime + + +def _get_mul_scalar_output_quant_param(input_scale, input_zero_point, + scalar): + """ + Determine the output scale and zp of quantized::mul_scalar op + This is used for mobilenet v3 + Refer to aten/src/ATen/native/quantized/cpu/qmul.cpp + The names of variables are the same as torch impl + """ + q_min = 0 + q_max = 255 + self_scale = input_scale + self_zero_point = input_zero_point + other_val = scalar + + if other_val > 0.0: + s_prime = other_val * self_scale + z_prime = self_zero_point + elif other_val == 0.0: + s_prime = 1.0 + z_prime = 0 + else: + s_prime = abs(other_val) * self_scale + z_prime = q_max - (self_zero_point - q_min) + + return s_prime, z_prime + + +def _add_output_quant_params_to_scalar_op(node, graph, + input_scale, input_zero_point, + scalar): + """ + The output scale and zp of {add,mul}_scalar op are not explicit in the IR + They are 
required for _get_quant_param_for_input above to work correctly + So calculate these params using the same way torch does, and make new + constant nodes in the input IR. Also add these params to the inputs of + scalar op. + + For example, + %6 : float = prim::Constant[value=3.]() + %input : QUInt8(1, 3, 224, 224) = quantized::add_scalar(%x.1, %6) + becomes + %6 : float = prim::Constant[value=3.]() + %7 : float = prim::Constant[value=0.015686161816120148]() + %8 : int = prim::Constant[value=0]() + %input : UInt8(1, 3, 224, 224) = quantized::add_scalar(%x.1, %6, %7, %8) + + %7 and %8 are newly created output scale and zp constant nodes + """ + import torch + operator = node.kind() + + if operator == "quantized::mul_scalar": + out_scale, out_zero_point = \ + _get_mul_scalar_output_quant_param(input_scale, input_zero_point, + scalar) + elif operator == "quantized::add_scalar": + out_scale, out_zero_point = \ + _get_add_scalar_output_quant_param(input_scale, input_zero_point, + scalar) + else: + raise NotImplementedError("unsupported scalar op: %s" % operator) + + # create new constant nodes and add them to graph + out_scale_node = graph.create("prim::Constant") + out_zero_point_node = graph.create("prim::Constant") + out_scale_node.insertBefore(node) + out_zero_point_node.insertBefore(node) + out_scale_node.f_("value", out_scale) + out_zero_point_node.i_("value", out_zero_point) + out_scale_node.output().setType(torch._C.FloatType.get()) + out_zero_point_node.output().setType(torch._C.IntType.get()) + node.addInput(out_scale_node.output()) + node.addInput(out_zero_point_node.output()) + + +def add_input_quant_params_to_op_inputs(graph): + """ + In Torch, input quant params are not explicitly passed around + Instead, they are stored in QTensor data structure, and retrieved + at runtime by each quantized ops. + However, they need to be known statically for QNN translation. + To workaround and simplify the translation of inputs, we manually add + input quant params to inputs of Torch quantized operators listed below. + See _quantized_conv2d() below for example of why this is helpful. + + For example, + %input : QUInt8(1, 512, 7, 7) = quantized::add(%x.8, %x.9, %434, %435) + becomes + %395 : float = prim::Constant[value=0.036212071776390076]() + %396 : int = prim::Constant[value=0]() + %430 : float = prim::Constant[value=0.16080744564533234]() + %431 : int = prim::Constant[value=42]() + %input : QUInt8(1, 512, 7, 7) = quantized::add(%x.8, %x.9, %434, %435, + %430, %431, %395, %396) + + %434, %435 are output scale and zp of quantized::add op + %430, %431, %395, %396 are two pairs of input (scale, zp) for two tensors + added by this function + """ + # How many quantized tensors each op takes as inputs? 
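+ # (binary ops such as quantized::add consume two quantized tensors and get two + # (scale, zp) pairs appended, while ops such as quantized::conv2d consume one)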
+ # A pair of (scale, zp) for each input quantized tensor will be added + # to the input nodes + num_quantized_inputs = {"quantized::conv2d": 1, + "quantized::conv2d_relu": 1, + "quantized::linear": 1, + "quantized::linear_relu": 1, + "quantized::add_relu": 2, + "quantized::add": 2, + "quantized::mul_relu": 2, + "quantized::mul": 2, + "aten::dequantize": 1, + "aten::mean": 1, + "aten::upsample_bilinear2d": 1, + "aten::relu_": 1, + "aten::relu": 1, + "quantized::add_scalar": 1, + "quantized::mul_scalar": 1, + 'quantized::relu6': 1} + + need_input_quant_param = set(num_quantized_inputs.keys()) + need_input_quant_param.add("quantized::cat") + + for node in graph.nodes(): + operator = node.kind() + if operator not in need_input_quant_param: + continue + + input_scales = [] + input_zero_points = [] + + if operator == "quantized::cat": + # the number of inputs to concat is not constant + # so handle it separately + inputs = node.inputsAt(0).node().inputs() + for inp in inputs: + scale, zp = _get_quant_param_for_input(inp) + input_scales.append(scale) + input_zero_points.append(zp) + else: + for i in range(num_quantized_inputs[operator]): + scale, zp = _get_quant_param_for_input(node.inputsAt(i)) + input_scales.append(scale) + input_zero_points.append(zp) + + if operator in ["quantized::add_scalar", "quantized::mul_scalar"]: + scalar = node.inputsAt(1).node().f("value") + inp_scale = input_scales[0].node().f("value") + inp_zero_point = input_zero_points[0].node().i("value") + + # see the comments in this function above + _add_output_quant_params_to_scalar_op(node, graph, + inp_scale, inp_zero_point, + scalar) + + for scale, zp in zip(input_scales, input_zero_points): + node.addInput(scale) + node.addInput(zp) + + +def add_quant_params(params, quant_params): + """ Add quant parameters to TVM param map """ + for qparam in quant_params.values(): + params[qparam.weight_var.name_hint] = tvm.nd.array(qparam.weight) + if qparam.bias is not None: + params[qparam.bias_var.name_hint] = tvm.nd.array(qparam.bias) + + +def quantized_adaptive_avg_2d(data, func_fp32): + # this follows tflite impl + inp = _op.cast(data, dtype="int32") + out = func_fp32(inp) + return _op.cast(out, "uint8") + + +def quantized_mean(data, input_scale, input_zero_point, func_fp32): + # refer to aten/src/ATen/native/quantized/cpu/qreduction.cpp + dequantized = relay.qnn.op.dequantize(data, input_scale, input_zero_point) + out = func_fp32(dequantized) + return relay.qnn.op.quantize(out, input_scale, input_zero_point, + out_dtype="uint8", axis=1) + + +def quantized_upsample(data, input_scale, input_zero_point, func_fp32): + # currently piggy backs to fp32, it gets identical output as torch + data = relay.qnn.op.dequantize(data, input_scale, input_zero_point) + out = func_fp32(data) + return relay.qnn.op.quantize(out, input_scale, input_zero_point, + out_dtype="uint8", axis=1) + + +def quantized_relu(data, input_zero_point): + # refer to aten/src/ATen/native/quantized/cpu/qrelu.cpp + zp = _op.cast(input_zero_point, dtype="uint8") + return _op.tensor.maximum(data, zp) + + +def _quantize_per_tensor(): + def _impl(inputs, _): + return relay.qnn.op.quantize(inputs[0], _expr.const(inputs[1]), + _expr.const(inputs[2]), out_dtype="uint8", + axis=1) + return _impl + + +def _dequantize(): + def _impl(inputs, _): + assert len(inputs) == 3, "Input quant params not found in op inputs" + inp_scale = _expr.const(inputs[1]) + inp_zero_point = _expr.const(inputs[2]) + return relay.qnn.op.dequantize(inputs[0], inp_scale, inp_zero_point) + return _impl + 
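The requantize step used by _do_bias_and_requantize below rescales the int32 accumulator onto the uint8 output grid. An idealized floating-point sketch of that mapping, with assumed values (qnn.requantize itself performs this in fixed-point arithmetic):

import numpy as np

acc = np.array([1234, -567], dtype="int32")  # conv/dense output plus bias, int32
in_scale = 0.5 * 0.02                        # input_scale * weight_scale
out_scale, out_zp = 0.1, 128
q = np.round(acc * (in_scale / out_scale)) + out_zp
q = np.clip(q, 0, 255).astype("uint8")       # -> [251, 71]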
+ +def _get_numpy(relay_const_scalar): + return relay_const_scalar.data.asnumpy() + + +def _get_scalar(relay_const_scalar): + return np.asscalar(_get_numpy(relay_const_scalar)) + + +def _do_bias_and_requantize(output, bias, input_scale, weight_scale, + output_scale, output_zero_point, + with_relu): + """ Output processing for conv and linear """ + # this is a vector for per channel case + requant_input_scale = _expr.const(_get_numpy(input_scale) * + _get_numpy(weight_scale)) + # Torch does bias add and requanize scale in fp32 + # refer to third_party/fbgemm/include/fbgemm/OutputProcessing-inl.h + # Instead, we do bias add in int32 and use qnn requantize, which needs + # integer input. + # We observed no loss in accuracy in doing this way, and it is better + # for tvm because bias quantization can be done at compile time + # Instead, the torch way requires rounding of activation at runtime + + if bias is not None: + qbias = relay.qnn.op.quantize(bias, requant_input_scale, + _expr.const(0, "int32"), + out_dtype="int32", axis=0) + requantize_input = _op.nn.bias_add(output, qbias) + else: + requantize_input = output + + requantized = relay.qnn.op.requantize(requantize_input, + requant_input_scale, + relay.const(0, 'int32'), + output_scale, output_zero_point, + out_dtype="int32", axis=1) + clip_min = 0 + if with_relu: + clip_min = _get_scalar(output_zero_point) + + clip = _op.tensor.clip(requantized, clip_min, 255.) + return _op.cast(clip, dtype="uint8") + + +def _quantized_conv2d(with_relu=False): + def _impl(inputs, _): + # refer to src/ATen/native/quantized/cpu/qconv.cpp + # inputs[0]: input tensor + # inputs[1]: (weight, scale, zero_point, bias) + # inputs[2-5]: stride, padding, dilation, groups + # inputs[6]: output_scale + # inputs[7]: output_zero_point + # inputs[8]: input_scale (added manually by frontend) + # inputs[9]: input_zero_point (added manually by frontend) + weight = inputs[1][0] + weight_scale = inputs[1][1] + weight_zero_point = inputs[1][2] + + output_scale = _expr.const(inputs[6]) + output_zero_point = _expr.const(inputs[7]) + + assert len(inputs) == 10, "Input quant params not found in op inputs" + # These are manually added by add_input_quant_params_to_op_inputs above + # In torch, they are retrieved from QTensor data structure at runtime + input_scale = _expr.const(inputs[8]) + input_zero_point = _expr.const(inputs[9]) + + strides, padding, dilation = inputs[2], inputs[3], inputs[4] + strides = infer_shape(inputs[2]) + padding = infer_shape(inputs[3]) + dilation = infer_shape(inputs[4]) + groups = inputs[5] + + weight_shape = infer_shape(weight) + kernel_size = (weight_shape[2], weight_shape[3]) + out_channels = weight_shape[0] + + if padding[0] != 0 or padding[1] != 0: + pad_val = _get_scalar(input_zero_point) + inp = _op.nn.pad(inputs[0], pad_width=((0, 0), + (0, 0), + (padding[0], padding[0]), + (padding[1], padding[1])), + pad_value=float(pad_val)) + else: + inp = inputs[0] + + # padding is (0, 0) because we did explicit pad op with + # pad value being zero point above + conv_out = relay.qnn.op.conv2d(inp, weight, + input_zero_point, weight_zero_point, + input_scale, weight_scale, + kernel_size=kernel_size, + dilation=dilation, strides=strides, + padding=(0, 0), groups=groups, + channels=out_channels) + bias_var = inputs[1][3] + + return _do_bias_and_requantize(conv_out, bias_var, input_scale, + weight_scale, output_scale, + output_zero_point, with_relu) + + return _impl + + +def _linear(with_relu=False): + # similar to conv + def _impl(inputs, _): + weight = 
+
+
+def _quantized_conv2d(with_relu=False):
+    def _impl(inputs, _):
+        # refer to src/ATen/native/quantized/cpu/qconv.cpp
+        # inputs[0]: input tensor
+        # inputs[1]: (weight, scale, zero_point, bias)
+        # inputs[2-5]: stride, padding, dilation, groups
+        # inputs[6]: output_scale
+        # inputs[7]: output_zero_point
+        # inputs[8]: input_scale (added manually by frontend)
+        # inputs[9]: input_zero_point (added manually by frontend)
+        weight = inputs[1][0]
+        weight_scale = inputs[1][1]
+        weight_zero_point = inputs[1][2]
+
+        output_scale = _expr.const(inputs[6])
+        output_zero_point = _expr.const(inputs[7])
+
+        assert len(inputs) == 10, "Input quant params not found in op inputs"
+        # These are manually added by add_input_quant_params_to_op_inputs above
+        # In torch, they are retrieved from the QTensor data structure at runtime
+        input_scale = _expr.const(inputs[8])
+        input_zero_point = _expr.const(inputs[9])
+
+        strides = infer_shape(inputs[2])
+        padding = infer_shape(inputs[3])
+        dilation = infer_shape(inputs[4])
+        groups = inputs[5]
+
+        weight_shape = infer_shape(weight)
+        kernel_size = (weight_shape[2], weight_shape[3])
+        out_channels = weight_shape[0]
+
+        if padding[0] != 0 or padding[1] != 0:
+            pad_val = _get_scalar(input_zero_point)
+            inp = _op.nn.pad(inputs[0], pad_width=((0, 0),
+                                                   (0, 0),
+                                                   (padding[0], padding[0]),
+                                                   (padding[1], padding[1])),
+                             pad_value=float(pad_val))
+        else:
+            inp = inputs[0]
+
+        # padding is (0, 0) because we did an explicit pad op above,
+        # with the pad value being the zero point
+        conv_out = relay.qnn.op.conv2d(inp, weight,
+                                       input_zero_point, weight_zero_point,
+                                       input_scale, weight_scale,
+                                       kernel_size=kernel_size,
+                                       dilation=dilation, strides=strides,
+                                       padding=(0, 0), groups=groups,
+                                       channels=out_channels)
+        bias_var = inputs[1][3]
+
+        return _do_bias_and_requantize(conv_out, bias_var, input_scale,
+                                       weight_scale, output_scale,
+                                       output_zero_point, with_relu)
+
+    return _impl
+
+
+def _linear(with_relu=False):
+    # similar to conv
+    def _impl(inputs, _):
+        weight = inputs[1][0]
+        weight_scale = inputs[1][1]
+        weight_zero_point = inputs[1][2]
+        output_scale = _expr.const(inputs[2])
+        output_zero_point = _expr.const(inputs[3])
+        assert len(inputs) == 6, "Input quant params not found in op inputs"
+        # Manually added by add_input_quant_params_to_op_inputs above
+        input_scale = _expr.const(inputs[4])
+        input_zero_point = _expr.const(inputs[5])
+
+        weight_shape = infer_shape(weight)
+        dense = relay.qnn.op.dense(inputs[0], weight,
+                                   input_zero_point, weight_zero_point,
+                                   input_scale, weight_scale,
+                                   units=weight_shape[0])
+        bias_var = inputs[1][3]
+
+        return _do_bias_and_requantize(dense, bias_var, input_scale,
+                                       weight_scale, output_scale,
+                                       output_zero_point, with_relu)
+
+    return _impl
+
+
+def _binop(relay_op, with_relu=False):
+    # refer to aten/src/ATen/native/quantized/cpu/{qadd, qmul}.cpp
+    # they piggyback on fp32 math: dequantize -> fp32 math -> quantize
+    def _impl(inputs, _):
+        output_scale = _expr.const(inputs[2])
+        output_zero_point = _expr.const(inputs[3])
+        assert len(inputs) == 8, "Input quant params not found in op inputs"
+        # Manually added by add_input_quant_params_to_op_inputs above
+        input_scale_lhs = _expr.const(inputs[4])
+        input_zero_point_lhs = _expr.const(inputs[5])
+        input_scale_rhs = _expr.const(inputs[6])
+        input_zero_point_rhs = _expr.const(inputs[7])
+        lhs = inputs[0]
+        rhs = inputs[1]
+
+        # if the input is a fresh quantize op, reuse its fp32 argument
+        # instead of dequantizing again
+        if isinstance(lhs, _expr.Call) and lhs.op.name == "qnn.quantize":
+            lhs = lhs.args[0]
+        else:
+            lhs = relay.qnn.op.dequantize(lhs,
+                                          input_scale_lhs,
+                                          input_zero_point_lhs)
+
+        if isinstance(rhs, _expr.Call) and rhs.op.name == "qnn.quantize":
+            rhs = rhs.args[0]
+        else:
+            rhs = relay.qnn.op.dequantize(rhs,
+                                          input_scale_rhs,
+                                          input_zero_point_rhs)
+        fp32_out = relay_op(lhs, rhs)
+
+        if with_relu:
+            fp32_out = _op.nn.relu(fp32_out)
+
+        return relay.qnn.op.quantize(fp32_out,
+                                     output_scale,
+                                     output_zero_point,
+                                     axis=-1,
+                                     out_dtype="uint8")
+    return _impl
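The dequantize -> fp32 -> quantize detour exists because the two operands generally carry different scales, so their stored integers cannot be combined directly. A tiny sketch with hypothetical scales and zero points of 0:

s_lhs, s_rhs, s_out = 0.1, 0.05, 0.25
q_lhs, q_rhs = 10, 10  # same stored integer, but they represent 1.0 and 0.5

# dequantize, add in fp32, requantize to the output scale
q_out = round((s_lhs * q_lhs + s_rhs * q_rhs) / s_out)
print(q_out, q_out * s_out)  # 6 1.5, the true sum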
+
+
+def _cat():
+    # refer to aten/src/ATen/native/quantized/cpu/qconcat.cpp
+    # for concat they also piggyback on fp32(!):
+    # dequantize -> fp32 math -> quantize
+    # we could also use the QNN concat op; we observed no change in accuracy
+    def _impl(inputs, _):
+        axis = inputs[1]
+        output_scale = _expr.const(inputs[2])
+        output_zero_point = _expr.const(inputs[3])
+        num_inputs = (len(inputs) - 4) // 2
+        dequantized = []
+
+        for i in range(0, num_inputs):
+            inp_scale = _expr.const(inputs[4+i*2])
+            inp_zp = _expr.const(inputs[4+i*2+1])
+            dequantized.append(relay.qnn.op.dequantize(inputs[0][i],
+                                                       inp_scale, inp_zp))
+
+        concat = _op.tensor.concatenate(dequantized, axis=axis)
+        return relay.qnn.op.quantize(concat, output_scale, output_zero_point,
+                                     axis=1, out_dtype="uint8")
+
+    return _impl
+
+
+def _add_scalar():
+    # this is used for mobilenet v3
+    def _impl(inputs, _):
+        # refer to aten/src/ATen/native/quantized/cpu/qadd.cpp
+        assert len(inputs) == 6, "Input quant params not found in op inputs"
+        s = inputs[4]
+        z = inputs[5]
+        c = inputs[1]
+        c_q = round(c / s)
+        q_min = 0
+        q_max = 255
+
+        # the math for calculating the output scale and zp is already done
+        # during _add_output_quant_params_to_scalar_op above
+        out_scale = _expr.const(inputs[2])
+        out_zp = _expr.const(inputs[3])
+
+        if q_min > z - c_q or q_max < z - c_q:
+            # the shifted zero point falls outside [q_min, q_max],
+            # so an explicit requantize is needed
+            dequant = relay.qnn.op.dequantize(inputs[0],
+                                              _expr.const(s), _expr.const(z))
+            dequantized_add = _op.tensor.add(dequant, _expr.const(c_q * s))
+            return relay.qnn.op.quantize(dequantized_add, out_scale, out_zp,
+                                         axis=1, out_dtype="uint8")
+        # only the quant params change; the stored integers stay the same
+        return inputs[0]
+
+    return _impl
+
+
+def quantize_scalar(data, scale, zero_point):
+    # used to quantize 6.0 in mobilenet v3
+    transformed = zero_point + data / scale
+    return max(0, min(round(transformed), 255))
+
+
+def _relu6():
+    # refer to src/ATen/native/quantized/cpu/qrelu.cpp
+    def _impl(inputs, _):
+        assert len(inputs) == 4, "Input quant params not found in op inputs"
+        input_scale = inputs[2]
+        input_zero_point = inputs[3]
+        six = quantize_scalar(6., input_scale, input_zero_point)
+        return _op.tensor.clip(inputs[0], input_zero_point, six)
+    return _impl
+
+
+def _mul_scalar():
+    # this is used for mobilenet v3
+    def _impl(inputs, _):
+        # refer to aten/src/ATen/native/quantized/cpu/qmul.cpp
+        # the math for calculating the output scale and zp is already done
+        # during _add_output_quant_params_to_scalar_op above
+        assert len(inputs) == 6, "Input quant params not found in op inputs"
+        other_val = inputs[1]  # scalar
+
+        if other_val > 0.0:
+            # only the scale changes
+            return inputs[0]
+        if other_val == 0.0:
+            shape = infer_shape(inputs[0])
+            return _op.full(_expr.const(0), shape, dtype="uint8")
+
+        # negative scale case: mirror the stored integers around the
+        # quantized range, q -> 255 - q, via int8 wraparound
+        q_min = 0
+        q_max = 255
+        bias = _expr.const(q_max + q_min, dtype="int8")
+        int8 = bias - _op.cast(inputs[0], "int8")
+        return _op.cast(int8, "uint8")
+
+    return _impl
+
+
+convert_map = {
+    "aten::quantize_per_tensor": _quantize_per_tensor(),
+    "quantized::conv2d_relu": _quantized_conv2d(True),
+    "aten::dequantize": _dequantize(),
+    "quantized::conv2d": _quantized_conv2d(),
+    "quantized::add_relu": _binop(relay.add, True),
+    "quantized::add": _binop(relay.add),
+    "quantized::mul_relu": _binop(relay.multiply, True),
+    "quantized::mul": _binop(relay.multiply),
+    "quantized::linear": _linear(),
+    "quantized::linear_relu": _linear(True),
+    "quantized::cat": _cat(),
+    "quantized::add_scalar": _add_scalar(),
+    "quantized::mul_scalar": _mul_scalar(),
+    "quantized::relu6": _relu6()
+}
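To make the scalar-op logic concrete, here is the arithmetic of _add_scalar and quantize_scalar traced by hand with hypothetical input quant params (plain Python, runnable on its own):

s, z, c = 0.1, 0, 3.0          # input scale, input zero point, scalar to add

c_q = round(c / s)             # 30: the scalar expressed in integer units
out_of_range = (0 > z - c_q) or (255 < z - c_q)
print(out_of_range)            # True: the zero point would shift to -30,
                               # so _add_scalar takes the requantize branch

six = max(0, min(round(z + 6.0 / s), 255))
print(six)                     # 60: the upper clip bound _relu6 computes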
diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py
new file mode 100644
index 000000000000..e3a876c79591
--- /dev/null
+++ b/tests/python/frontend/pytorch/qnn_test.py
@@ -0,0 +1,455 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Tests on quantized torch model conversion """
+import os
+
+from PIL import Image
+
+import numpy as np
+
+import torch
+from torch import nn
+from torch.quantization import QuantStub, DeQuantStub
+from torch.quantization import fuse_modules, QuantWrapper
+
+import tvm
+from tvm import relay
+from tvm.relay.frontend.pytorch import get_graph_input_names
+from tvm.contrib.download import download_testdata
+
+
+def torch_version_check():
+    from packaging import version
+    return version.parse(torch.__version__) > version.parse("1.4.0")
+
+
+def get_tvm_runtime(script_module, input_name, ishape):
+    input_shapes = {input_name: ishape}
+    mod, params = relay.frontend.from_pytorch(script_module, input_shapes)
+
+    with relay.build_config(opt_level=3):
+        # test on cpu only for now; torch cannot run quantized models on
+        # cuda, and this also keeps CI fast
+        json, lib, params = relay.build(mod, target="llvm", params=params)
+
+    runtime = tvm.contrib.graph_runtime.create(json, lib, tvm.cpu(0))
+    runtime.set_input(**params)
+    return runtime
+
+
+def get_qconfig(per_channel):
+    from torch.quantization.observer import MovingAverageMinMaxObserver
+    from torch.quantization.observer import default_weight_observer
+
+    if per_channel:
+        return torch.quantization.get_default_qconfig("fbgemm")
+    else:
+        act = MovingAverageMinMaxObserver.with_args(reduce_range=False)
+        return torch.quantization.QConfig(activation=act,
+                                          weight=default_weight_observer)
+
+
+def quantize_model(model, inp, per_channel=False, dummy=True):
+    model.fuse_model()
+    model.qconfig = get_qconfig(per_channel)
+    torch.quantization.prepare(model, inplace=True)
+    model(inp)
+    torch.quantization.convert(model, inplace=True)
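Together these helpers implement the usual post-training quantization flow: fuse modules, attach a qconfig, prepare (insert observers), run a calibration batch, then convert. A sketch of how the tests below drive them; ConvBn is one of the test modules defined later in this file:

model = ConvBn().eval()
inp = torch.rand(1, 3, 224, 224)
quantize_model(model, inp, per_channel=False)  # calibrates on the one batch
script_module = torch.jit.trace(model, inp).eval()

input_name = get_graph_input_names(script_module)[0]
runtime = get_tvm_runtime(script_module, input_name, (1, 3, 224, 224))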
+
+
+class ConvBn(nn.Module):
+    def __init__(self, with_relu=False):
+        super().__init__()
+        layers = [nn.Conv2d(3, 32, 3, bias=True),
+                  nn.BatchNorm2d(32)]
+        if with_relu:
+            layers.append(nn.ReLU())
+        self.conv = nn.Sequential(*layers)
+        self.quant_wrap = QuantWrapper(self.conv)
+        self.with_relu = with_relu
+
+    def forward(self, x):
+        return self.quant_wrap(x)
+
+    def fuse_model(self):
+        indices = ["0", "1"]
+        if self.with_relu:
+            indices.append("2")
+        fuse_modules(self.conv, indices, inplace=True)
+
+
+class Linear(nn.Module):
+    def __init__(self, with_relu=False):
+        super().__init__()
+        layers = [nn.Linear(16, 32)]
+        if with_relu:
+            layers.append(nn.ReLU())
+        self.fc = nn.Sequential(*layers)
+        self.quant_wrap = QuantWrapper(self.fc)
+        self.with_relu = with_relu
+
+    def forward(self, x):
+        return self.quant_wrap(x)
+
+    def fuse_model(self):
+        if self.with_relu:
+            fuse_modules(self.fc, ["0", "1"], inplace=True)
+
+
+class ReLU(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.relu = QuantWrapper(nn.ReLU())
+
+    def forward(self, x):
+        return self.relu(x)
+
+    def fuse_model(self):
+        pass
+
+
+# Mobilenet V3 related modules
+class Hsigmoid(nn.Module):
+    def __init__(self, inplace=True, add_stub=False):
+        super().__init__()
+        self.float_op = nn.quantized.FloatFunctional()
+        self.relu6 = nn.ReLU6(inplace=inplace)
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+        self.add_stub = add_stub
+
+    def forward(self, x):
+        if self.add_stub:
+            x = self.quant(x)
+        relu6 = self.relu6(self.float_op.add_scalar(x, 3.))
+        mul = self.float_op.mul_scalar(relu6, 1/6.)
+        if self.add_stub:
+            mul = self.dequant(mul)
+        return mul
+
+    def fuse_model(self):
+        pass
+
+
+class Hswish(nn.Module):
+    def __init__(self, inplace=True, add_stub=False):
+        super().__init__()
+        self.float_op = nn.quantized.FloatFunctional()
+        self.hsigmoid = Hsigmoid(inplace, add_stub=False)
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+        self.add_stub = add_stub
+
+    def forward(self, x):
+        if self.add_stub:
+            x = self.quant(x)
+        mul = self.float_op.mul(x, self.hsigmoid(x))
+        if self.add_stub:
+            mul = self.dequant(mul)
+        return mul
+
+    def fuse_model(self):
+        pass
+
+
+class SqueezeExcite(nn.Module):
+    def __init__(self, channel, reduction=4, add_stub=False):
+        super().__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel, bias=False),
+            Hsigmoid(add_stub=False)
+        )
+        self.fmul = nn.quantized.FloatFunctional()
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+        self.add_stub = add_stub
+
+    def forward(self, x):
+        b, c, _, _ = x.size()
+        if self.add_stub:
+            x = self.quant(x)
+        y = self.avg_pool(x).view(b, c)
+        y = self.fc(y).view(b, c, 1, 1)
+        out = self.fmul.mul(x, y.expand_as(x))
+        if self.add_stub:
+            return self.dequant(out)
+        else:
+            return out
+
+    def fuse_model(self):
+        fuse_modules(self.fc, ["0", "1"], inplace=True)
+
+
+# tests quantized::mul_scalar with a negative scale
+class MulScalarNegative(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.float_op = nn.quantized.FloatFunctional()
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+
+    def forward(self, x):
+        x = self.quant(x)
+        mul = self.float_op.mul_scalar(x, -0.3)
+        return self.dequant(mul)
+
+    def fuse_model(self):
+        pass
+
+
+class UpsamplingBilinear(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.quant = QuantStub()
+        self.dequant = DeQuantStub()
+
+    def forward(self, x):
+        x = self.quant(x)
+        upsample = nn.functional.interpolate(x, scale_factor=2,
+                                             mode='bilinear',
+                                             align_corners=True)
+        return self.dequant(upsample)
+
+    def fuse_model(self):
+        pass
+
+
+def test_quantized_modules():
+    imagenet_ishape = (1, 3, 224, 224)
+
+    qmodules = [
+        ("relu", imagenet_ishape, ReLU(), False),
+        ("upsample bilinear", (1, 3, 64, 64), UpsamplingBilinear(), False),
+    ]
+
+    for per_channel in [False, True]:
+        if per_channel:
+            postfix = ", per_channel"
+        else:
+            postfix = ""
+
+        qmodules += [
+            ("conv_bn" + postfix, imagenet_ishape, ConvBn(), per_channel),
+            ("conv_bn_relu" + postfix, imagenet_ishape, ConvBn(with_relu=True), per_channel),
+            ("linear" + postfix, (16, 16), Linear(), per_channel),
+            ("linear_relu" + postfix, (16, 16), Linear(with_relu=True), per_channel)
+        ]
+
+    if torch_version_check():
+        qmodules += [
+            ("hsigmoid", imagenet_ishape, Hsigmoid(add_stub=True), False),
+            ("hswish", imagenet_ishape, Hswish(add_stub=True), False),
+            ("semodule", (1, 16, 64, 64), SqueezeExcite(16, add_stub=True), False),
+            ("semodule, per_channel", (1, 16, 64, 64), SqueezeExcite(16, add_stub=True), True),
+            ("mul_scalar negative", imagenet_ishape, MulScalarNegative(), False)
+        ]
+    else:
+        print("Skipping tests that require torch > 1.4")
+
+    for (module_name, ishape, raw_module, per_channel) in qmodules:
+        raw_module.eval()
+        inp = torch.rand(ishape)
+
+        quantize_model(raw_module, inp, per_channel=per_channel, dummy=True)
+        script_module = torch.jit.trace(raw_module, inp).eval()
+
+        with torch.no_grad():
+            pt_result = script_module(inp.clone()).numpy()
+
+        input_name = get_graph_input_names(script_module)[0]
+
+        runtime = get_tvm_runtime(script_module, input_name, ishape)
+        runtime.set_input(input_name, inp.numpy().copy())
+        runtime.run()
+        tvm_result = runtime.get_output(0).asnumpy()
+
+        max_abs_diff = np.max(np.abs(tvm_result - pt_result))
+        mean_abs_diff = np.mean(np.abs(tvm_result - pt_result))
+        num_identical = np.sum(tvm_result == pt_result)
+        match_ratio = num_identical / float(np.prod(tvm_result.shape))
+
+        print(module_name, max_abs_diff, mean_abs_diff, match_ratio)
+
+    # sample outputs
+    """
+    relu 0.0039215684 2.6052087e-08 0.9999933567176871
+    upsample bilinear 0.0 0.0 1.0
+    conv_bn 0.22062653 0.011478779 0.6909348115006899
+    conv_bn_relu 0.3700896 0.010921672 0.7489366477964451
+    linear 0.15987062 0.009231662 0.794921875
+    linear_relu 0.14180502 0.0053220326 0.8828125
+    conv_bn, per_channel 0.01654929 2.9486866e-06 0.9998218235127019
+    conv_bn_relu, per_channel 0.009089053 1.4926576e-06 0.9998357732732732
+    linear, per_channel 0.0 0.0 1.0
+    linear_relu, per_channel 0.0 0.0 1.0
+    hsigmoid 0.002614379 0.00020525524 0.9214896896258503
+    hswish 0.0052286386 0.00063522335 0.7587359162414966
+    semodule, per_channel 0.0039885044 0.0008620687 0.7838592529296875
+    mul_scalar negative 0.0011764616 7.815566e-09 0.9999933567176871
+    """
+
+    # we cannot make any guarantee on how close the raw output is to torch
+    # tvm.testing.assert_allclose(tvm_result, pt_result, rtol=1e-1, atol=1e-1)
+
+
+def test_quantized_imagenet():
+    def get_transform():
+        import torchvision.transforms as transforms
+        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                         std=[0.229, 0.224, 0.225])
+        return transforms.Compose([
+            transforms.Resize(256),
+            transforms.CenterCrop(224),
+            transforms.ToTensor(),
+            normalize,
+        ])
+
+    def get_real_image(im_height, im_width):
+        repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/InceptionV1/'
+        img_name = 'elephant-299.jpg'
+        image_url = os.path.join(repo_base, img_name)
+        img_path = download_testdata(image_url, img_name, module='data')
+        return Image.open(img_path).resize((im_height, im_width))
+
+    def get_imagenet_input():
+        im = get_real_image(224, 224)
+        preprocess = get_transform()
+        pt_tensor = preprocess(im)
+        return np.expand_dims(pt_tensor.numpy(), 0)
+
+    from torchvision.models.quantization import resnet as qresnet
+    from torchvision.models.quantization import mobilenet as qmobilenet
+    from torchvision.models.quantization import inception as qinception
+    from torchvision.models.quantization import googlenet as qgooglenet
+
+    qmodels = []
+
+    for per_channel in [False, True]:
+        qmodels += [
+            ("resnet18", qresnet.resnet18(pretrained=True), per_channel),
+            ("mobilenet_v2", qmobilenet.mobilenet_v2(pretrained=True), per_channel),
+            ("inception_v3", qinception.inception_v3(pretrained=True), per_channel),
+            ("googlenet", qgooglenet(pretrained=True), per_channel),
+        ]
+
+    results = []
+
+    for (model_name, raw_model, per_channel) in qmodels:
+        raw_model.eval()
+
+        if per_channel:
+            model_name += ", per channel quantization"
+        else:
+            model_name += ", per tensor quantization"
+
+        inp = get_imagenet_input()
+        pt_inp = torch.from_numpy(inp)
+
+        quantize_model(raw_model, pt_inp, per_channel=per_channel, dummy=False)
+        script_module = torch.jit.trace(raw_model, pt_inp).eval()
+
+        with torch.no_grad():
+            pt_result = script_module(pt_inp).numpy()
+
+        input_name = get_graph_input_names(script_module)[0]
+        runtime = get_tvm_runtime(script_module, input_name, (1, 3, 224, 224))
+        runtime.set_input(input_name, inp)
+        runtime.run()
+
+        tvm_result = runtime.get_output(0).asnumpy()
+
+        results.append((model_name, pt_result[0], tvm_result[0]))
+
+    for (model_name, pt_result, tvm_result) in results:
+        max_abs_diff = np.max(np.abs(tvm_result - pt_result))
+        mean_abs_diff = np.mean(np.abs(tvm_result - pt_result))
+        num_identical = np.sum(tvm_result == pt_result)
+        pt_top3_labels = np.argsort(pt_result)[::-1][:3]
+        tvm_top3_labels = np.argsort(tvm_result)[::-1][:3]
+
+        print("\nModel name: %s" % model_name)
+        print("PyTorch top3 label:", pt_top3_labels)
+        print("TVM top3 label:", tvm_top3_labels)
+        print("max abs diff:", max_abs_diff)
+        print("mean abs_diff:", mean_abs_diff)
+        print("%d in 1000 raw outputs identical." % num_identical)
+
+        assert set(pt_top3_labels) == set(tvm_top3_labels)
+
+    # sample outputs
+    """
+    Model name: resnet18, per tensor quantization
+    PyTorch top3 label: [386 101 385]
+    TVM top3 label: [386 101 385]
+    max abs diff: 0.65681696
+    mean abs_diff: 0.14055882
+    236 in 1000 raw outputs identical.
+
+    Model name: mobilenet_v2, per tensor quantization
+    PyTorch top3 label: [101 386 385]
+    TVM top3 label: [101 386 385]
+    max abs diff: 2.1262953
+    mean abs_diff: 0.41025686
+    101 in 1000 raw outputs identical.
+
+    Model name: inception_v3, per tensor quantization
+    PyTorch top3 label: [101 386 385]
+    TVM top3 label: [101 386 385]
+    max abs diff: 0.9994669
+    mean abs_diff: 0.098697364
+    272 in 1000 raw outputs identical.
+
+    Model name: googlenet, per tensor quantization
+    PyTorch top3 label: [101 386 385]
+    TVM top3 label: [101 386 385]
+    max abs diff: 0.28248847
+    mean abs_diff: 0.0634469
+    274 in 1000 raw outputs identical.
+
+    Model name: resnet18, per channel quantization
+    PyTorch top3 label: [101 386 385]
+    TVM top3 label: [101 386 385]
+    max abs diff: 0.65908074
+    mean abs_diff: 0.1274223
+    469 in 1000 raw outputs identical.
+
+    Model name: mobilenet_v2, per channel quantization
+    PyTorch top3 label: [101 386 385]
+    TVM top3 label: [101 386 385]
+    max abs diff: 0.71120834
+    mean abs_diff: 0.15883648
+    423 in 1000 raw outputs identical.
+
+    Model name: inception_v3, per channel quantization
+    PyTorch top3 label: [386 101 385]
+    TVM top3 label: [386 101 385]
+    max abs diff: 1.3372154
+    mean abs_diff: 0.1225224
+    401 in 1000 raw outputs identical.
+
+    Model name: googlenet, per channel quantization
+    PyTorch top3 label: [101 386 385]
+    TVM top3 label: [101 386 385]
+    max abs diff: 0.34015465
+    mean abs_diff: 0.054197952
+    558 in 1000 raw outputs identical.
+    """
diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py
index c2ff94de546f..641f5c9f99dd 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -849,3 +849,9 @@ def forward(self, inp):
     test_custom_conversion_map()
 
     test_segmentaton_models()
+
+    # Quantization test
+    from qnn_test import test_quantized_imagenet, test_quantized_modules
+
+    test_quantized_modules()
+    test_quantized_imagenet()

From 7a06bbed74eb81c0c1ac267de95b9b5881d13c2f Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Wed, 4 Mar 2020 18:34:08 -0600
Subject: [PATCH 72/73] Fix gpu not found when running TVM docker (#4975)

---
 docker/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/build.sh b/docker/build.sh
index a2e21d140f6f..21141b71ca7d 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -79,7 +79,7 @@ if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then
 fi
 
 # Use nvidia-docker if the container is GPU.
-if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* ]]; then
+if [[ "${CONTAINER_TYPE}" == *"gpu"* ]]; then
     if ! type "nvidia-docker" 1> /dev/null 2> /dev/null
     then
         DOCKER_BINARY="docker"

From fe74b37ab578e6d3c540b0f6ac187a220ccc028a Mon Sep 17 00:00:00 2001
From: Tianqi Chen
Date: Wed, 4 Mar 2020 18:35:38 -0600
Subject: [PATCH 73/73] Conditions updated to cover better user scenarios
 (#4951)

* Conditions updated to cover better user scenarios

* [1] New test case added

* [2] New test case added

* [3] Proper variable name used

* [4] Review comments handled

* [5] Review comments handled

* [6] Review comments handled
---
 src/relay/ir/alpha_equal.cc                 | 10 +--
 tests/cpp/relay_pass_alpha_equal.cc         | 67 +++++++++++++++++++++
 tests/python/relay/test_pass_alpha_equal.py | 32 ++++++++++
 3 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 tests/cpp/relay_pass_alpha_equal.cc

diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
index 78688d7dc730..c622599dd89c 100644
--- a/src/relay/ir/alpha_equal.cc
+++ b/src/relay/ir/alpha_equal.cc
@@ -50,14 +50,14 @@ class AlphaEqualHandler:
    * \return The comparison result.
    */
   bool Equal(const ObjectRef& lhs, const ObjectRef& rhs) {
-    if (lhs.same_as(rhs)) return true;
     if (!lhs.defined() || !rhs.defined()) return false;
-    if (lhs->IsInstance<TypeNode>()) {
-      if (!rhs->IsInstance<TypeNode>()) return false;
+    if (lhs.same_as(rhs)) return true;
+    if (lhs->IsInstance<TypeNode>() || rhs->IsInstance<TypeNode>()) {
+      if (!rhs->IsInstance<TypeNode>() || !lhs->IsInstance<TypeNode>()) return false;
       return TypeEqual(Downcast<Type>(lhs), Downcast<Type>(rhs));
     }
-    if (lhs->IsInstance<ExprNode>()) {
-      if (!rhs->IsInstance<ExprNode>()) return false;
+    if (lhs->IsInstance<ExprNode>() || rhs->IsInstance<ExprNode>()) {
+      if (!rhs->IsInstance<ExprNode>() || !lhs->IsInstance<ExprNode>()) return false;
       return ExprEqual(Downcast<Expr>(lhs), Downcast<Expr>(rhs));
     }
     if (const auto lhsm = lhs.as<IRModuleNode>()) {
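The reordered conditions make Equal symmetric in its two arguments. A quick sketch of the property the new tests pin down, using the Python API from test_pass_alpha_equal.py below (kind value 6 as in those tests):

from tvm import relay
from tvm.relay import analysis

v_type = relay.TypeVar('v1', 6)  # a type node
v_expr = relay.Var('v1')         # an expression node

# comparing across node categories must be False in both directions;
# previously the branch taken depended only on the lhs's category
assert not analysis.alpha_equal(v_type, v_expr)
assert not analysis.alpha_equal(v_expr, v_type)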
diff --git a/tests/cpp/relay_pass_alpha_equal.cc b/tests/cpp/relay_pass_alpha_equal.cc
new file mode 100644
index 000000000000..0207fca00cf7
--- /dev/null
+++ b/tests/cpp/relay_pass_alpha_equal.cc
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/type.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+
+using namespace tvm;
+
+class TestAlphaEquals {
+  runtime::PackedFunc *_packed_func;
+ public:
+  TestAlphaEquals(const char* func_name) {
+    _packed_func = new runtime::PackedFunc();
+    TVMFuncGetGlobal(func_name, reinterpret_cast<void**>(&_packed_func));
+  }
+
+  void UpdatePackedFunc(const char* func_name) {
+    TVMFuncGetGlobal(func_name, reinterpret_cast<void**>(&_packed_func));
+  }
+
+  bool operator()(ObjectRef input_1, ObjectRef input_2) {
+    TVMRetValue rv;
+    std::vector<TVMValue> values(2);
+    std::vector<int> codes(2);
+    runtime::TVMArgsSetter setter(values.data(), codes.data());
+    setter(0, input_1);
+    setter(1, input_2);
+    _packed_func->CallPacked(runtime::TVMArgs(values.data(), codes.data(), 2), &rv);
+    return bool(rv);
+  }
+};
+
+TEST(Relay, AlphaTestEmptyTypeNodes) {
+  auto x = TypeVar("x", kTypeData);
+  auto y = TypeVar();
+  EXPECT_FALSE(relay::AlphaEqual(x, y));
+
+  TestAlphaEquals test_equals("relay._make._alpha_equal");
+  EXPECT_FALSE(test_equals(x, y));
+}
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
index 7e34f48ec7e1..ec026be69e63 100644
--- a/tests/python/relay/test_pass_alpha_equal.py
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -28,6 +28,15 @@ def alpha_equal(x, y):
     """
     return analysis.alpha_equal(x, y) and analysis.structural_hash(x) == analysis.structural_hash(y)
 
+def alpha_equal_commutative(x, y):
+    """
+    Check for commutative property of equality
+    """
+    xy = analysis.alpha_equal(x, y)
+    yx = analysis.alpha_equal(y, x)
+    assert xy == yx
+    return xy
+
 def test_tensor_type_alpha_equal():
     t1 = relay.TensorType((3, 4), "float32")
     t2 = relay.TensorType((3, 4), "float32")
@@ -219,6 +228,26 @@ def test_constant_alpha_equal():
     assert not alpha_equal(x, y)
     assert alpha_equal(x, relay.const(1))
 
+def test_type_node_alpha_equal():
+    v1 = relay.TypeVar('v1', 6)
+    v2 = relay.TypeVar('v2', 6)
+    assert not alpha_equal(v1, v2)
+
+    v1 = relay.TypeVar('v1', 0)
+    v2 = relay.TypeVar('v2', 6)
+    assert not alpha_equal(v1, v2)
+
+    assert alpha_equal_commutative(v1, v1)
+
+def test_type_node_incompatible_alpha_equal():
+    v1 = relay.TypeVar('v1', 6)
+    v2 = relay.Var("v2")
+    assert not alpha_equal_commutative(v1, v2)
+
+def test_expr_node_incompatible_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.PatternVar(relay.Var("v2"))
+    assert not alpha_equal_commutative(v1, v2)
+
 def test_var_alpha_equal():
     v1 = relay.Var("v1")
@@ -676,6 +705,9 @@ def test_fn_attribute():
     test_tensor_type_alpha_equal()
     test_incomplete_type_alpha_equal()
     test_constant_alpha_equal()
+    test_type_node_alpha_equal()
+    test_type_node_incompatible_alpha_equal()
+    test_expr_node_incompatible_alpha_equal()
     test_func_type_alpha_equal()
     test_tuple_type_alpha_equal()
     test_type_relation_alpha_equal()